diff --git a/.gitignore b/.gitignore
index e096dc41..f361db7d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,10 @@
 
 # CMake files
 build/
+build-*/
+cmake-build-*/
+CMakeFiles/
+
+# IDE files (CLion, VS Code)
+.idea/
+.vscode/
\ No newline at end of file
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 7759bbbc..56dc0693 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -3,7 +3,7 @@ macro(set_compilation_settings NAME)
     target_include_directories(${NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
     target_link_libraries(${NAME} PRIVATE sylvan::sylvan)
     target_compile_features(${NAME} PRIVATE c_std_11 cxx_std_11)
-    target_compile_options(${NAME} PRIVATE -Wall -Wextra -Werror -Wno-deprecated)
+    target_compile_options(${NAME} PRIVATE -Wall -Wextra -Werror -Wno-deprecated -Wno-unused-parameter)
 endmacro(set_compilation_settings)
 
 macro(add_example NAME SOURCE)
@@ -24,6 +24,16 @@ add_example(nqueens nqueens.c)
 
 add_example(simple simple.cpp)
 
+# aigsynt uses the Boost Graph Library for Sloan ordering, so it is only
+# built when Boost is available; the other examples do not depend on it
+find_package(Boost 1.71.0)
+if(Boost_FOUND)
+    add_example(aigsynt aigsynt.cpp)
+    target_sources(aigsynt PRIVATE aag.h)
+    target_include_directories(aigsynt PRIVATE ${Boost_INCLUDE_DIRS})
+    target_link_libraries(aigsynt PRIVATE ${Boost_LIBRARIES})
+endif()
+
 # Check if we have Meddly
 find_library(MEDDLY_FOUND meddly)
 if(MEDDLY_FOUND)
diff --git a/examples/aag.h b/examples/aag.h
new file mode 100644
index 00000000..52c1da48
--- /dev/null
+++ b/examples/aag.h
@@ -0,0 +1,327 @@
+/*
+ * Minimal reader for ASCII AIGER ("aag") files, backed by a read-only mmap.
+ * Every function is `inline` because it is defined in this header; without
+ * that, including the header from two source files breaks the link.
+ */
+#ifndef SYLVAN_AAG_H
+#define SYLVAN_AAG_H
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+typedef struct aag_header
+{
+    size_t m; // maximum variable index
+    size_t i; // number of inputs
+    size_t l; // number of latches
+    size_t o; // number of outputs
+    size_t a; // number of AND gates
+    size_t b; // number of bad state properties
+    size_t c; // number of invariant constraints
+    size_t j; // number of justice properties
+    size_t f; // number of fairness constraints
+} aag_header_t;
+
+typedef struct aag_file_s
+{
+    aag_header_t header;
+    size_t *inputs;  // input literals
+    size_t *outputs; // output literals
+    size_t *latches; // latch literals
+    size_t *l_next;  // next-state literal of each latch
+    int *lookup;     // variable index -> AND gate index, -1 if not an AND gate
+    size_t *gatelhs; // left-hand side literal of each AND gate
+    size_t *gatelft; // first operand literal of each AND gate
+    size_t *gatergt; // second operand literal of each AND gate
+} aag_file_t;
+
+typedef struct aag_buffer_s
+{
+    uint8_t *content;    // mapped file contents, NULL while closed
+    size_t size;         // number of mapped bytes
+    size_t pos;          // read cursor into content
+    int file_descriptor; // -1 while closed
+    struct stat filestat;
+} aag_buffer_t;
+
+/* Report a malformed file and terminate the program. */
+inline void
+aag_buffer_err()
+{
+    fprintf(stderr, "File read error.\n");
+    exit(-1);
+}
+
+/*
+ * Unmap and close whatever the buffer holds and reset it to the closed
+ * state. Calling it on an already closed buffer is a no-op.
+ */
+inline void
+aag_buffer_close(aag_buffer_t *buffer)
+{
+    if (buffer->content != NULL) {
+        munmap(buffer->content, buffer->size);
+        buffer->content = NULL;
+    }
+    if (buffer->file_descriptor != -1) {
+        close(buffer->file_descriptor);
+        buffer->file_descriptor = -1;
+    }
+    buffer->size = 0;
+    buffer->pos = 0;
+    buffer->filestat = {};
+}
+
+/*
+ * Open `filename` with the open(2) flags in `access` and map it read-only.
+ * Anything the buffer held before is released first, so the buffer must be
+ * initialised (content NULL, file_descriptor -1) before the first call.
+ * Prints a message and exits on failure, including for an empty file.
+ */
+inline void
+aag_buffer_open(aag_buffer_t *buffer, const char *filename, int access)
+{
+    aag_buffer_close(buffer);
+
+    buffer->file_descriptor = open(filename, access);
+    if (buffer->file_descriptor == -1) {
+        fprintf(stderr, "cannot open file %s\n", filename);
+        exit(-1);
+    }
+    if (fstat(buffer->file_descriptor, &buffer->filestat) != 0) {
+        fprintf(stderr, "cannot stat file %s\n", filename);
+        exit(-1);
+    }
+    if (buffer->filestat.st_size <= 0) {
+        // mmap rejects a zero-length mapping; say what is actually wrong
+        fprintf(stderr, "file %s is empty\n", filename);
+        exit(-1);
+    }
+    void *mapping = mmap(nullptr, (size_t) buffer->filestat.st_size, PROT_READ, MAP_SHARED, buffer->file_descriptor, 0);
+    if (mapping == MAP_FAILED) {
+        // MAP_FAILED is never stored in content, which must stay NULL or valid
+        fprintf(stderr, "mmap failed for file %s\n", filename);
+        exit(-1);
+    }
+    buffer->content = (uint8_t *) mapping;
+    buffer->size = (size_t) buffer->filestat.st_size;
+}
+
+/* Return the byte under the cursor without consuming it, or EOF. */
+inline int
+aag_buffer_peek(aag_buffer_t *buffer)
+{
+    if (buffer->pos >= buffer->size) return EOF;
+    return (int) buffer->content[buffer->pos];
+}
+
+/* Advance the cursor by one byte, never past the end of the mapping. */
+inline void
+aag_buffer_skip(aag_buffer_t *buffer)
+{
+    if (buffer->pos < buffer->size) buffer->pos++;
+}
+
+/* Skip spaces, tabs and newlines. */
+inline void
+aag_buffer_read_wsnl(aag_buffer_t *buffer)
+{
+    while (true) {
+        int c = aag_buffer_peek(buffer);
+        if (c != ' ' && c != '\n' && c != '\t') return;
+        aag_buffer_skip(buffer);
+    }
+}
+
+/* Skip spaces and tabs but stop at a newline. */
+inline void
+aag_buffer_read_ws(aag_buffer_t *buffer)
+{
+    while (true) {
+        int c = aag_buffer_peek(buffer);
+        if (c != ' ' && c != '\t') return;
+        aag_buffer_skip(buffer);
+    }
+}
+
+/* Consume and return the byte under the cursor, or EOF at the end. */
+inline int
+aag_buffer_read(aag_buffer_t *buffer)
+{
+    if (buffer->pos >= buffer->size) return EOF;
+    return (int) buffer->content[buffer->pos++];
+}
+
+/* Consume the literal text `str`; any mismatch is a fatal read error. */
+inline void
+aag_buffer_read_token(const char *str, aag_buffer_t *buffer)
+{
+    while (*str != 0) {
+        if (aag_buffer_read(buffer) != (int) (uint8_t) (*str++)) {
+            aag_buffer_err();
+        }
+    }
+}
+
+/*
+ * Parse an unsigned decimal number at the cursor. Returns 0 without
+ * consuming anything when the cursor is not on a digit; a value that does
+ * not fit in 64 bits is a fatal read error.
+ */
+inline uint64_t
+aag_buffer_read_uint(aag_buffer_t *buffer)
+{
+    uint64_t r = 0;
+    while (true) {
+        int c = aag_buffer_peek(buffer);
+        if (c < '0' || c > '9') return r;
+        const uint64_t digit = (uint64_t) (c - '0');
+        if (r > (UINT64_MAX - digit) / 10) aag_buffer_err();
+        r = r * 10 + digit;
+        aag_buffer_skip(buffer);
+    }
+}
+
+/* Read everything up to (not including) the next newline or EOF into `s`. */
+inline void
+aag_buffer_read_string(std::string &s, aag_buffer_t *buffer)
+{
+    s.clear();
+    while (true) {
+        int c = aag_buffer_peek(buffer);
+        if (c == EOF || c == '\n') return;
+        s += (char) c;
+        aag_buffer_skip(buffer);
+    }
+}
+
+/*
+ * Read one literal and check that its variable index (literal / 2) is at
+ * most `max_var`, because callers index per-variable tables with it.
+ */
+inline size_t
+aag_buffer_read_literal(aag_buffer_t *buffer, size_t max_var)
+{
+    const uint64_t literal = aag_buffer_read_uint(buffer);
+    if (literal / 2 > max_var) {
+        fprintf(stderr, "literal %llu exceeds maximum variable index %zu\n", (unsigned long long) literal, max_var);
+        exit(-1);
+    }
+    return (size_t) literal;
+}
+
+/* calloc() that exits on failure; a zero count yields NULL without error. */
+inline void *
+aag_calloc(size_t count, size_t size)
+{
+    if (count == 0) return NULL;
+    void *memory = calloc(count, size);
+    if (memory == NULL) {
+        fprintf(stderr, "out of memory while reading AIGER file\n");
+        exit(-1);
+    }
+    return memory;
+}
+
+/*
+ * Parse the "aag M I L O A [B C J F]" header line. Only files with exactly
+ * one output and none of the optional B/C/J/F sections are accepted;
+ * anything else prints a message and exits.
+ */
+inline void
+aag_header_read(aag_header_t *header, aag_buffer_t *buffer)
+{
+    aag_buffer_read_wsnl(buffer);
+    aag_buffer_read_token("aag", buffer);
+    aag_buffer_read_ws(buffer);
+    header->m = aag_buffer_read_uint(buffer);
+    aag_buffer_read_ws(buffer);
+    header->i = aag_buffer_read_uint(buffer);
+    aag_buffer_read_ws(buffer);
+    header->l = aag_buffer_read_uint(buffer);
+    aag_buffer_read_ws(buffer);
+    header->o = aag_buffer_read_uint(buffer);
+    aag_buffer_read_ws(buffer);
+    header->a = aag_buffer_read_uint(buffer);
+    aag_buffer_read_ws(buffer);
+
+    // The remaining four counts are optional; stop at the end of the line
+    size_t *const optional[] = {&header->b, &header->c, &header->j, &header->f};
+    for (size_t *field : optional) *field = 0;
+    for (size_t *field : optional) {
+        if (aag_buffer_peek(buffer) == '\n') break;
+        *field = aag_buffer_read_uint(buffer);
+        aag_buffer_read_ws(buffer);
+    }
+    aag_buffer_read_wsnl(buffer);
+
+    if (header->o != 1) {
+        fprintf(stderr, "expecting 1 output\n");
+        exit(-1);
+    }
+
+    if (header->b != 0 or header->c != 0 or header->j != 0 or header->f != 0) {
+        fprintf(stderr, "No support for new format.\n");
+        exit(-1);
+    }
+}
+
+/*
+ * Read the header followed by the input, latch, output and AND-gate
+ * sections into `aag`. The arrays are allocated here with calloc and are
+ * not freed by this header. The cursor ends just after the last AND gate.
+ */
+inline void
+aag_file_read(aag_file_t *aag, aag_buffer_t *buffer)
+{
+    aag_header_t header;
+    aag_header_read(&header, buffer);
+    if (header.m == SIZE_MAX) aag_buffer_err(); // m + 1 below must not wrap
+
+    aag->header = header;
+    aag->inputs = (size_t *) aag_calloc(header.i, sizeof(size_t));
+    aag->latches = (size_t *) aag_calloc(header.l, sizeof(size_t));
+    aag->l_next = (size_t *) aag_calloc(header.l, sizeof(size_t));
+    aag->outputs = (size_t *) aag_calloc(header.o, sizeof(size_t));
+    aag->gatelhs = (size_t *) aag_calloc(header.a, sizeof(size_t));
+    aag->gatelft = (size_t *) aag_calloc(header.a, sizeof(size_t));
+    aag->gatergt = (size_t *) aag_calloc(header.a, sizeof(size_t));
+    aag->lookup = (int *) aag_calloc(header.m + 1, sizeof(int));
+
+    for (uint64_t i = 0; i < header.i; i++) {
+        aag->inputs[i] = aag_buffer_read_literal(buffer, header.m);
+        aag_buffer_read_wsnl(buffer);
+    }
+
+    for (uint64_t l = 0; l < header.l; l++) {
+        aag->latches[l] = aag_buffer_read_literal(buffer, header.m);
+        aag_buffer_read_ws(buffer);
+        aag->l_next[l] = aag_buffer_read_literal(buffer, header.m);
+        aag_buffer_read_wsnl(buffer);
+    }
+
+    for (uint64_t o = 0; o < header.o; o++) {
+        aag->outputs[o] = aag_buffer_read_literal(buffer, header.m);
+        aag_buffer_read_wsnl(buffer);
+    }
+
+    for (uint64_t i = 0; i <= header.m; i++) aag->lookup[i] = -1; // not an and-gate
+    for (uint64_t a = 0; a < header.a; a++) {
+        aag->gatelhs[a] = aag_buffer_read_literal(buffer, header.m);
+        aag->lookup[aag->gatelhs[a] / 2] = (int) a;
+        aag_buffer_read_ws(buffer);
+        aag->gatelft[a] = aag_buffer_read_literal(buffer, header.m);
+        aag_buffer_read_ws(buffer);
+        aag->gatergt[a] = aag_buffer_read_literal(buffer, header.m);
+        aag_buffer_read_wsnl(buffer);
+    }
+}
+
+#endif //SYLVAN_AAG_H
diff --git a/examples/aigsynt.cpp b/examples/aigsynt.cpp
new file mode 100644
index 00000000..d0a46d8a
--- /dev/null
+++ b/examples/aigsynt.cpp
@@ -0,0 +1,489 @@
+#include <getopt.h>
+
+#include <fcntl.h>
+#include <sys/time.h>
+
+#include <clocale>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/graph/properties.hpp>
+#include <boost/graph/sloan_ordering.hpp>
+
+// NOTE(review): the #include targets were missing from the submitted patch;
+// this list is reconstructed from the names used below - please confirm.
+#include <sylvan_int.h>
+#include <sylvan_obj.hpp>
+#include "aag.h"
+
+using namespace sylvan;
+
+typedef struct safety_game
+{
+    MTBDD *gates; // and gates
+    MTBDD c_inputs; // controllable inputs
+    MTBDD u_inputs; // uncontrollable inputs
+    int *level_to_order; // BDD variable used for each AIG variable index
+} safety_game_t;
+
+double t_start;
+#define INFO(s, ...) fprintf(stdout, "\r[% 8.2f] " s, wctime()-t_start, ##__VA_ARGS__)
+#define Abort(s, ...) { fprintf(stderr, "\r[% 8.2f] " s, wctime()-t_start, ##__VA_ARGS__); exit(-1); }
+
+/* Configuration */
+static int workers = 1;
+static int verbose = 0;
+static char *filename = nullptr; // filename of the aag file
+static int static_reorder = 0;
+static int dynamic_reorder = 0;
+static int sloan_w1 = 1;
+static int sloan_w2 = 8;
+
+/* Global variables */
+static aag_file_t aag{
+    .header = {
+        .m = 0,
+        .i = 0,
+        .l = 0,
+        .o = 0,
+        .a = 0,
+        .b = 0,
+        .c = 0,
+        .j = 0,
+        .f = 0
+    },
+    .inputs = nullptr,
+    .outputs = nullptr,
+    .latches = nullptr,
+    .l_next = nullptr,
+    .lookup = nullptr,
+    .gatelhs = nullptr,
+    .gatelft = nullptr,
+    .gatergt = nullptr
+};
+static aag_buffer_t aag_buffer{
+    .content = nullptr,
+    .size = 0,
+    .pos = 0,
+    .file_descriptor = -1,
+    .filestat = {}
+};
+static safety_game_t game{
+    .gates = nullptr,
+    .c_inputs = sylvan_set_empty(),
+    .u_inputs = sylvan_set_empty(),
+    .level_to_order = nullptr
+};
+
+/* Obtain current wallclock time */
+static double
+wctime()
+{
+    struct timeval tv{};
+    gettimeofday(&tv, nullptr);
+    return (tv.tv_sec + 1E-6 * tv.tv_usec);
+}
+
+/* Print the short usage summary. */
+static void
+print_usage()
+{
+    printf("Usage: aigsynt [-w <workers>] [-d --dynamic-reordering] [-s --static-reordering]\n");
+    printf("               [-v --verbose] [--help] [--usage] <model>\n");
+}
+
+/* Print the full option list. */
+static void
+print_help()
+{
+    printf("Usage: aigsynt [OPTION...] <model>\n\n");
+    printf("  -w, --workers=<workers>     Number of workers (default=1, 0: autodetect)\n");
+    printf("  -d, --dynamic-reordering    Dynamic variable ordering\n");
+    printf("  -s, --static-reordering     Reorder with Sloan\n");
+    printf("  -v, --verbose               Print progress and statistics\n");
+    printf("  -h, --help                  Give this help list\n");
+    printf("      --usage                 Give a short usage message\n");
+}
+
+/*
+ * Parse the command line into the configuration globals above. Prints the
+ * usage and exits when the model file is missing or an option is unknown.
+ */
+static void
+parse_args(int argc, char **argv)
+{
+    static const option longopts[] = {
+        // flag == nullptr makes getopt_long return val, so the long and the
+        // short spelling of an option end up in the same case below
+        {"workers", required_argument, nullptr, 'w'},
+        {"dynamic-reordering", no_argument, nullptr, 'd'},
+        {"static-reordering", no_argument, nullptr, 's'},
+        {"verbose", no_argument, nullptr, 'v'},
+        {"help", no_argument, nullptr, 'h'},
+        {"usage", no_argument, nullptr, 99},
+        {nullptr, no_argument, nullptr, 0},
+    };
+    int key = 0;
+    int long_index = 0;
+    // -s, -d and -v are plain flags; only -w takes an argument
+    while ((key = getopt_long(argc, argv, "w:sdvh", longopts, &long_index)) != -1) {
+        switch (key) {
+            case 'w':
+                workers = atoi(optarg);
+                break;
+            case 's':
+                static_reorder = 1;
+                break;
+            case 'd':
+                dynamic_reorder = 1;
+                break;
+            case 'v':
+                verbose = 1;
+                break;
+            case 99:
+                print_usage();
+                exit(0);
+            case 'h':
+                print_help();
+                exit(0);
+            default:
+                // getopt_long has already printed a diagnostic
+                print_usage();
+                exit(-1);
+        }
+    }
+    if (optind >= argc) {
+        print_usage();
+        exit(0);
+    }
+    filename = argv[optind];
+}
+
+VOID_TASK_0(gc_start)
+{
+    size_t used, total;
+    sylvan_table_usage(&used, &total);
+    printf("\n");
+    INFO("GC: str: %zu/%zu size\n", used, total);
+}
+
+VOID_TASK_0(gc_end)
+{
+    size_t used, total;
+    sylvan_table_usage(&used, &total);
+    INFO("GC: end: %zu/%zu size\n\n", used, total);
+}
+
+VOID_TASK_0(reordering_start) {
+    printf("\r[% 8.2f] RE: from %zu to ... ", wctime()-t_start, llmsset_count_marked(nodes));
+}
+
+VOID_TASK_0(reordering_end) {
+    printf("%zu nodes in %f\n", llmsset_count_marked(nodes), wctime() - reorder_db->config.t_start_sifting);
+}
+
+/*
+ * Compute a static variable order with Sloan's algorithm on the undirected
+ * dependency graph of the circuit (latch - next state, gate - operands)
+ * and store it in game.level_to_order, which the caller must already have
+ * allocated with aag.header.m + 1 entries.
+ */
+void order_statically()
+{
+    const size_t m = aag.header.m;
+
+    // m x m adjacency matrix; row/column k stands for AIG variable k + 1
+    std::vector<char> matrix(m * m, 0);
+    // The constants (variable 0) map to index -1 and are left out of the graph
+    auto index_of = [](size_t literal) { return (long) (literal / 2) - 1; };
+    auto connect = [&matrix, m](long from, long to) {
+        if (from < 0 || to < 0) return;
+        matrix[from * m + to] = 1;
+        matrix[to * m + from] = 1; // make symmetric
+    };
+
+    for (size_t k = 0; k < m; k++) matrix[k * m + k] = 1;
+    for (uint64_t l = 0; l < aag.header.l; l++) connect(index_of(aag.latches[l]), index_of(aag.l_next[l]));
+    for (uint64_t a = 0; a < aag.header.a; a++) {
+        const long v = index_of(aag.gatelhs[a]);
+        connect(v, index_of(aag.gatelft[a]));
+        connect(v, index_of(aag.gatergt[a]));
+    }
+
+    typedef boost::adjacency_list<boost::setS, boost::vecS, boost::undirectedS,
+            boost::property<boost::vertex_color_t, boost::default_color_type,
+                    boost::property<boost::vertex_degree_t, int,
+                            boost::property<boost::vertex_priority_t, double>>>> Graph;
+    typedef boost::graph_traits<Graph>::vertex_descriptor Vertex;
+
+    Graph g = Graph(m);
+    for (size_t row = 0; row < m; row++) {
+        for (size_t col = 0; col < m; col++) {
+            if (matrix[row * m + col]) boost::add_edge(row, col, g);
+        }
+    }
+
+    std::vector<Vertex> inv_perm(boost::num_vertices(g));
+    boost::sloan_ordering(g, inv_perm.begin(), boost::get(boost::vertex_color, g), boost::make_degree_map(g),
+                          boost::get(boost::vertex_priority, g), sloan_w1, sloan_w2);
+
+    // level_to_var[k] is the place of AIG variable k in the new order (-1: unset)
+    std::vector<int> level_to_var(m + 1, -1);
+    int r = 0;
+    for (const Vertex vertex : inv_perm) {
+        const size_t var = vertex + 1;
+        if (level_to_var[var] != -1) {
+            Abort("ERROR: level_to_var of %zu is already %d (%d)\n", var, level_to_var[var], r);
+        }
+        level_to_var[var] = r++;
+    }
+
+    // Publish the result: AIG variable k becomes BDD variable level_to_var[k].
+    // Entry 0 belongs to the constants, which never become a BDD variable.
+    game.level_to_order[0] = 0;
+    for (size_t k = 1; k <= m; k++) game.level_to_order[k] = level_to_var[k];
+
+    if (!verbose) return;
+
+    // Debug output: the dependency matrix with rows/columns in the new order
+    printf("r=%d M=%d\n", r, (int) aag.header.m);
+    matrix.assign(m * m, 0);
+    auto mark = [&matrix, m](int row, int col) {
+        if (row >= 0 && col >= 0) matrix[row * m + col] = 1;
+    };
+    for (uint64_t i = 0; i < aag.header.i; i++) {
+        const int v = level_to_var[aag.inputs[i] / 2];
+        mark(v, v);
+    }
+    for (uint64_t l = 0; l < aag.header.l; l++) {
+        const int v = level_to_var[aag.latches[l] / 2];
+        mark(v, v); // l -> l
+        mark(v, level_to_var[aag.l_next[l] / 2]); // l -> n
+    }
+    for (uint64_t a = 0; a < aag.header.a; a++) {
+        const int v = level_to_var[aag.gatelhs[a] / 2];
+        mark(v, v);
+        mark(v, level_to_var[aag.gatelft[a] / 2]);
+        mark(v, level_to_var[aag.gatergt[a] / 2]);
+    }
+    printf("Matrix\n");
+    for (size_t row = 0; row < m; row++) {
+        for (size_t col = 0; col < m; col++) {
+            printf("%c", matrix[row * m + col] ? '+' : '-');
+        }
+        printf("\n");
+    }
+}
+
+/*
+ * BDD of an AIG literal whose gate (if any) has already been built: a
+ * constant, a gate output or a plain variable, negated for odd literals.
+ */
+static MTBDD
+literal_to_bdd(size_t literal)
+{
+    const size_t var = literal / 2;
+    MTBDD bdd;
+    if (var == 0) {
+        bdd = sylvan_false; // literals 0 and 1 are the constants
+    } else if (aag.lookup[var] != -1) {
+        bdd = game.gates[aag.lookup[var]];
+    } else {
+        bdd = sylvan_ithvar(game.level_to_order[var]);
+    }
+    return (literal & 1) ? sylvan_not(bdd) : bdd;
+}
+
+#define make_gate(gate) CALL(make_gate, gate)
+/* Build the BDD of AND gate `gate`, building its operand gates first. */
+VOID_TASK_1(make_gate, int, gate)
+{
+    if (game.gates[gate] != sylvan_invalid) return;
+    const int lft = aag.lookup[aag.gatelft[gate] / 2];
+    const int rgt = aag.lookup[aag.gatergt[gate] / 2];
+    if (lft != -1) make_gate(lft);
+    if (rgt != -1) make_gate(rgt);
+
+    MTBDD l = literal_to_bdd(aag.gatelft[gate]);
+    MTBDD r = literal_to_bdd(aag.gatergt[gate]);
+    game.gates[gate] = sylvan_and(l, r);
+    mtbdd_protect(&game.gates[gate]);
+}
+
+#define solve_game() RUN(solve_game)
+/*
+ * Build the gate BDDs, read the symbol table to split the inputs into
+ * controllable and uncontrollable ones, and iterate the predecessor
+ * computation of the unsafe states to a fixed point. Returns 0 as soon as
+ * the all-zero initial state is found in a predecessor step, 1 otherwise.
+ */
+TASK_0(int, solve_game)
+{
+    game.level_to_order = (int *) calloc(aag.header.m + 1, sizeof(int));
+    if (game.level_to_order == nullptr) Abort("Out of memory.\n");
+
+    if (static_reorder) {
+        order_statically();
+    } else {
+        for (int i = 0; i <= (int) aag.header.m; i++) game.level_to_order[i] = i;
+    }
+
+    INFO("Making the gate BDDs...\n");
+
+    game.gates = new MTBDD[aag.header.a];
+    for (uint64_t a = 0; a < aag.header.a; a++) game.gates[a] = sylvan_invalid;
+    for (uint64_t gate = 0; gate < aag.header.a; gate++) {
+        make_gate(gate);
+        if (dynamic_reorder) {
+            sylvan_test_reduce_heap();
+        }
+    }
+
+    sylvan_test_reduce_heap();
+    if (verbose) INFO("Gates have size %zu\n", mtbdd_nodecount_more(game.gates, aag.header.a));
+
+    game.c_inputs = sylvan_set_empty();
+    game.u_inputs = sylvan_set_empty();
+    mtbdd_protect(&game.c_inputs);
+    mtbdd_protect(&game.u_inputs);
+
+    // Now read the [[optional]] labels to find controllable vars
+    while (true) {
+        int c = aag_buffer_peek(&aag_buffer);
+        if (c != 'l' and c != 'i' and c != 'o') break;
+        aag_buffer_skip(&aag_buffer);
+        const size_t pos = (size_t) aag_buffer_read_uint(&aag_buffer);
+        aag_buffer_read_token(" ", &aag_buffer);
+        std::string s;
+        aag_buffer_read_string(s, &aag_buffer);
+        aag_buffer_read_wsnl(&aag_buffer);
+        if (c != 'i') continue;
+        if (pos >= aag.header.i) {
+            Abort("Symbol table names input %zu but the file has %zu inputs.\n", pos, aag.header.i);
+        }
+        const int var = game.level_to_order[aag.inputs[pos] / 2];
+        if (strncmp(s.c_str(), "controllable_", 13) == 0) {
+            game.c_inputs = sylvan_set_add(game.c_inputs, var);
+        } else {
+            game.u_inputs = sylvan_set_add(game.u_inputs, var);
+        }
+    }
+    INFO("There are %zu controllable and %zu uncontrollable inputs.\n", sylvan_set_count(game.c_inputs), sylvan_set_count(game.u_inputs));
+
+    // Actually just make the compose vector
+    MTBDD CV = sylvan_map_empty();
+    mtbdd_protect(&CV);
+    for (uint64_t l = 0; l < aag.header.l; l++) {
+        CV = sylvan_map_add(CV, game.level_to_order[aag.latches[l] / 2], literal_to_bdd(aag.l_next[l]));
+    }
+
+    // now make output
+    INFO("output is %zu (lookup: %d)\n", (size_t) aag.outputs[0], aag.lookup[aag.outputs[0] / 2]);
+    MTBDD Unsafe = literal_to_bdd(aag.outputs[0]);
+    mtbdd_protect(&Unsafe);
+    Unsafe = sylvan_forall(Unsafe, game.c_inputs);
+    Unsafe = sylvan_exists(Unsafe, game.u_inputs);
+
+    MTBDD OldUnsafe = sylvan_false; // empty set
+    MTBDD Step = sylvan_false;
+    mtbdd_protect(&OldUnsafe);
+    mtbdd_protect(&Step);
+
+    int realizable = 1;
+    while (realizable && Unsafe != OldUnsafe) {
+        OldUnsafe = Unsafe;
+
+        Step = sylvan_compose(Unsafe, CV);
+        Step = sylvan_forall(Step, game.c_inputs);
+        Step = sylvan_exists(Step, game.u_inputs);
+
+        // check if initial state in Step (all 0): follow the low edges down
+        MTBDD Check = Step;
+        while (Check != sylvan_false && Check != sylvan_true) Check = sylvan_low(Check);
+        if (Check == sylvan_true) {
+            realizable = 0;
+        } else {
+            Unsafe = sylvan_or(Unsafe, Step);
+        }
+    }
+
+    // The protected locals go out of scope here; do not leave them registered
+    mtbdd_unprotect(&Step);
+    mtbdd_unprotect(&OldUnsafe);
+    mtbdd_unprotect(&Unsafe);
+    mtbdd_unprotect(&CV);
+    return realizable;
+}
+
+int main(int argc, char **argv)
+{
+    t_start = wctime();
+    setlocale(LC_NUMERIC, "en_US.utf-8");
+    parse_args(argc, argv);
+    if (filename == nullptr) {
+        Abort("Invalid file name.\n");
+    }
+    INFO("Model: %s\n", filename);
+
+    aag_buffer_open(&aag_buffer, filename, O_RDONLY);
+    aag_file_read(&aag, &aag_buffer);
+
+    if (verbose) {
+        INFO("----------header----------\n");
+        INFO("# of variables \t %zu\n", aag.header.m);
+        INFO("# of inputs \t %zu\n", aag.header.i);
+        INFO("# of latches \t %zu\n", aag.header.l);
+        INFO("# of outputs \t %zu\n", aag.header.o);
+        INFO("# of AND gates \t %zu\n", aag.header.a);
+        INFO("# of bad state properties \t %zu\n", aag.header.b);
+        INFO("# of invariant constraints\t %zu\n", aag.header.c);
+        INFO("# of justice properties \t %zu\n", aag.header.j);
+        INFO("# of fairness constraints \t %zu\n", aag.header.f);
+        INFO("--------------------------\n");
+    }
+
+    lace_start(workers, 0);
+
+    // 1LL<<19: 8192 nodes (minimum)
+    // 1LL<<20: 16384 nodes
+    // 1LL<<21: 32768 nodes
+    // 1LL<<22: 65536 nodes
+    // 1LL<<23: 131072 nodes
+    // 1LL<<24: 262144 nodes
+    // 1LL<<25: 524288 nodes
+    sylvan_set_limits(1LL << 24, 1, 0);
+    sylvan_init_package();
+    sylvan_init_mtbdd();
+    sylvan_init_reorder();
+    sylvan_gc_disable();
+
+    sylvan_set_reorder_type(SYLVAN_REORDER_BOUNDED_SIFT);
+
+    // Set hooks for logging garbage collection & dynamic variable reordering
+    if (verbose) {
+        sylvan_re_hook_prere(TASK(reordering_start));
+        sylvan_re_hook_postre(TASK(reordering_end));
+        sylvan_gc_hook_pregc(TASK(gc_start));
+        sylvan_gc_hook_postgc(TASK(gc_end));
+    }
+
+    int is_realizable = solve_game();
+    if (is_realizable) {
+        INFO("REALIZABLE\n");
+    } else {
+        INFO("UNREALIZABLE\n");
+    }
+
+    // Report Sylvan statistics (if SYLVAN_STATS is set)
+    if (verbose) sylvan_stats_report(stdout);
+
+    aag_buffer_close(&aag_buffer);
+    sylvan_quit();
+    lace_stop();
+
+    return 0;
+} diff --git a/models/add10y.aag b/models/add10y.aag new file mode 100644 index 00000000..9670da52 --- /dev/null +++ b/models/add10y.aag @@ -0,0 +1,234 @@ +aag 161 30 2 1 129 +2 +4 +6 +8 +10 +12 +14 +16 +18 +20 +22 +24 +26 +28 +30 +32 +34 +36 +38 +40 +42 +44 +46 +48 +50 +52 +54 +56 +58 +60 +62 1 +64 323 +66 +66 64 62 +68 58 38 +70 56 36 +72 54 34 +74 52 32 +76 50 30 +78 48 28 +80 46 26 +82 44 24 +84 42 22 +86 45 25 +88 87 84 +90 89 83 +92 47 27 +94 93 91 +96 95 81 +98 49 29 +100 99 97 +102 101 79 +104 51 31 +106 105 103 +108 107 77 +110 53 33 +112 111 109 +114 113 75 +116 55 35 +118 117 115 +120 119 73 +122 57 37 +124 123 121 +126 125 71 +128 59 39 +130 129 127 +132 131 69 +134 133 60 +136 132 61 +138 137 135 +140 139 41 +142 138 40 +144 143 141 +146 144 21 +148 145 20 +150 149 147 +152 58 39 +154 59 38 +156 155 153 +158 157 127 +160 156 126 +162 161 159 +164 162 19 +166 163 18 +168 167 165 +170 56 37 +172 57 36 +174 173 171 +176 175 121 +178 174 120 +180 179 177 +182 180 17 +184 181 16 +186 185 183 +188 54 35 +190 55 34 +192 191 189 +194 193 115 +196 192 114 +198 197 195 +200 198 15 +202 199 14 +204 203 201 +206 52 33 +208 53 32 +210 209 207 +212 211 109 +214 210 108 +216 215 213 +218 216 13 +220 217 12 +222 221 219 +224 50 31 +226 51 30 +228 227 225 +230 229 103 +232 228 102 +234 233 231 +236 234 11 +238 235 10 +240 239 237 +242 48 29 +244 49 28 +246 245 243 +248 247 97 +250 246 96 +252 251 249 +254 252 9 +256 253 8 +258 257 255 +260 46 27 +262 47 26 +264 263 261 +266 265 91 +268 264 90 +270 269 267 +272 270 7 +274 271 6 +276 275 273 +278 43 23 +280 279 85 +282 280 3 +284 281 2 +286 285 283 +288 44 25 +290 45 24 +292 291 289 +294 292 84 +296 293 85 +298 297 295 +300 299 5 +302 298 4 +304 303 301 +306 304 286 +308 306 276 +310 308 258 +312 310 240 +314 312 222 +316 314 204 +318 316 186 +320 318 168 +322 320 150 +i0 controllable_c<0> +i1 controllable_c<1> +i2 controllable_c<2> +i3 controllable_c<3> +i4 controllable_c<4> +i5 controllable_c<5> +i6 controllable_c<6> 
+i7 controllable_c<7> +i8 controllable_c<8> +i9 controllable_c<9> +i10 a<0> +i11 a<1> +i12 a<2> +i13 a<3> +i14 a<4> +i15 a<5> +i16 a<6> +i17 a<7> +i18 a<8> +i19 a<9> +i20 b<0> +i21 b<1> +i22 b<2> +i23 b<3> +i24 b<4> +i25 b<5> +i26 b<6> +i27 b<7> +i28 b<8> +i29 b<9> +l0 n63 +l1 err_out +o0 err +c +bench +This file was written by ABC on Sat Aug 31 20:25:02 2013 +For information about AIGER format, refer to http://fmv.jku.at/aiger +------------------------------- +This AIGER file has been created by the following sequence of commands: +> vl2mv add10.v ---gives--> add10.mv +> abc -c "read_blif_mv add10.mv; strash; refactor; rewrite; dfraig; rewrite; dfraig; write_aiger -s add10y.aig" ---gives--> add10y.aig +> aigtoaig add10y.aig add10y.aag ---gives--> add10y.aag (this file) +Content of add10.v: +// realizable +module bench(clk, a, b, controllable_c, err); + input clk; + input [9:0] a; + input [9:0] b; + input [9:0] controllable_c; + output err; + reg err; + + initial + begin + err = 1'b0; + end + + always @ (posedge clk) + begin + if(controllable_c == a + b) + err = 1'b0; + else + err = 1'b1; + end +endmodule +------------------------------- +#!SYNTCOMP +STATUS : realizable +SOLVED_BY : 7/8 [SYNTCOMP2014-RealSeq] +SOLVED_IN : 0.008 [SYNTCOMP2014-RealSeq] +#. 
diff --git a/models/add12y.aag b/models/add12y.aag new file mode 100644 index 00000000..7f79fb40 --- /dev/null +++ b/models/add12y.aag @@ -0,0 +1,274 @@ +aag 195 36 2 1 157 +2 +4 +6 +8 +10 +12 +14 +16 +18 +20 +22 +24 +26 +28 +30 +32 +34 +36 +38 +40 +42 +44 +46 +48 +50 +52 +54 +56 +58 +60 +62 +64 +66 +68 +70 +72 +74 1 +76 391 +78 +78 76 74 +80 70 46 +82 68 44 +84 66 42 +86 64 40 +88 62 38 +90 60 36 +92 58 34 +94 56 32 +96 54 30 +98 52 28 +100 50 26 +102 53 29 +104 103 100 +106 105 99 +108 55 31 +110 109 107 +112 111 97 +114 57 33 +116 115 113 +118 117 95 +120 59 35 +122 121 119 +124 123 93 +126 61 37 +128 127 125 +130 129 91 +132 63 39 +134 133 131 +136 135 89 +138 65 41 +140 139 137 +142 141 87 +144 67 43 +146 145 143 +148 147 85 +150 69 45 +152 151 149 +154 153 83 +156 71 47 +158 157 155 +160 159 81 +162 161 72 +164 160 73 +166 165 163 +168 167 49 +170 166 48 +172 171 169 +174 172 25 +176 173 24 +178 177 175 +180 70 47 +182 71 46 +184 183 181 +186 185 155 +188 184 154 +190 189 187 +192 190 23 +194 191 22 +196 195 193 +198 68 45 +200 69 44 +202 201 199 +204 203 149 +206 202 148 +208 207 205 +210 208 21 +212 209 20 +214 213 211 +216 66 43 +218 67 42 +220 219 217 +222 221 143 +224 220 142 +226 225 223 +228 226 19 +230 227 18 +232 231 229 +234 64 41 +236 65 40 +238 237 235 +240 239 137 +242 238 136 +244 243 241 +246 244 17 +248 245 16 +250 249 247 +252 62 39 +254 63 38 +256 255 253 +258 257 131 +260 256 130 +262 261 259 +264 262 15 +266 263 14 +268 267 265 +270 60 37 +272 61 36 +274 273 271 +276 275 125 +278 274 124 +280 279 277 +282 280 13 +284 281 12 +286 285 283 +288 58 35 +290 59 34 +292 291 289 +294 293 119 +296 292 118 +298 297 295 +300 298 11 +302 299 10 +304 303 301 +306 56 33 +308 57 32 +310 309 307 +312 311 113 +314 310 112 +316 315 313 +318 316 9 +320 317 8 +322 321 319 +324 54 31 +326 55 30 +328 327 325 +330 329 107 +332 328 106 +334 333 331 +336 334 7 +338 335 6 +340 339 337 +342 51 27 +344 343 101 +346 344 3 +348 345 2 +350 349 347 +352 52 29 +354 53 28 
+356 355 353 +358 356 100 +360 357 101 +362 361 359 +364 363 5 +366 362 4 +368 367 365 +370 368 350 +372 370 340 +374 372 322 +376 374 304 +378 376 286 +380 378 268 +382 380 250 +384 382 232 +386 384 214 +388 386 196 +390 388 178 +i0 controllable_c<0> +i1 controllable_c<1> +i2 controllable_c<2> +i3 controllable_c<3> +i4 controllable_c<4> +i5 controllable_c<5> +i6 controllable_c<6> +i7 controllable_c<7> +i8 controllable_c<8> +i9 controllable_c<9> +i10 controllable_c<10> +i11 controllable_c<11> +i12 a<0> +i13 a<1> +i14 a<2> +i15 a<3> +i16 a<4> +i17 a<5> +i18 a<6> +i19 a<7> +i20 a<8> +i21 a<9> +i22 a<10> +i23 a<11> +i24 b<0> +i25 b<1> +i26 b<2> +i27 b<3> +i28 b<4> +i29 b<5> +i30 b<6> +i31 b<7> +i32 b<8> +i33 b<9> +i34 b<10> +i35 b<11> +l0 n75 +l1 err_out +o0 err +c +bench +This file was written by ABC on Sat Aug 31 20:25:01 2013 +For information about AIGER format, refer to http://fmv.jku.at/aiger +------------------------------- +This AIGER file has been created by the following sequence of commands: +> vl2mv add12.v ---gives--> add12.mv +> abc -c "read_blif_mv add12.mv; strash; refactor; rewrite; dfraig; rewrite; dfraig; write_aiger -s add12y.aig" ---gives--> add12y.aig +> aigtoaig add12y.aig add12y.aag ---gives--> add12y.aag (this file) +Content of add12.v: +// realizable +module bench(clk, a, b, controllable_c, err); + input clk; + input [11:0] a; + input [11:0] b; + input [11:0] controllable_c; + output err; + reg err; + + initial + begin + err = 1'b0; + end + + always @ (posedge clk) + begin + if(controllable_c == a + b) + err = 1'b0; + else + err = 1'b1; + end +endmodule +------------------------------- +#!SYNTCOMP +STATUS : realizable +SOLVED_BY : 7/8 [SYNTCOMP2014-RealSeq] +SOLVED_IN : 0.012 [SYNTCOMP2014-RealSeq] +#. 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1e934b7b..96791a4a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -15,8 +15,16 @@ target_sources(sylvan
     sylvan_refs.c
     sylvan_sl.c
     sylvan_stats.c
-    sylvan_table.c
     sylvan_zdd.c
+    sylvan_bitmap.c
+    roaring.c
+    roaring.h
+    sylvan_mrc.c
+    sylvan_reorder_int.c
+    sylvan_varswap.c
+    sylvan_levels.c
+    sylvan_reorder.c
+    sylvan_interact.c
   PUBLIC
     sylvan.h
     sylvan_bdd.h
@@ -36,6 +44,14 @@ target_sources(sylvan
     sylvan_tls.h
     sylvan_zdd.h
     sylvan_zdd_int.h
+    libpopcnt.h
+    sylvan_bitmap.h
+    sylvan_mrc.h
+    sylvan_reorder_int.h
+    sylvan_varswap.h
+    sylvan_levels.h
+    sylvan_reorder.h
+    sylvan_interact.h
 )
 
 set_target_properties(sylvan PROPERTIES VERSION ${sylvan_VERSION} SOVERSION ${sylvan_VERSION_MAJOR})
@@ -70,3 +86,13 @@ option(SYLVAN_STATS "Let Sylvan collect statistics at runtime" OFF)
 if(SYLVAN_STATS)
     set_target_properties(sylvan PROPERTIES COMPILE_DEFINITIONS "SYLVAN_STATS")
 endif()
+
+# Select the table implementation: linear probing when ON, chaining (the default) when OFF
+option(SYLVAN_USE_LINEAR_PROBING "Let Sylvan use linear probing instead of chaining" OFF)
+if (SYLVAN_USE_LINEAR_PROBING)
+    # Append the definition; setting the COMPILE_DEFINITIONS property here would drop SYLVAN_STATS
+    target_compile_definitions(sylvan PRIVATE SYLVAN_USE_LINEAR_PROBING)
+    target_sources(sylvan PRIVATE sylvan_table.c)
+else ()
+    target_sources(sylvan PRIVATE sylvan_table_chaining.c)
+endif ()
\ No newline at end of file
diff --git a/src/libpopcnt.h b/src/libpopcnt.h
new file mode 100644
index 00000000..03b16bde
--- /dev/null
+++ b/src/libpopcnt.h
@@ -0,0 +1,841 @@
+/*
+ * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit
+ * population count) in an array as quickly as possible using
+ * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON.
+ *
+ * Copyright (c) 2016 - 2019, Kim Walisch
+ * Copyright (c) 2016 - 2018, Wojciech Muła
+ *
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef LIBPOPCNT_H +#define LIBPOPCNT_H + +#include + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif + +#ifdef __GNUC__ + #define GNUC_PREREQ(x, y) \ + (__GNUC__ > x || (__GNUC__ == x && __GNUC_MINOR__ >= y)) +#else + #define GNUC_PREREQ(x, y) 0 +#endif + +#ifdef __clang__ + #define CLANG_PREREQ(x, y) \ + (__clang_major__ > x || (__clang_major__ == x && __clang_minor__ >= y)) +#else + #define CLANG_PREREQ(x, y) 0 +#endif + +#if (_MSC_VER < 1900) && \ + !defined(__cplusplus) + #define inline __inline +#endif + +#if (defined(__i386__) || \ + defined(__x86_64__) || \ + defined(_M_IX86) || \ + defined(_M_X64)) + #define X86_OR_X64 +#endif + +#if defined(X86_OR_X64) && \ + (defined(__cplusplus) || \ + defined(_MSC_VER) || \ + (GNUC_PREREQ(4, 2) || \ + __has_builtin(__sync_val_compare_and_swap))) + #define HAVE_CPUID +#endif + +#if GNUC_PREREQ(4, 2) || \ + __has_builtin(__builtin_popcount) + #define HAVE_BUILTIN_POPCOUNT +#endif + +#if GNUC_PREREQ(4, 2) || \ + CLANG_PREREQ(3, 0) + #define HAVE_ASM_POPCNT +#endif + +#if defined(HAVE_CPUID) && \ + (defined(HAVE_ASM_POPCNT) || \ + defined(_MSC_VER)) + #define HAVE_POPCNT +#endif + +#if defined(HAVE_CPUID) && \ + GNUC_PREREQ(4, 9) + #define HAVE_AVX2 +#endif + +#if defined(HAVE_CPUID) && \ + GNUC_PREREQ(5, 0) + #define HAVE_AVX512 +#endif + +#if defined(HAVE_CPUID) && \ + defined(_MSC_VER) && \ + defined(__AVX2__) + #define HAVE_AVX2 +#endif + +#if defined(HAVE_CPUID) && \ + defined(_MSC_VER) && \ + defined(__AVX512__) + #define HAVE_AVX512 +#endif + +#if defined(HAVE_CPUID) && \ + CLANG_PREREQ(3, 8) && \ + __has_attribute(target) && \ + (!defined(_MSC_VER) || defined(__AVX2__)) && \ + (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000) + #define HAVE_AVX2 + #define HAVE_AVX512 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This uses fewer arithmetic operations than any other known + * implementation 
on machines with fast multiplication. + * It uses 12 arithmetic operations, one of which is a multiply. + * http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation + */ +static inline uint64_t popcount64(uint64_t x) +{ + uint64_t m1 = 0x5555555555555555ll; + uint64_t m2 = 0x3333333333333333ll; + uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll; + uint64_t h01 = 0x0101010101010101ll; + + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + + return (x * h01) >> 56; +} + +#if defined(HAVE_ASM_POPCNT) && \ + defined(__x86_64__) + +static inline uint64_t popcnt64(uint64_t x) +{ + __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x)); + return x; +} + +#elif defined(HAVE_ASM_POPCNT) && \ + defined(__i386__) + +static inline uint32_t popcnt32(uint32_t x) +{ + __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x)); + return x; +} + +static inline uint64_t popcnt64(uint64_t x) +{ + return popcnt32((uint32_t) x) + + popcnt32((uint32_t)(x >> 32)); +} + +#elif defined(_MSC_VER) && \ + defined(_M_X64) + +#include + +static inline uint64_t popcnt64(uint64_t x) +{ + return _mm_popcnt_u64(x); +} + +#elif defined(_MSC_VER) && \ + defined(_M_IX86) + +#include + +static inline uint64_t popcnt64(uint64_t x) +{ + return _mm_popcnt_u32((uint32_t) x) + + _mm_popcnt_u32((uint32_t)(x >> 32)); +} + +/* non x86 CPUs */ +#elif defined(HAVE_BUILTIN_POPCOUNT) + +static inline uint64_t popcnt64(uint64_t x) +{ + return __builtin_popcountll(x); +} + +/* no hardware POPCNT, + * use pure integer algorithm */ +#else + +static inline uint64_t popcnt64(uint64_t x) +{ + return popcount64(x); +} + +#endif + +static inline uint64_t popcnt64_unrolled(const uint64_t* data, uint64_t size) +{ + uint64_t i = 0; + uint64_t limit = size - size % 4; + uint64_t cnt = 0; + + for (; i < limit; i += 4) + { + cnt += popcnt64(data[i+0]); + cnt += popcnt64(data[i+1]); + cnt += popcnt64(data[i+2]); + cnt += popcnt64(data[i+3]); + } + + for (; i < size; i++) + cnt += popcnt64(data[i]); + + return cnt; +} + 
+#if defined(HAVE_CPUID) + +#if defined(_MSC_VER) + #include + #include +#endif + +/* %ecx bit flags */ +#define bit_POPCNT (1 << 23) + +/* %ebx bit flags */ +#define bit_AVX2 (1 << 5) +#define bit_AVX512 (1 << 30) + +/* xgetbv bit flags */ +#define XSTATE_SSE (1 << 1) +#define XSTATE_YMM (1 << 2) +#define XSTATE_ZMM (7 << 5) + +static inline void run_cpuid(int eax, int ecx, int* abcd) +{ +#if defined(_MSC_VER) + __cpuidex(abcd, eax, ecx); +#else + int ebx = 0; + int edx = 0; + + #if defined(__i386__) && \ + defined(__PIC__) + /* in case of PIC under 32-bit EBX cannot be clobbered */ + __asm__ ("movl %%ebx, %%edi;" + "cpuid;" + "xchgl %%ebx, %%edi;" + : "=D" (ebx), + "+a" (eax), + "+c" (ecx), + "=d" (edx)); + #else + __asm__ ("cpuid;" + : "+b" (ebx), + "+a" (eax), + "+c" (ecx), + "=d" (edx)); + #endif + + abcd[0] = eax; + abcd[1] = ebx; + abcd[2] = ecx; + abcd[3] = edx; +#endif +} + +#if defined(HAVE_AVX2) || \ + defined(HAVE_AVX512) + +static inline int get_xcr0() +{ + int xcr0; + +#if defined(_MSC_VER) + xcr0 = (int) _xgetbv(0); +#else + __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); +#endif + + return xcr0; +} + +#endif + +static inline int get_cpuid() +{ + int flags = 0; + int abcd[4]; + + run_cpuid(1, 0, abcd); + + if ((abcd[2] & bit_POPCNT) == bit_POPCNT) + flags |= bit_POPCNT; + +#if defined(HAVE_AVX2) || \ + defined(HAVE_AVX512) + + int osxsave_mask = (1 << 27); + + /* ensure OS supports extended processor state management */ + if ((abcd[2] & osxsave_mask) != osxsave_mask) + return 0; + + int ymm_mask = XSTATE_SSE | XSTATE_YMM; + int zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM; + + int xcr0 = get_xcr0(); + + if ((xcr0 & ymm_mask) == ymm_mask) + { + run_cpuid(7, 0, abcd); + + if ((abcd[1] & bit_AVX2) == bit_AVX2) + flags |= bit_AVX2; + + if ((xcr0 & zmm_mask) == zmm_mask) + { + if ((abcd[1] & bit_AVX512) == bit_AVX512) + flags |= bit_AVX512; + } + } + +#endif + + return flags; +} + +#endif /* cpuid */ + +#if defined(HAVE_AVX2) + +#include + +#if 
!defined(_MSC_VER) + __attribute__ ((target ("avx2"))) +#endif +static inline void CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c) +{ + __m256i u = _mm256_xor_si256(a, b); + *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); + *l = _mm256_xor_si256(u, c); +} + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx2"))) +#endif +static inline __m256i popcnt256(__m256i v) +{ + __m256i lookup1 = _mm256_setr_epi8( + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 6, 7, 7, 8, + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 6, 7, 7, 8 + ); + + __m256i lookup2 = _mm256_setr_epi8( + 4, 3, 3, 2, 3, 2, 2, 1, + 3, 2, 2, 1, 2, 1, 1, 0, + 4, 3, 3, 2, 3, 2, 2, 1, + 3, 2, 2, 1, 2, 1, 1, 0 + ); + + __m256i low_mask = _mm256_set1_epi8(0x0f); + __m256i lo = _mm256_and_si256(v, low_mask); + __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); + __m256i popcnt1 = _mm256_shuffle_epi8(lookup1, lo); + __m256i popcnt2 = _mm256_shuffle_epi8(lookup2, hi); + + return _mm256_sad_epu8(popcnt1, popcnt2); +} + +/* + * AVX2 Harley-Seal popcount (4th iteration). + * The algorithm is based on the paper "Faster Population Counts + * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and + * Wojciech Mula (23 Nov 2016). 
+ * @see https://arxiv.org/abs/1611.07612 + */ +#if !defined(_MSC_VER) + __attribute__ ((target ("avx2"))) +#endif +static inline uint64_t popcnt_avx2(const __m256i* data, uint64_t size) +{ + __m256i cnt = _mm256_setzero_si256(); + __m256i ones = _mm256_setzero_si256(); + __m256i twos = _mm256_setzero_si256(); + __m256i fours = _mm256_setzero_si256(); + __m256i eights = _mm256_setzero_si256(); + __m256i sixteens = _mm256_setzero_si256(); + __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; + + uint64_t i = 0; + uint64_t limit = size - size % 16; + uint64_t* cnt64; + + for(; i < limit; i += 16) + { + CSA256(&twosA, &ones, ones, data[i+0], data[i+1]); + CSA256(&twosB, &ones, ones, data[i+2], data[i+3]); + CSA256(&foursA, &twos, twos, twosA, twosB); + CSA256(&twosA, &ones, ones, data[i+4], data[i+5]); + CSA256(&twosB, &ones, ones, data[i+6], data[i+7]); + CSA256(&foursB, &twos, twos, twosA, twosB); + CSA256(&eightsA, &fours, fours, foursA, foursB); + CSA256(&twosA, &ones, ones, data[i+8], data[i+9]); + CSA256(&twosB, &ones, ones, data[i+10], data[i+11]); + CSA256(&foursA, &twos, twos, twosA, twosB); + CSA256(&twosA, &ones, ones, data[i+12], data[i+13]); + CSA256(&twosB, &ones, ones, data[i+14], data[i+15]); + CSA256(&foursB, &twos, twos, twosA, twosB); + CSA256(&eightsB, &fours, fours, foursA, foursB); + CSA256(&sixteens, &eights, eights, eightsA, eightsB); + + cnt = _mm256_add_epi64(cnt, popcnt256(sixteens)); + } + + cnt = _mm256_slli_epi64(cnt, 4); + cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(eights), 3)); + cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(fours), 2)); + cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(twos), 1)); + cnt = _mm256_add_epi64(cnt, popcnt256(ones)); + + for(; i < size; i++) + cnt = _mm256_add_epi64(cnt, popcnt256(data[i])); + + cnt64 = (uint64_t*) &cnt; + + return cnt64[0] + + cnt64[1] + + cnt64[2] + + cnt64[3]; +} + +/* Align memory to 32 bytes boundary */ +static inline void align_avx2(const uint8_t** p, 
uint64_t* size, uint64_t* cnt) +{ + for (; (uintptr_t) *p % 8; (*p)++) + { + *cnt += popcnt64(**p); + *size -= 1; + } + for (; (uintptr_t) *p % 32; (*p) += 8) + { + *cnt += popcnt64( + *(const uint64_t*) *p); + *size -= 8; + } +} + +#endif + +#if defined(HAVE_AVX512) + +#include + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx512bw"))) +#endif +static inline __m512i popcnt512(__m512i v) +{ + __m512i m1 = _mm512_set1_epi8(0x55); + __m512i m2 = _mm512_set1_epi8(0x33); + __m512i m4 = _mm512_set1_epi8(0x0F); + __m512i t1 = _mm512_sub_epi8(v, (_mm512_srli_epi16(v, 1) & m1)); + __m512i t2 = _mm512_add_epi8(t1 & m2, (_mm512_srli_epi16(t1, 2) & m2)); + __m512i t3 = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4)) & m4; + + return _mm512_sad_epu8(t3, _mm512_setzero_si512()); +} + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx512bw"))) +#endif +static inline void CSA512(__m512i* h, __m512i* l, __m512i a, __m512i b, __m512i c) +{ + *l = _mm512_ternarylogic_epi32(c, b, a, 0x96); + *h = _mm512_ternarylogic_epi32(c, b, a, 0xe8); +} + +/* + * AVX512 Harley-Seal popcount (4th iteration). + * The algorithm is based on the paper "Faster Population Counts + * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and + * Wojciech Mula (23 Nov 2016). 
+ * @see https://arxiv.org/abs/1611.07612 + */ +#if !defined(_MSC_VER) + __attribute__ ((target ("avx512bw"))) +#endif +static inline uint64_t popcnt_avx512(const __m512i* data, const uint64_t size) +{ + __m512i cnt = _mm512_setzero_si512(); + __m512i ones = _mm512_setzero_si512(); + __m512i twos = _mm512_setzero_si512(); + __m512i fours = _mm512_setzero_si512(); + __m512i eights = _mm512_setzero_si512(); + __m512i sixteens = _mm512_setzero_si512(); + __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; + + uint64_t i = 0; + uint64_t limit = size - size % 16; + uint64_t* cnt64; + + for(; i < limit; i += 16) + { + CSA512(&twosA, &ones, ones, data[i+0], data[i+1]); + CSA512(&twosB, &ones, ones, data[i+2], data[i+3]); + CSA512(&foursA, &twos, twos, twosA, twosB); + CSA512(&twosA, &ones, ones, data[i+4], data[i+5]); + CSA512(&twosB, &ones, ones, data[i+6], data[i+7]); + CSA512(&foursB, &twos, twos, twosA, twosB); + CSA512(&eightsA, &fours, fours, foursA, foursB); + CSA512(&twosA, &ones, ones, data[i+8], data[i+9]); + CSA512(&twosB, &ones, ones, data[i+10], data[i+11]); + CSA512(&foursA, &twos, twos, twosA, twosB); + CSA512(&twosA, &ones, ones, data[i+12], data[i+13]); + CSA512(&twosB, &ones, ones, data[i+14], data[i+15]); + CSA512(&foursB, &twos, twos, twosA, twosB); + CSA512(&eightsB, &fours, fours, foursA, foursB); + CSA512(&sixteens, &eights, eights, eightsA, eightsB); + + cnt = _mm512_add_epi64(cnt, popcnt512(sixteens)); + } + + cnt = _mm512_slli_epi64(cnt, 4); + cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(eights), 3)); + cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(fours), 2)); + cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(twos), 1)); + cnt = _mm512_add_epi64(cnt, popcnt512(ones)); + + for(; i < size; i++) + cnt = _mm512_add_epi64(cnt, popcnt512(data[i])); + + cnt64 = (uint64_t*) &cnt; + + return cnt64[0] + + cnt64[1] + + cnt64[2] + + cnt64[3] + + cnt64[4] + + cnt64[5] + + cnt64[6] + + cnt64[7]; +} + +/* Align memory to 64 
bytes boundary */ +static inline void align_avx512(const uint8_t** p, uint64_t* size, uint64_t* cnt) +{ + for (; (uintptr_t) *p % 8; (*p)++) + { + *cnt += popcnt64(**p); + *size -= 1; + } + for (; (uintptr_t) *p % 64; (*p) += 8) + { + *cnt += popcnt64( + *(const uint64_t*) *p); + *size -= 8; + } +} + +#endif + +/* x86 CPUs */ +#if defined(X86_OR_X64) + +/* Align memory to 8 bytes boundary */ +static inline void align_8(const uint8_t** p, uint64_t* size, uint64_t* cnt) +{ + for (; *size > 0 && (uintptr_t) *p % 8; (*p)++) + { + *cnt += popcount64(**p); + *size -= 1; + } +} + +static inline uint64_t popcount64_unrolled(const uint64_t* data, uint64_t size) +{ + uint64_t i = 0; + uint64_t limit = size - size % 4; + uint64_t cnt = 0; + + for (; i < limit; i += 4) + { + cnt += popcount64(data[i+0]); + cnt += popcount64(data[i+1]); + cnt += popcount64(data[i+2]); + cnt += popcount64(data[i+3]); + } + + for (; i < size; i++) + cnt += popcount64(data[i]); + + return cnt; +} + +/* + * Count the number of 1 bits in the data array + * @data: An array + * @size: Size of data in bytes + */ +static inline uint64_t popcnt(const void* data, uint64_t size) +{ + const uint8_t* ptr = (const uint8_t*) data; + uint64_t cnt = 0; + uint64_t i; + +#if defined(HAVE_CPUID) + #if defined(__cplusplus) + /* C++11 thread-safe singleton */ + static const int cpuid = get_cpuid(); + #else + static int cpuid_ = -1; + int cpuid = cpuid_; + if (cpuid == -1) + { + cpuid = get_cpuid(); + + #if defined(_MSC_VER) + _InterlockedCompareExchange(&cpuid_, cpuid, -1); + #else + __sync_val_compare_and_swap(&cpuid_, -1, cpuid); + #endif + } + #endif +#endif + +#if defined(HAVE_AVX512) + + /* AVX512 requires arrays >= 1024 bytes */ + if ((cpuid & bit_AVX512) && + size >= 1024) + { + align_avx512(&ptr, &size, &cnt); + cnt += popcnt_avx512((const __m512i*) ptr, size / 64); + ptr += size - size % 64; + size = size % 64; + } + +#endif + +#if defined(HAVE_AVX2) + + /* AVX2 requires arrays >= 512 bytes */ + if ((cpuid & 
bit_AVX2) && + size >= 512) + { + align_avx2(&ptr, &size, &cnt); + cnt += popcnt_avx2((const __m256i*) ptr, size / 32); + ptr += size - size % 32; + size = size % 32; + } + +#endif + +#if defined(HAVE_POPCNT) + + if (cpuid & bit_POPCNT) + { + cnt += popcnt64_unrolled((const uint64_t*) ptr, size / 8); + ptr += size - size % 8; + size = size % 8; + for (i = 0; i < size; i++) + cnt += popcnt64(ptr[i]); + + return cnt; + } + +#endif + + /* pure integer popcount algorithm */ + if (size >= 8) + { + align_8(&ptr, &size, &cnt); + cnt += popcount64_unrolled((const uint64_t*) ptr, size / 8); + ptr += size - size % 8; + size = size % 8; + } + + /* pure integer popcount algorithm */ + for (i = 0; i < size; i++) + cnt += popcount64(ptr[i]); + + return cnt; +} + +#elif defined(__ARM_NEON) || \ + defined(__aarch64__) + +#include + +/* Align memory to 8 bytes boundary */ +static inline void align_8(const uint8_t** p, uint64_t* size, uint64_t* cnt) +{ + for (; *size > 0 && (uintptr_t) *p % 8; (*p)++) + { + *cnt += popcnt64(**p); + *size -= 1; + } +} + +static inline uint64x2_t vpadalq(uint64x2_t sum, uint8x16_t t) +{ + return vpadalq_u32(sum, vpaddlq_u16(vpaddlq_u8(t))); +} + +/* + * Count the number of 1 bits in the data array + * @data: An array + * @size: Size of data in bytes + */ +static inline uint64_t popcnt(const void* data, uint64_t size) +{ + uint64_t cnt = 0; + uint64_t chunk_size = 64; + const uint8_t* ptr = (const uint8_t*) data; + + if (size >= chunk_size) + { + uint64_t i = 0; + uint64_t iters = size / chunk_size; + uint64x2_t sum = vcombine_u64(vcreate_u64(0), vcreate_u64(0)); + uint8x16_t zero = vcombine_u8(vcreate_u8(0), vcreate_u8(0)); + + do + { + uint8x16_t t0 = zero; + uint8x16_t t1 = zero; + uint8x16_t t2 = zero; + uint8x16_t t3 = zero; + + /* + * After every 31 iterations we need to add the + * temporary sums (t0, t1, t2, t3) to the total sum. + * We must ensure that the temporary sums <= 255 + * and 31 * 8 bits = 248 which is OK. 
+ */ + uint64_t limit = (i + 31 < iters) ? i + 31 : iters; + + /* Each iteration processes 64 bytes */ + for (; i < limit; i++) + { + uint8x16x4_t input = vld4q_u8(ptr); + ptr += chunk_size; + + t0 = vaddq_u8(t0, vcntq_u8(input.val[0])); + t1 = vaddq_u8(t1, vcntq_u8(input.val[1])); + t2 = vaddq_u8(t2, vcntq_u8(input.val[2])); + t3 = vaddq_u8(t3, vcntq_u8(input.val[3])); + } + + sum = vpadalq(sum, t0); + sum = vpadalq(sum, t1); + sum = vpadalq(sum, t2); + sum = vpadalq(sum, t3); + } + while (i < iters); + + uint64_t tmp[2]; + vst1q_u64(tmp, sum); + cnt += tmp[0]; + cnt += tmp[1]; + } + + size %= chunk_size; + align_8(&ptr, &size, &cnt); + const uint64_t* ptr64 = (const uint64_t*) ptr; + uint64_t iters = size / 8; + + for (uint64_t i = 0; i < iters; i++) + cnt += popcnt64(ptr64[i]); + + ptr += size - size % 8; + size = size % 8; + + for (uint64_t i = 0; i < size; i++) + cnt += popcnt64(ptr[i]); + + return cnt; +} + +/* all other CPUs */ +#else + +/* Align memory to 8 bytes boundary */ +static inline void align_8(const uint8_t** p, uint64_t* size, uint64_t* cnt) +{ + for (; *size > 0 && (uintptr_t) *p % 8; (*p)++) + { + *cnt += popcnt64(**p); + *size -= 1; + } +} + +/* + * Count the number of 1 bits in the data array + * @data: An array + * @size: Size of data in bytes + */ +static inline uint64_t popcnt(const void* data, uint64_t size) +{ + const uint8_t* ptr = (const uint8_t*) data; + uint64_t cnt = 0; + uint64_t i; + + align_8(&ptr, &size, &cnt); + cnt += popcnt64_unrolled((const uint64_t*) ptr, size / 8); + ptr += size - size % 8; + size = size % 8; + for (i = 0; i < size; i++) + cnt += popcnt64(ptr[i]); + + return cnt; +} + +#endif + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LIBPOPCNT_H */ diff --git a/src/roaring.c b/src/roaring.c new file mode 100644 index 00000000..4b28dd73 --- /dev/null +++ b/src/roaring.c @@ -0,0 +1,20961 @@ +// !!! DO NOT EDIT - THIS IS AN AUTO-GENERATED FILE !!! 
+// Created by amalgamation.sh on 2023-06-08T13:19:03Z + +/* + * The CRoaring project is under a dual license (Apache/MIT). + * Users of the library may choose one or the other license. + */ +/* + * Copyright 2016-2022 The CRoaring authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * MIT License + * + * Copyright 2016-2022 The CRoaring authors + * + * Permission is hereby granted, free of charge, to any + * person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the + * Software without restriction, including without + * limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software + * is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice + * shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF + * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED + * TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + * PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT + * SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR + * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * SPDX-License-Identifier: MIT + */ + +#include "roaring.h" + +/* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */ +#ifdef DMALLOC +#include "dmalloc.h" +#endif + +#include "roaring.h" /* include public API definitions */ +/* begin file include/roaring/isadetection.h */ +#ifndef ROARING_ISADETECTION_H +#define ROARING_ISADETECTION_H +#if defined(__x86_64__) || defined(_M_AMD64) // x64 + + + + +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#ifdef __has_include +// We want to make sure that the AVX-512 functions are only built on compilers +// fully supporting AVX-512. +#if __has_include() +#define CROARING_COMPILER_SUPPORTS_AVX512 1 +#endif // #if __has_include() +#endif // #ifdef __has_include + +// Visual Studio 2019 and up support AVX-512 +#ifdef _MSC_VER +#if _MSC_VER >= 1920 +#define CROARING_COMPILER_SUPPORTS_AVX512 1 +#endif // #if _MSC_VER >= 1920 +#endif // #ifdef _MSC_VER + +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#define CROARING_COMPILER_SUPPORTS_AVX512 0 +#endif // #ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#endif // #ifndef CROARING_COMPILER_SUPPORTS_AVX512 + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif +enum { + ROARING_SUPPORTS_AVX2 = 1, + ROARING_SUPPORTS_AVX512 = 2, +}; +int croaring_hardware_support(void); +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif +#endif // x64 +#endif // ROARING_ISADETECTION_H +/* end file include/roaring/isadetection.h */ +/* begin file include/roaring/containers/perfparameters.h */ +#ifndef PERFPARAMETERS_H_ +#define PERFPARAMETERS_H_ + +#include + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +/** +During 
lazy computations, we can transform array containers into bitset +containers as +long as we can expect them to have ARRAY_LAZY_LOWERBOUND values. +*/ +enum { ARRAY_LAZY_LOWERBOUND = 1024 }; + +/* default initial size of a run container + setting it to zero delays the malloc.*/ +enum { RUN_DEFAULT_INIT_SIZE = 0 }; + +/* default initial size of an array container + setting it to zero delays the malloc */ +enum { ARRAY_DEFAULT_INIT_SIZE = 0 }; + +/* automatic bitset conversion during lazy or */ +#ifndef LAZY_OR_BITSET_CONVERSION +#define LAZY_OR_BITSET_CONVERSION true +#endif + +/* automatically attempt to convert a bitset to a full run during lazy + * evaluation */ +#ifndef LAZY_OR_BITSET_CONVERSION_TO_FULL +#define LAZY_OR_BITSET_CONVERSION_TO_FULL true +#endif + +/* automatically attempt to convert a bitset to a full run */ +#ifndef OR_BITSET_CONVERSION_TO_FULL +#define OR_BITSET_CONVERSION_TO_FULL true +#endif + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif + +#endif +/* end file include/roaring/containers/perfparameters.h */ +/* begin file include/roaring/containers/container_defs.h */ +/* + * container_defs.h + * + * Unlike containers.h (which is a file aggregating all the container includes, + * like array.h, bitset.h, and run.h) this is a file included BY those headers + * to do things like define the container base class `container_t`. + */ + +#ifndef INCLUDE_CONTAINERS_CONTAINER_DEFS_H_ +#define INCLUDE_CONTAINERS_CONTAINER_DEFS_H_ + +#ifdef __cplusplus + #include // used by casting helper for compile-time check +#endif + +// The preferences are a separate file to separate out tweakable parameters + +#ifdef __cplusplus +namespace roaring { namespace internal { // No extern "C" (contains template) +#endif + + +/* + * Since roaring_array_t's definition is not opaque, the container type is + * part of the API. 
If it's not going to be `void*` then it needs a name, and + * expectations are to prefix C library-exported names with `roaring_` etc. + * + * Rather than force the whole codebase to use the name `roaring_container_t`, + * the few API appearances use the macro ROARING_CONTAINER_T. Those includes + * are prior to containers.h, so make a short private alias of `container_t`. + * Then undefine the awkward macro so it's not used any more than it has to be. + */ +typedef ROARING_CONTAINER_T container_t; +#undef ROARING_CONTAINER_T + + +/* + * See ROARING_CONTAINER_T for notes on using container_t as a base class. + * This macro helps make the following pattern look nicer: + * + * #ifdef __cplusplus + * struct roaring_array_s : public container_t { + * #else + * struct roaring_array_s { + * #endif + * int32_t cardinality; + * int32_t capacity; + * uint16_t *array; + * } + */ +#if defined(__cplusplus) + #define STRUCT_CONTAINER(name) \ + struct name : public container_t /* { ... } */ +#else + #define STRUCT_CONTAINER(name) \ + struct name /* { ... } */ +#endif + + +/** + * Since container_t* is not void* in C++, "dangerous" casts are not needed to + * downcast; only a static_cast<> is needed. Define a macro for static casting + * which helps make casts more visible, and catches problems at compile-time + * when building the C sources in C++ mode: + * + * void some_func(container_t **c, ...) { // double pointer, not single + * array_container_t *ac1 = (array_container_t *)(c); // uncaught!! + * + * array_container_t *ac2 = CAST(array_container_t *, c) // C++ errors + * array_container_t *ac3 = CAST_array(c); // shorthand for #2, errors + * } + * + * Trickier to do is a cast from `container**` to `array_container_t**`. This + * needs a reinterpret_cast<>, which sacrifices safety...so a template is used + * leveraging to make sure it's legal in the C++ build. 
+ */ +#ifdef __cplusplus + #define CAST(type,value) static_cast(value) + #define movable_CAST(type,value) movable_CAST_HELPER(value) + + template + PPDerived movable_CAST_HELPER(Base **ptr_to_ptr) { + typedef typename std::remove_pointer::type PDerived; + typedef typename std::remove_pointer::type Derived; + static_assert( + std::is_base_of::value, + "use movable_CAST() for container_t** => xxx_container_t**" + ); + return reinterpret_cast(ptr_to_ptr); + } +#else + #define CAST(type,value) ((type)value) + #define movable_CAST(type, value) ((type)value) +#endif + +// Use for converting e.g. an `array_container_t**` to a `container_t**` +// +#define movable_CAST_base(c) movable_CAST(container_t **, c) + + +#ifdef __cplusplus +} } // namespace roaring { namespace internal { +#endif + +#endif /* INCLUDE_CONTAINERS_CONTAINER_DEFS_H_ */ +/* end file include/roaring/containers/container_defs.h */ +/* begin file include/roaring/array_util.h */ +#ifndef ARRAY_UTIL_H +#define ARRAY_UTIL_H + +#include // for size_t +#include + + +#if CROARING_IS_X64 +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." +#endif // CROARING_COMPILER_SUPPORTS_AVX512 +#endif + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +/* + * Good old binary search. + * Assumes that array is sorted, has logarithmic complexity. + * if the result is x, then: + * if ( x>0 ) you have array[x] = ikey + * if ( x<0 ) then inserting ikey at position -x-1 in array (insuring that array[-x-1]=ikey) + * keys the array sorted. 
+ */ +inline int32_t binarySearch(const uint16_t *array, int32_t lenarray, + uint16_t ikey) { + int32_t low = 0; + int32_t high = lenarray - 1; + while (low <= high) { + int32_t middleIndex = (low + high) >> 1; + uint16_t middleValue = array[middleIndex]; + if (middleValue < ikey) { + low = middleIndex + 1; + } else if (middleValue > ikey) { + high = middleIndex - 1; + } else { + return middleIndex; + } + } + return -(low + 1); +} + +/** + * Galloping search + * Assumes that array is sorted, has logarithmic complexity. + * if the result is x, then if x = length, you have that all values in array between pos and length + * are smaller than min. + * otherwise returns the first index x such that array[x] >= min. + */ +static inline int32_t advanceUntil(const uint16_t *array, int32_t pos, + int32_t length, uint16_t min) { + int32_t lower = pos + 1; + + if ((lower >= length) || (array[lower] >= min)) { + return lower; + } + + int32_t spansize = 1; + + while ((lower + spansize < length) && (array[lower + spansize] < min)) { + spansize <<= 1; + } + int32_t upper = (lower + spansize < length) ? lower + spansize : length - 1; + + if (array[upper] == min) { + return upper; + } + if (array[upper] < min) { + // means + // array + // has no + // item + // >= min + // pos = array.length; + return length; + } + + // we know that the next-smallest span was too small + lower += (spansize >> 1); + + int32_t mid = 0; + while (lower + 1 != upper) { + mid = (lower + upper) >> 1; + if (array[mid] == min) { + return mid; + } else if (array[mid] < min) { + lower = mid; + } else { + upper = mid; + } + } + return upper; +} + +/** + * Returns number of elements which are less than ikey. + * Array elements must be unique and sorted. + */ +static inline int32_t count_less(const uint16_t *array, int32_t lenarray, + uint16_t ikey) { + if (lenarray == 0) return 0; + int32_t pos = binarySearch(array, lenarray, ikey); + return pos >= 0 ? 
pos : -(pos+1); +} + +/** + * Returns number of elements which are greater than ikey. + * Array elements must be unique and sorted. + */ +static inline int32_t count_greater(const uint16_t *array, int32_t lenarray, + uint16_t ikey) { + if (lenarray == 0) return 0; + int32_t pos = binarySearch(array, lenarray, ikey); + if (pos >= 0) { + return lenarray - (pos+1); + } else { + return lenarray - (-pos-1); + } +} + +/** + * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions + * Optimized by D. Lemire on May 3rd 2013 + * + * C should have capacity greater than the minimum of s_1 and s_b + 8 + * where 8 is sizeof(__m128i)/sizeof(uint16_t). + */ +int32_t intersect_vector16(const uint16_t *__restrict__ A, size_t s_a, + const uint16_t *__restrict__ B, size_t s_b, + uint16_t *C); + +int32_t intersect_vector16_inplace(uint16_t *__restrict__ A, size_t s_a, + const uint16_t *__restrict__ B, size_t s_b); + +/** + * Take an array container and write it out to a 32-bit array, using base + * as the offset. + */ +int array_container_to_uint32_array_vector16(void *vout, const uint16_t* array, size_t cardinality, + uint32_t base); +#if CROARING_COMPILER_SUPPORTS_AVX512 +int avx512_array_container_to_uint32_array(void *vout, const uint16_t* array, size_t cardinality, + uint32_t base); +#endif +/** + * Compute the cardinality of the intersection using SSE4 instructions + */ +int32_t intersect_vector16_cardinality(const uint16_t *__restrict__ A, + size_t s_a, + const uint16_t *__restrict__ B, + size_t s_b); + +/* Computes the intersection between one small and one large set of uint16_t. + * Stores the result into buffer and return the number of elements. */ +int32_t intersect_skewed_uint16(const uint16_t *smallarray, size_t size_s, + const uint16_t *largearray, size_t size_l, + uint16_t *buffer); + +/* Computes the size of the intersection between one small and one large set of + * uint16_t. 
*/ +int32_t intersect_skewed_uint16_cardinality(const uint16_t *smallarray, + size_t size_s, + const uint16_t *largearray, + size_t size_l); + + +/* Check whether the size of the intersection between one small and one large set of uint16_t is non-zero. */ +bool intersect_skewed_uint16_nonempty(const uint16_t *smallarray, size_t size_s, + const uint16_t *largearray, size_t size_l); +/** + * Generic intersection function. + */ +int32_t intersect_uint16(const uint16_t *A, const size_t lenA, + const uint16_t *B, const size_t lenB, uint16_t *out); +/** + * Compute the size of the intersection (generic). + */ +int32_t intersect_uint16_cardinality(const uint16_t *A, const size_t lenA, + const uint16_t *B, const size_t lenB); + +/** + * Checking whether the size of the intersection is non-zero. + */ +bool intersect_uint16_nonempty(const uint16_t *A, const size_t lenA, + const uint16_t *B, const size_t lenB); +/** + * Generic union function. + */ +size_t union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2, + size_t size_2, uint16_t *buffer); + +/** + * Generic XOR function. + */ +int32_t xor_uint16(const uint16_t *array_1, int32_t card_1, + const uint16_t *array_2, int32_t card_2, uint16_t *out); + +/** + * Generic difference function (ANDNOT). + */ +int difference_uint16(const uint16_t *a1, int length1, const uint16_t *a2, + int length2, uint16_t *a_out); + +/** + * Generic intersection function. + */ +size_t intersection_uint32(const uint32_t *A, const size_t lenA, + const uint32_t *B, const size_t lenB, uint32_t *out); + +/** + * Generic intersection function, returns just the cardinality. + */ +size_t intersection_uint32_card(const uint32_t *A, const size_t lenA, + const uint32_t *B, const size_t lenB); + +/** + * Generic union function. + */ +size_t union_uint32(const uint32_t *set_1, size_t size_1, const uint32_t *set_2, + size_t size_2, uint32_t *buffer); + +/** + * A fast SSE-based union function. 
 */
/* All three pointers are __restrict__-qualified: the caller promises that the
 * two inputs and the output buffer do not alias each other. */
uint32_t union_vector16(const uint16_t *__restrict__ set_1, uint32_t size_1,
                        const uint16_t *__restrict__ set_2, uint32_t size_2,
                        uint16_t *__restrict__ buffer);
/**
 * A fast SSE-based XOR function.
 * Inputs and output are __restrict__-qualified (must not alias).
 */
uint32_t xor_vector16(const uint16_t *__restrict__ array1, uint32_t length1,
                      const uint16_t *__restrict__ array2, uint32_t length2,
                      uint16_t *__restrict__ output);

/**
 * A fast SSE-based difference function.
 * Only A and B are __restrict__-qualified; the output C is not.
 * NOTE(review): this may be deliberate to allow C to overlap an input --
 * confirm against the implementation.
 */
int32_t difference_vector16(const uint16_t *__restrict__ A, size_t s_a,
                            const uint16_t *__restrict__ B, size_t s_b,
                            uint16_t *C);

/**
 * Generic union function, returns just the cardinality.
 */
size_t union_uint32_card(const uint32_t *set_1, size_t size_1,
                         const uint32_t *set_2, size_t size_2);

/**
* combines union_uint16 and union_vector16 optimally
*/
size_t fast_union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2,
                    size_t size_2, uint16_t *buffer);


/* Byte-wise comparison helper taking a length n in bytes.
 * NOTE(review): presumably returns true when the first n bytes of s1 and s2
 * are identical -- confirm. array_container_equals in this file calls it with
 * cardinality * 2 as n. */
bool memequals(const void *s1, const void *s2, size_t n);

#ifdef __cplusplus
} } } // extern "C" { namespace roaring { namespace internal {
#endif

#endif
/* end file include/roaring/array_util.h */
/* begin file include/roaring/utilasm.h */
/*
 * utilasm.h
 *
 * Inline-assembly helpers, only defined when CROARING_INLINE_ASM is defined.
 * The asm templates use GCC extended-asm syntax; operand order in the
 * templates is AT&T (source(s) first, destination last).
 */

#ifndef INCLUDE_UTILASM_H_
#define INCLUDE_UTILASM_H_


#ifdef __cplusplus
extern "C" { namespace roaring {
#endif

#if defined(CROARING_INLINE_ASM)
#define CROARING_ASMBITMANIPOPTIMIZATION  // optimization flag

/* destReg = srcReg >> bitsReg, using the shrx instruction.
 * Operands: %0 = destReg (write-only), %1 = bitsReg, %2 = srcReg. */
#define ASM_SHIFT_RIGHT(srcReg, bitsReg, destReg) \
    __asm volatile("shrx %1, %2, %0"              \
                   : "=r"(destReg)                \
                   : /* write */                  \
                   "r"(bitsReg), /* read only */  \
                   "r"(srcReg)   /* read only */  \
                   )

/* srcReg >>= bitsReg, in place (srcReg is a read/write operand). */
#define ASM_INPLACESHIFT_RIGHT(srcReg, bitsReg)  \
    __asm volatile("shrx %1, %0, %0"             \
                   : "+r"(srcReg)                \
                   : /* read/write */            \
                   "r"(bitsReg) /* read only */  \
                   )

/* destReg = srcReg << bitsReg, using the shlx instruction. */
#define ASM_SHIFT_LEFT(srcReg, bitsReg, destReg) \
    __asm volatile("shlx %1, %2, %0"             \
                   : "=r"(destReg)               \
                   : /* write */                 \
                   "r"(bitsReg), /* read only */ \
                   "r"(srcReg)   /* read only */ \
                   )
// set bit at position testBit within testByte to 1 and
// increment count if that bit was previously clear:
// bts leaves the old bit value in the carry flag, and "sbb $-1" then
// computes count = count + 1 - carry.
#define ASM_SET_BIT_INC_WAS_CLEAR(testByte, testBit, count) \
    __asm volatile(                                          \
        "bts %2, %0\n"                                       \
        "sbb $-1, %1\n"                                      \
        : "+r"(testByte), /* read/write */                   \
          "+r"(count)                                        \
        : /* read/write */                                   \
        "r"(testBit) /* read only */                         \
        )

/* Clear bit testBit within testByte and decrement count if that bit was
 * previously set: btr leaves the old bit value in the carry flag, and
 * "sbb $0" then computes count = count - carry. */
#define ASM_CLEAR_BIT_DEC_WAS_SET(testByte, testBit, count) \
    __asm volatile(                                          \
        "btr %2, %0\n"                                       \
        "sbb $0, %1\n"                                       \
        : "+r"(testByte), /* read/write */                   \
          "+r"(count)                                        \
        : /* read/write */                                   \
        "r"(testBit) /* read only */                         \
        )

/* Test bit testBit of testByte without modifying it. bt copies the bit into
 * the carry flag and "sbb %0,%0" turns it into count = -carry, i.e. count is
 * 0 when the bit is clear and all-ones when it is set. */
#define ASM_BT64(testByte, testBit, count) \
    __asm volatile(                         \
        "bt %2,%1\n"                        \
        "sbb %0,%0" /*could use setb */     \
        : "=r"(count)                       \
        : /* write */                       \
        "r"(testByte), /* read only */      \
        "r"(testBit)   /* read only */      \
        )

#endif

#ifdef __cplusplus
} } // extern "C" { namespace roaring {
#endif

#endif /* INCLUDE_UTILASM_H_ */
/* end file include/roaring/utilasm.h */
/* begin file include/roaring/bitset_util.h */
#ifndef BITSET_UTIL_H
#define BITSET_UTIL_H

/* NOTE(review): the include target is missing in this copy (it looks like an
 * angle-bracket header name was lost during extraction) -- restore it from
 * the upstream file. */
#include


/* On x64 builds the AVX-512 capability macro must already be defined (to
 * either 0 or 1) before this point; the #error below enforces that. */
#if CROARING_IS_X64
#ifndef CROARING_COMPILER_SUPPORTS_AVX512
#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
#endif // CROARING_COMPILER_SUPPORTS_AVX512
#endif

#ifdef __cplusplus
extern "C" { namespace roaring { namespace internal {
#endif

/*
 * Set all bits in indexes [begin,end) to true.
+ */ +static inline void bitset_set_range(uint64_t *words, uint32_t start, + uint32_t end) { + if (start == end) return; + uint32_t firstword = start / 64; + uint32_t endword = (end - 1) / 64; + if (firstword == endword) { + words[firstword] |= ((~UINT64_C(0)) << (start % 64)) & + ((~UINT64_C(0)) >> ((~end + 1) % 64)); + return; + } + words[firstword] |= (~UINT64_C(0)) << (start % 64); + for (uint32_t i = firstword + 1; i < endword; i++) { + words[i] = ~UINT64_C(0); + } + words[endword] |= (~UINT64_C(0)) >> ((~end + 1) % 64); +} + + +/* + * Find the cardinality of the bitset in [begin,begin+lenminusone] + */ +static inline int bitset_lenrange_cardinality(const uint64_t *words, + uint32_t start, + uint32_t lenminusone) { + uint32_t firstword = start / 64; + uint32_t endword = (start + lenminusone) / 64; + if (firstword == endword) { + return roaring_hamming(words[firstword] & + ((~UINT64_C(0)) >> ((63 - lenminusone) % 64)) + << (start % 64)); + } + int answer = roaring_hamming(words[firstword] & ((~UINT64_C(0)) << (start % 64))); + for (uint32_t i = firstword + 1; i < endword; i++) { + answer += roaring_hamming(words[i]); + } + answer += + roaring_hamming(words[endword] & + (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64)); + return answer; +} + +/* + * Check whether the cardinality of the bitset in [begin,begin+lenminusone] is 0 + */ +static inline bool bitset_lenrange_empty(const uint64_t *words, uint32_t start, + uint32_t lenminusone) { + uint32_t firstword = start / 64; + uint32_t endword = (start + lenminusone) / 64; + if (firstword == endword) { + return (words[firstword] & ((~UINT64_C(0)) >> ((63 - lenminusone) % 64)) + << (start % 64)) == 0; + } + if (((words[firstword] & ((~UINT64_C(0)) << (start%64)))) != 0) { + return false; + } + for (uint32_t i = firstword + 1; i < endword; i++) { + if (words[i] != 0) { + return false; + } + } + if ((words[endword] & (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64)) != 0) { + return false; + } + return 
true; +} + + +/* + * Set all bits in indexes [begin,begin+lenminusone] to true. + */ +static inline void bitset_set_lenrange(uint64_t *words, uint32_t start, + uint32_t lenminusone) { + uint32_t firstword = start / 64; + uint32_t endword = (start + lenminusone) / 64; + if (firstword == endword) { + words[firstword] |= ((~UINT64_C(0)) >> ((63 - lenminusone) % 64)) + << (start % 64); + return; + } + uint64_t temp = words[endword]; + words[firstword] |= (~UINT64_C(0)) << (start % 64); + for (uint32_t i = firstword + 1; i < endword; i += 2) + words[i] = words[i + 1] = ~UINT64_C(0); + words[endword] = + temp | (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64); +} + +/* + * Flip all the bits in indexes [begin,end). + */ +static inline void bitset_flip_range(uint64_t *words, uint32_t start, + uint32_t end) { + if (start == end) return; + uint32_t firstword = start / 64; + uint32_t endword = (end - 1) / 64; + words[firstword] ^= ~((~UINT64_C(0)) << (start % 64)); + for (uint32_t i = firstword; i < endword; i++) { + words[i] = ~words[i]; + } + words[endword] ^= ((~UINT64_C(0)) >> ((~end + 1) % 64)); +} + +/* + * Set all bits in indexes [begin,end) to false. + */ +static inline void bitset_reset_range(uint64_t *words, uint32_t start, + uint32_t end) { + if (start == end) return; + uint32_t firstword = start / 64; + uint32_t endword = (end - 1) / 64; + if (firstword == endword) { + words[firstword] &= ~(((~UINT64_C(0)) << (start % 64)) & + ((~UINT64_C(0)) >> ((~end + 1) % 64))); + return; + } + words[firstword] &= ~((~UINT64_C(0)) << (start % 64)); + for (uint32_t i = firstword + 1; i < endword; i++) { + words[i] = UINT64_C(0); + } + words[endword] &= ~((~UINT64_C(0)) >> ((~end + 1) % 64)); +} + +/* + * Given a bitset containing "length" 64-bit words, write out the position + * of all the set bits to "out", values start at "base". + * + * The "out" pointer should be sufficient to store the actual number of bits + * set. 
 *
 * Returns how many values were actually decoded.
 *
 * This function should only be expected to be faster than
 * bitset_extract_setbits
 * when the density of the bitset is high.
 *
 * This function uses AVX2 decoding.
 *
 * NOTE(review): "outcapacity" is presumably the number of uint32_t slots
 * available in "out" -- confirm against the implementation.
 */
size_t bitset_extract_setbits_avx2(const uint64_t *words, size_t length,
                                   uint32_t *out, size_t outcapacity,
                                   uint32_t base);

/* Same signature as bitset_extract_setbits_avx2.
 * NOTE(review): presumably the AVX-512 counterpart with the same contract --
 * confirm. */
size_t bitset_extract_setbits_avx512(const uint64_t *words, size_t length,
                                     uint32_t *out, size_t outcapacity,
                                     uint32_t base);
/*
 * Given a bitset containing "length" 64-bit words, write out the position
 * of all the set bits to "out", values start at "base".
 *
 * The "out" pointer should be sufficient to store the actual number of bits
 * set. (Unlike the AVX variants above, no capacity argument is passed.)
 *
 * Returns how many values were actually decoded.
 */
size_t bitset_extract_setbits(const uint64_t *words, size_t length,
                              uint32_t *out, uint32_t base);

/*
 * Given a bitset containing "length" 64-bit words, write out the position
 * of all the set bits to "out" as 16-bit integers, values start at "base" (can
 * be set to zero)
 *
 * The "out" pointer should be sufficient to store the actual number of bits
 * set.
 *
 * Returns how many values were actually decoded.
 *
 * This function should only be expected to be faster than
 * bitset_extract_setbits_uint16
 * when the density of the bitset is high.
 *
 * This function uses SSE decoding.
 */
size_t bitset_extract_setbits_sse_uint16(const uint64_t *words, size_t length,
                                         uint16_t *out, size_t outcapacity,
                                         uint16_t base);

/* Same signature as bitset_extract_setbits_sse_uint16.
 * NOTE(review): presumably the AVX-512 counterpart with the same contract --
 * confirm. */
size_t bitset_extract_setbits_avx512_uint16(const uint64_t *words, size_t length,
                                         uint16_t *out, size_t outcapacity,
                                         uint16_t base);

/*
 * Given a bitset containing "length" 64-bit words, write out the position
 * of all the set bits to "out",  values start at "base"
 * (can be set to zero)
 *
 * The "out" pointer should be sufficient to store the actual number of bits
 * set.
 *
 * Returns how many values were actually decoded.
 */
size_t bitset_extract_setbits_uint16(const uint64_t *words, size_t length,
                                     uint16_t *out, uint16_t base);

/*
 * Given two bitsets containing "length" 64-bit words, write out the position
 * of all the common set bits to "out", values start at "base"
 * (can be set to zero)
 *
 * The "out" pointer should be sufficient to store the actual number of bits
 * set.
 *
 * Returns how many values were actually decoded.
 *
 * Both inputs are __restrict__-qualified, i.e. the caller promises that they
 * do not alias.
 */
size_t bitset_extract_intersection_setbits_uint16(const uint64_t * __restrict__ words1,
                                                  const uint64_t * __restrict__ words2,
                                                  size_t length, uint16_t *out,
                                                  uint16_t base);

/*
 * Given a bitset having cardinality card, set all bit values in the list (there
 * are length of them)
 * and return the updated cardinality.  This evidently assumes that the bitset
 * already contained data.
 */
uint64_t bitset_set_list_withcard(uint64_t *words, uint64_t card,
                                  const uint16_t *list, uint64_t length);
/*
 * Given a bitset, set all bit values in the list (there
 * are length of them). No cardinality is tracked or returned.
 */
void bitset_set_list(uint64_t *words, const uint16_t *list, uint64_t length);

/*
 * Given a bitset having cardinality card, unset all bit values in the list
 * (there are length of them)
 * and return the updated cardinality.  This evidently assumes that the bitset
 * already contained data.
 */
uint64_t bitset_clear_list(uint64_t *words, uint64_t card, const uint16_t *list,
                           uint64_t length);

/*
 * Given a bitset having cardinality card, toggle all bit values in the list
 * (there are length of them)
 * and return the updated cardinality.  This evidently assumes that the bitset
 * already contained data.
 */

uint64_t bitset_flip_list_withcard(uint64_t *words, uint64_t card,
                                   const uint16_t *list, uint64_t length);

/* Same parameters as bitset_flip_list_withcard minus the cardinality.
 * NOTE(review): presumably toggles the listed bits without tracking the
 * cardinality -- confirm. */
void bitset_flip_list(uint64_t *words, const uint16_t *list, uint64_t length);

#if CROARING_IS_X64
/***
 * BEGIN Harley-Seal popcount functions.
+ */ +CROARING_TARGET_AVX2 +/** + * Compute the population count of a 256-bit word + * This is not especially fast, but it is convenient as part of other functions. + */ +static inline __m256i popcount256(__m256i v) { + const __m256i lookuppos = _mm256_setr_epi8( + /* 0 */ 4 + 0, /* 1 */ 4 + 1, /* 2 */ 4 + 1, /* 3 */ 4 + 2, + /* 4 */ 4 + 1, /* 5 */ 4 + 2, /* 6 */ 4 + 2, /* 7 */ 4 + 3, + /* 8 */ 4 + 1, /* 9 */ 4 + 2, /* a */ 4 + 2, /* b */ 4 + 3, + /* c */ 4 + 2, /* d */ 4 + 3, /* e */ 4 + 3, /* f */ 4 + 4, + + /* 0 */ 4 + 0, /* 1 */ 4 + 1, /* 2 */ 4 + 1, /* 3 */ 4 + 2, + /* 4 */ 4 + 1, /* 5 */ 4 + 2, /* 6 */ 4 + 2, /* 7 */ 4 + 3, + /* 8 */ 4 + 1, /* 9 */ 4 + 2, /* a */ 4 + 2, /* b */ 4 + 3, + /* c */ 4 + 2, /* d */ 4 + 3, /* e */ 4 + 3, /* f */ 4 + 4); + const __m256i lookupneg = _mm256_setr_epi8( + /* 0 */ 4 - 0, /* 1 */ 4 - 1, /* 2 */ 4 - 1, /* 3 */ 4 - 2, + /* 4 */ 4 - 1, /* 5 */ 4 - 2, /* 6 */ 4 - 2, /* 7 */ 4 - 3, + /* 8 */ 4 - 1, /* 9 */ 4 - 2, /* a */ 4 - 2, /* b */ 4 - 3, + /* c */ 4 - 2, /* d */ 4 - 3, /* e */ 4 - 3, /* f */ 4 - 4, + + /* 0 */ 4 - 0, /* 1 */ 4 - 1, /* 2 */ 4 - 1, /* 3 */ 4 - 2, + /* 4 */ 4 - 1, /* 5 */ 4 - 2, /* 6 */ 4 - 2, /* 7 */ 4 - 3, + /* 8 */ 4 - 1, /* 9 */ 4 - 2, /* a */ 4 - 2, /* b */ 4 - 3, + /* c */ 4 - 2, /* d */ 4 - 3, /* e */ 4 - 3, /* f */ 4 - 4); + const __m256i low_mask = _mm256_set1_epi8(0x0f); + + const __m256i lo = _mm256_and_si256(v, low_mask); + const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); + const __m256i popcnt1 = _mm256_shuffle_epi8(lookuppos, lo); + const __m256i popcnt2 = _mm256_shuffle_epi8(lookupneg, hi); + return _mm256_sad_epu8(popcnt1, popcnt2); +} +CROARING_UNTARGET_AVX2 + +CROARING_TARGET_AVX2 +/** + * Simple CSA over 256 bits + */ +static inline void CSA(__m256i *h, __m256i *l, __m256i a, __m256i b, + __m256i c) { + const __m256i u = _mm256_xor_si256(a, b); + *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); + *l = _mm256_xor_si256(u, c); +} 
CROARING_UNTARGET_AVX2

CROARING_TARGET_AVX2
/**
 * Fast Harley-Seal AVX population count function
 *
 * Counts the set bits in `size` consecutive 256-bit vectors starting at
 * `data` (loaded with unaligned loads).
 *
 * The main loop consumes 16 vectors per iteration and feeds them through a
 * tree of CSA (carry-save adder) steps, so that only one popcount256 call
 * (on `sixteens`) is needed per 16 inputs. The accumulators ones/twos/fours/
 * eights hold partial sums whose bits carry weight 1/2/4/8; they are folded
 * into the total, with the matching shift, after the loop. Any remaining
 * vectors (size % 16) are counted one at a time.
 *
 * Returns the sum of the four 64-bit lanes of the accumulated total.
 */
inline static uint64_t avx2_harley_seal_popcount256(const __m256i *data,
                                                    const uint64_t size) {
    __m256i total = _mm256_setzero_si256();
    __m256i ones = _mm256_setzero_si256();
    __m256i twos = _mm256_setzero_si256();
    __m256i fours = _mm256_setzero_si256();
    __m256i eights = _mm256_setzero_si256();
    __m256i sixteens = _mm256_setzero_si256();
    __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;

    /* Largest multiple of 16 that is <= size. */
    const uint64_t limit = size - size % 16;
    uint64_t i = 0;

    for (; i < limit; i += 16) {
        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i),
            _mm256_lddqu_si256(data + i + 1));
        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 2),
            _mm256_lddqu_si256(data + i + 3));
        CSA(&foursA, &twos, twos, twosA, twosB);
        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 4),
            _mm256_lddqu_si256(data + i + 5));
        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 6),
            _mm256_lddqu_si256(data + i + 7));
        CSA(&foursB, &twos, twos, twosA, twosB);
        CSA(&eightsA, &fours, fours, foursA, foursB);
        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 8),
            _mm256_lddqu_si256(data + i + 9));
        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 10),
            _mm256_lddqu_si256(data + i + 11));
        CSA(&foursA, &twos, twos, twosA, twosB);
        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 12),
            _mm256_lddqu_si256(data + i + 13));
        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 14),
            _mm256_lddqu_si256(data + i + 15));
        CSA(&foursB, &twos, twos, twosA, twosB);
        CSA(&eightsB, &fours, fours, foursA, foursB);
        CSA(&sixteens, &eights, eights, eightsA, eightsB);

        total = _mm256_add_epi64(total, popcount256(sixteens));
    }

    total = _mm256_slli_epi64(total, 4);  // * 16
    total = _mm256_add_epi64(
        total, _mm256_slli_epi64(popcount256(eights), 3));  // += 8 * ...
    total = _mm256_add_epi64(
        total, _mm256_slli_epi64(popcount256(fours), 2));  // += 4 * ...
    total = _mm256_add_epi64(
        total, _mm256_slli_epi64(popcount256(twos), 1));  // += 2 * ...
    total = _mm256_add_epi64(total, popcount256(ones));
    /* Tail: fewer than 16 vectors left, count them individually. */
    for (; i < size; i++)
        total =
            _mm256_add_epi64(total, popcount256(_mm256_lddqu_si256(data + i)));

    return (uint64_t)(_mm256_extract_epi64(total, 0)) +
           (uint64_t)(_mm256_extract_epi64(total, 1)) +
           (uint64_t)(_mm256_extract_epi64(total, 2)) +
           (uint64_t)(_mm256_extract_epi64(total, 3));
}
CROARING_UNTARGET_AVX2

/*
 * AVXPOPCNTFNC(opname, avx_intrinsic) generates two functions:
 *
 *   avx2_harley_seal_popcount256_<opname>(data1, data2, size)
 *       returns the population count of avx_intrinsic(data1[i], data2[i])
 *       over i in [0, size), using the same 16-at-a-time CSA tree as
 *       avx2_harley_seal_popcount256;
 *
 *   avx2_harley_seal_popcount256andstore_<opname>(data1, data2, out, size)
 *       does the same and additionally stores every combined vector to
 *       out[i]. All three pointers are __restrict__-qualified here, so the
 *       caller promises that they do not alias.
 *
 * `size` is a count of 256-bit vectors. Loads and stores are unaligned.
 */
#define AVXPOPCNTFNC(opname, avx_intrinsic) \
    static inline uint64_t avx2_harley_seal_popcount256_##opname( \
        const __m256i *data1, const __m256i *data2, const uint64_t size) { \
        __m256i total = _mm256_setzero_si256(); \
        __m256i ones = _mm256_setzero_si256(); \
        __m256i twos = _mm256_setzero_si256(); \
        __m256i fours = _mm256_setzero_si256(); \
        __m256i eights = _mm256_setzero_si256(); \
        __m256i sixteens = _mm256_setzero_si256(); \
        __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; \
        __m256i A1, A2; \
        const uint64_t limit = size - size % 16; \
        uint64_t i = 0; \
        for (; i < limit; i += 16) { \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \
                               _mm256_lddqu_si256(data2 + i)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 1), \
                               _mm256_lddqu_si256(data2 + i + 1)); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 2), \
                               _mm256_lddqu_si256(data2 + i + 2)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 3), \
                               _mm256_lddqu_si256(data2 + i + 3)); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursA, &twos, twos, twosA, twosB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 4), \
                               _mm256_lddqu_si256(data2 + i + 4)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 5), \
                               _mm256_lddqu_si256(data2 + i + 5)); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 6), \
                               _mm256_lddqu_si256(data2 + i + 6)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 7), \
                               _mm256_lddqu_si256(data2 + i + 7)); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursB, &twos, twos, twosA, twosB); \
            CSA(&eightsA, &fours, fours, foursA, foursB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 8), \
                               _mm256_lddqu_si256(data2 + i + 8)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 9), \
                               _mm256_lddqu_si256(data2 + i + 9)); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 10), \
                               _mm256_lddqu_si256(data2 + i + 10)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 11), \
                               _mm256_lddqu_si256(data2 + i + 11)); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursA, &twos, twos, twosA, twosB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 12), \
                               _mm256_lddqu_si256(data2 + i + 12)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 13), \
                               _mm256_lddqu_si256(data2 + i + 13)); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 14), \
                               _mm256_lddqu_si256(data2 + i + 14)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 15), \
                               _mm256_lddqu_si256(data2 + i + 15)); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursB, &twos, twos, twosA, twosB); \
            CSA(&eightsB, &fours, fours, foursA, foursB); \
            CSA(&sixteens, &eights, eights, eightsA, eightsB); \
            total = _mm256_add_epi64(total, popcount256(sixteens)); \
        } \
        total = _mm256_slli_epi64(total, 4); \
        total = _mm256_add_epi64(total, \
                                 _mm256_slli_epi64(popcount256(eights), 3)); \
        total = \
            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(fours), 2)); \
        total = \
            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(twos), 1)); \
        total = _mm256_add_epi64(total, popcount256(ones)); \
        for (; i < size; i++) { \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \
                               _mm256_lddqu_si256(data2 + i)); \
            total = _mm256_add_epi64(total, popcount256(A1)); \
        } \
        return (uint64_t)(_mm256_extract_epi64(total, 0)) + \
               (uint64_t)(_mm256_extract_epi64(total, 1)) + \
               (uint64_t)(_mm256_extract_epi64(total, 2)) + \
               (uint64_t)(_mm256_extract_epi64(total, 3)); \
    } \
    static inline uint64_t avx2_harley_seal_popcount256andstore_##opname( \
        const __m256i *__restrict__ data1, const __m256i *__restrict__ data2, \
        __m256i *__restrict__ out, const uint64_t size) { \
        __m256i total = _mm256_setzero_si256(); \
        __m256i ones = _mm256_setzero_si256(); \
        __m256i twos = _mm256_setzero_si256(); \
        __m256i fours = _mm256_setzero_si256(); \
        __m256i eights = _mm256_setzero_si256(); \
        __m256i sixteens = _mm256_setzero_si256(); \
        __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; \
        __m256i A1, A2; \
        const uint64_t limit = size - size % 16; \
        uint64_t i = 0; \
        for (; i < limit; i += 16) { \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \
                               _mm256_lddqu_si256(data2 + i)); \
            _mm256_storeu_si256(out + i, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 1), \
                               _mm256_lddqu_si256(data2 + i + 1)); \
            _mm256_storeu_si256(out + i + 1, A2); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 2), \
                               _mm256_lddqu_si256(data2 + i + 2)); \
            _mm256_storeu_si256(out + i + 2, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 3), \
                               _mm256_lddqu_si256(data2 + i + 3)); \
            _mm256_storeu_si256(out + i + 3, A2); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursA, &twos, twos, twosA, twosB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 4), \
                               _mm256_lddqu_si256(data2 + i + 4)); \
            _mm256_storeu_si256(out + i + 4, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 5), \
                               _mm256_lddqu_si256(data2 + i + 5)); \
            _mm256_storeu_si256(out + i + 5, A2); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 6), \
                               _mm256_lddqu_si256(data2 + i + 6)); \
            _mm256_storeu_si256(out + i + 6, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 7), \
                               _mm256_lddqu_si256(data2 + i + 7)); \
            _mm256_storeu_si256(out + i + 7, A2); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursB, &twos, twos, twosA, twosB); \
            CSA(&eightsA, &fours, fours, foursA, foursB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 8), \
                               _mm256_lddqu_si256(data2 + i + 8)); \
            _mm256_storeu_si256(out + i + 8, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 9), \
                               _mm256_lddqu_si256(data2 + i + 9)); \
            _mm256_storeu_si256(out + i + 9, A2); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 10), \
                               _mm256_lddqu_si256(data2 + i + 10)); \
            _mm256_storeu_si256(out + i + 10, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 11), \
                               _mm256_lddqu_si256(data2 + i + 11)); \
            _mm256_storeu_si256(out + i + 11, A2); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursA, &twos, twos, twosA, twosB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 12), \
                               _mm256_lddqu_si256(data2 + i + 12)); \
            _mm256_storeu_si256(out + i + 12, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 13), \
                               _mm256_lddqu_si256(data2 + i + 13)); \
            _mm256_storeu_si256(out + i + 13, A2); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 14), \
                               _mm256_lddqu_si256(data2 + i + 14)); \
            _mm256_storeu_si256(out + i + 14, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 15), \
                               _mm256_lddqu_si256(data2 + i + 15)); \
            _mm256_storeu_si256(out + i + 15, A2); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursB, &twos, twos, twosA, twosB); \
            CSA(&eightsB, &fours, fours, foursA, foursB); \
            CSA(&sixteens, &eights, eights, eightsA, eightsB); \
            total = _mm256_add_epi64(total, popcount256(sixteens)); \
        } \
        total = _mm256_slli_epi64(total, 4); \
        total = _mm256_add_epi64(total, \
                                 _mm256_slli_epi64(popcount256(eights), 3)); \
        total = \
            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(fours), 2)); \
        total = \
            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(twos), 1)); \
        total = _mm256_add_epi64(total, popcount256(ones)); \
        for (; i < size; i++) { \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \
                               _mm256_lddqu_si256(data2 + i)); \
            _mm256_storeu_si256(out + i, A1); \
            total = _mm256_add_epi64(total, popcount256(A1)); \
        } \
        return (uint64_t)(_mm256_extract_epi64(total, 0)) + \
               (uint64_t)(_mm256_extract_epi64(total, 1)) + \
               (uint64_t)(_mm256_extract_epi64(total, 2)) + \
               (uint64_t)(_mm256_extract_epi64(total, 3)); \
    }

/*
 * Instantiations. "or"/"union" both use _mm256_or_si256 and
 * "and"/"intersection" both use _mm256_and_si256, so each pair yields two
 * differently named but otherwise identical function sets.
 *
 * NOTE(review): _mm256_andnot_si256(a, b) computes (NOT a) AND b, so the
 * "andnot" variants operate on (~data1[i]) & data2[i] -- confirm that call
 * sites pass their operands in the order they intend.
 */
CROARING_TARGET_AVX2
AVXPOPCNTFNC(or, _mm256_or_si256)
CROARING_UNTARGET_AVX2

CROARING_TARGET_AVX2
AVXPOPCNTFNC(union, _mm256_or_si256)
CROARING_UNTARGET_AVX2

CROARING_TARGET_AVX2
AVXPOPCNTFNC(and, _mm256_and_si256)
CROARING_UNTARGET_AVX2

CROARING_TARGET_AVX2
AVXPOPCNTFNC(intersection, _mm256_and_si256)
CROARING_UNTARGET_AVX2

CROARING_TARGET_AVX2
AVXPOPCNTFNC (xor, _mm256_xor_si256)
CROARING_UNTARGET_AVX2

CROARING_TARGET_AVX2
AVXPOPCNTFNC(andnot, _mm256_andnot_si256)
CROARING_UNTARGET_AVX2


/*
 * VPOPCNT_AND_ADD(ptr, i, accu): load the i-th 512-bit vector starting at
 * ptr, take the per-64-bit-lane popcount and add it to accu.
 *
 * The macro expands to three statements and declares the locals v<i> and
 * p<i>, so a given index can only be used once per scope. `ptr` is not
 * parenthesised in the expansion: the cast applies to the first operand of
 * the argument expression only, which is harmless as long as that operand is
 * already a __m512i pointer (as in avx512_vpopcount below).
 */
#define VPOPCNT_AND_ADD(ptr, i, accu) \
    const __m512i v##i = _mm512_loadu_si512((const __m512i*)ptr + i); \
    const __m512i p##i = _mm512_popcnt_epi64(v##i); \
    accu = _mm512_add_epi64(accu, p##i);

#if CROARING_COMPILER_SUPPORTS_AVX512
CROARING_TARGET_AVX512
/* Horizontal sum of the four 64-bit lanes of a 256-bit vector. */
static inline uint64_t sum_epu64_256(const __m256i v) {

    return (uint64_t)(_mm256_extract_epi64(v, 0)) +
           (uint64_t)(_mm256_extract_epi64(v, 1)) +
           (uint64_t)(_mm256_extract_epi64(v, 2)) +
           (uint64_t)(_mm256_extract_epi64(v, 3));
}


/* Horizontal sum of the eight 64-bit lanes of a 512-bit vector, computed by
 * splitting it into its two 256-bit halves. */
static inline uint64_t simd_sum_epu64(const __m512i v) {

    __m256i lo = _mm512_extracti64x4_epi64(v, 0);
    __m256i hi = _mm512_extracti64x4_epi64(v, 1);

    return sum_epu64_256(lo) + sum_epu64_256(hi);
}

/* Population count over `size` 512-bit vectors starting at `data`, using the
 * _mm512_popcnt_epi64 intrinsic. The main loop is unrolled four vectors at a
 * time; the remaining size % 4 vectors are handled by the second loop. */
static inline uint64_t avx512_vpopcount(const __m512i* data, const uint64_t size)
{
    const uint64_t limit = size - size % 4;
    __m512i total = _mm512_setzero_si512();
    uint64_t i = 0;

    for (; i < limit; i += 4)
    {
        VPOPCNT_AND_ADD(data + i, 0, total);
        VPOPCNT_AND_ADD(data + i, 1, total);
        VPOPCNT_AND_ADD(data + i, 2, total);
        VPOPCNT_AND_ADD(data + i, 3, total);
    }

    for (; i < size; i++)
    {
        total = _mm512_add_epi64(total, _mm512_popcnt_epi64(_mm512_loadu_si512(data + i)));
    }

    return simd_sum_epu64(total);
}
CROARING_UNTARGET_AVX512
#endif

/*
 * AVXPOPCNTFNC512(opname, avx_intrinsic) generates the 512-bit counterparts
 * of the AVXPOPCNTFNC functions:
 *
 *   avx512_harley_seal_popcount512_<opname>(data1, data2, size)
 *   avx512_harley_seal_popcount512andstore_<opname>(data1, data2, out, size)
 *
 * Despite the "harley_seal" in the names, these bodies contain no CSA tree:
 * each combined vector is counted directly with _mm512_popcnt_epi64, four
 * vectors per main-loop iteration, and the lanes are summed at the end with
 * simd_sum_epu64. `size` is a count of 512-bit vectors.
 *
 * The macro is defined unconditionally but only instantiated when
 * CROARING_COMPILER_SUPPORTS_AVX512 is true (simd_sum_epu64 only exists in
 * that case).
 */
#define AVXPOPCNTFNC512(opname, avx_intrinsic) \
    static inline uint64_t avx512_harley_seal_popcount512_##opname( \
        const __m512i *data1, const __m512i *data2, const uint64_t size) { \
        __m512i total = _mm512_setzero_si512(); \
        const uint64_t limit = size - size % 4; \
        uint64_t i = 0; \
        for (; i < limit; i += 4) { \
            __m512i a1 = avx_intrinsic(_mm512_loadu_si512(data1 + i), \
                                       _mm512_loadu_si512(data2 + i)); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a1)); \
            __m512i a2 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 1), \
                                       _mm512_loadu_si512(data2 + i + 1)); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a2)); \
            __m512i a3 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 2), \
                                       _mm512_loadu_si512(data2 + i + 2)); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a3)); \
            __m512i a4 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 3), \
                                       _mm512_loadu_si512(data2 + i + 3)); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a4)); \
        } \
        for(; i < size; i++) { \
            __m512i a = avx_intrinsic(_mm512_loadu_si512(data1 + i), \
                                      _mm512_loadu_si512(data2 + i)); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a)); \
        } \
        return simd_sum_epu64(total); \
    } \
    static inline uint64_t avx512_harley_seal_popcount512andstore_##opname( \
        const __m512i *__restrict__ data1, const __m512i *__restrict__ data2, \
        __m512i *__restrict__ out, const uint64_t size) { \
        __m512i total = _mm512_setzero_si512(); \
        const uint64_t limit = size - size % 4; \
        uint64_t i = 0; \
        for (; i < limit; i += 4) { \
            __m512i a1 = avx_intrinsic(_mm512_loadu_si512(data1 + i), \
                                       _mm512_loadu_si512(data2 + i)); \
            _mm512_storeu_si512(out + i, a1); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a1)); \
            __m512i a2 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 1), \
                                       _mm512_loadu_si512(data2 + i + 1)); \
            _mm512_storeu_si512(out + i + 1, a2); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a2)); \
            __m512i a3 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 2), \
                                       _mm512_loadu_si512(data2 + i + 2)); \
            _mm512_storeu_si512(out + i + 2, a3); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a3)); \
            __m512i a4 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 3), \
                                       _mm512_loadu_si512(data2 + i + 3)); \
            _mm512_storeu_si512(out + i + 3, a4); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a4)); \
        } \
        for(; i < size; i++) { \
            __m512i a = avx_intrinsic(_mm512_loadu_si512(data1 + i), \
                                      _mm512_loadu_si512(data2 + i)); \
            _mm512_storeu_si512(out + i, a); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a)); \
        } \
        return simd_sum_epu64(total); \
    } \

/* NOTE(review): as with the AVX2 variant, _mm512_andnot_si512(a, b) computes
 * (NOT a) AND b -- confirm the operand order used by callers of the "andnot"
 * functions. */
#if CROARING_COMPILER_SUPPORTS_AVX512
CROARING_TARGET_AVX512
AVXPOPCNTFNC512(or, _mm512_or_si512)
AVXPOPCNTFNC512(union, _mm512_or_si512)
AVXPOPCNTFNC512(and, _mm512_and_si512)
AVXPOPCNTFNC512(intersection, _mm512_and_si512)
AVXPOPCNTFNC512(xor, _mm512_xor_si512)
AVXPOPCNTFNC512(andnot, _mm512_andnot_si512)
CROARING_UNTARGET_AVX512
#endif
/***
 * END Harley-Seal popcount functions.
 */

#endif // CROARING_IS_X64

#ifdef __cplusplus
} } } // extern "C" { namespace roaring { namespace internal
#endif

#endif
/* end file include/roaring/bitset_util.h */
/* begin file include/roaring/containers/array.h */
/*
 * array.h
 *
 * Declarations for the array container: a container that stores its values
 * as a sorted list of 16-bit integers (see the struct comment below).
 */

#ifndef INCLUDE_CONTAINERS_ARRAY_H_
#define INCLUDE_CONTAINERS_ARRAY_H_

/* NOTE(review): the include target is missing in this copy (it looks like an
 * angle-bracket header name was lost during extraction) -- restore it from
 * the upstream file. */
#include



#ifdef __cplusplus
extern "C" { namespace roaring {

// Note: in pure C++ code, you should avoid putting `using` in header files
using api::roaring_iterator;
using api::roaring_iterator64;

namespace internal {
#endif

/* Containers with DEFAULT_MAX_SIZE or less integers should be arrays */
enum { DEFAULT_MAX_SIZE = 4096 };

/* struct array_container - sparse representation of a bitmap
 *
 * @cardinality: number of indices in `array` (and the bitmap)
 * @capacity:    allocated size of `array`
 * @array:       sorted list of integers
 *
 * Both counters are signed 32-bit; the stored values are 16-bit.
 * NOTE(review): `capacity` is presumably counted in elements rather than
 * bytes (array_container_full compares it directly with `cardinality`) --
 * confirm.
 */
STRUCT_CONTAINER(array_container_s) {
    int32_t cardinality;
    int32_t capacity;
    uint16_t *array;
};

typedef struct array_container_s array_container_t;

/* Cast helpers that forward to the CAST / movable_CAST macros (defined
 * elsewhere) with the array container type filled in. */
#define CAST_array(c)         CAST(array_container_t *, c)  // safer downcast
#define const_CAST_array(c)   CAST(const array_container_t *, c)
#define movable_CAST_array(c) movable_CAST(array_container_t **, c)

/* Create a new array with the default settings. Return NULL in case of
 * failure. See also array_container_create_given_capacity. */
array_container_t *array_container_create(void);

/* Create a new array with a specified capacity size. Return NULL in case of
 * failure. */
array_container_t *array_container_create_given_capacity(int32_t size);

/* Create a new array containing all values in [min,max). */
array_container_t * array_container_create_range(uint32_t min, uint32_t max);

/*
 * Shrink the capacity to the actual size, return the number of bytes saved.
 */
int array_container_shrink_to_fit(array_container_t *src);

/* Free memory owned by `array'.
*/
void array_container_free(array_container_t *array);

/* Duplicate container */
array_container_t *array_container_clone(const array_container_t *src);

/* Get the cardinality of `array'. Simply returns the stored `cardinality`
 * field (as int). */
ALLOW_UNALIGNED
static inline int array_container_cardinality(const array_container_t *array) {
    return array->cardinality;
}

/* Return true when the stored cardinality is strictly positive. */
static inline bool array_container_nonzero_cardinality(
    const array_container_t *array) {
    return array->cardinality > 0;
}

/* Copy one container into another. We assume that they are distinct. */
void array_container_copy(const array_container_t *src, array_container_t *dst);

/*  Add all the values in [min,max) at a distance k*step from min.
    The container must have a size less or equal to DEFAULT_MAX_SIZE after this
   addition.
   NOTE(review): the original wording said "[min,max) (included)", which
   contradicts the half-open notation -- confirm whether max itself can be
   added. */
void array_container_add_from_range(array_container_t *arr, uint32_t min,
                                    uint32_t max, uint16_t step);


/* Return true when the stored cardinality is exactly zero. */
static inline bool array_container_empty(const array_container_t *array) {
    return array->cardinality == 0;
}

/* check whether the cardinality is equal to the capacity (this does not mean
* that it contains 1<<16 elements) */
static inline bool array_container_full(const array_container_t *array) {
    return array->cardinality == array->capacity;
}


/* Compute the union of `src_1' and `src_2' and write the result to `dst'
 * It is assumed that `dst' is distinct from both `src_1' and `src_2'. */
void array_container_union(const array_container_t *src_1,
                           const array_container_t *src_2,
                           array_container_t *dst);

/* symmetric difference, see array_container_union */
void array_container_xor(const array_container_t *array_1,
                         const array_container_t *array_2,
                         array_container_t *out);

/* Computes the intersection of src_1 and src_2 and write the result to
 * dst. It is assumed that dst is distinct from both src_1 and src_2.
*/ +void array_container_intersection(const array_container_t *src_1, + const array_container_t *src_2, + array_container_t *dst); + +/* Check whether src_1 and src_2 intersect. */ +bool array_container_intersect(const array_container_t *src_1, + const array_container_t *src_2); + + +/* computers the size of the intersection between two arrays. + */ +int array_container_intersection_cardinality(const array_container_t *src_1, + const array_container_t *src_2); + +/* computes the intersection of array1 and array2 and write the result to + * array1. + * */ +void array_container_intersection_inplace(array_container_t *src_1, + const array_container_t *src_2); + +/* + * Write out the 16-bit integers contained in this container as a list of 32-bit + * integers using base + * as the starting value (it might be expected that base has zeros in its 16 + * least significant bits). + * The function returns the number of values written. + * The caller is responsible for allocating enough memory in out. + */ +int array_container_to_uint32_array(void *vout, const array_container_t *cont, + uint32_t base); + +/* Compute the number of runs */ +int32_t array_container_number_of_runs(const array_container_t *ac); + +/* + * Print this container using printf (useful for debugging). + */ +void array_container_printf(const array_container_t *v); + +/* + * Print this container using printf as a comma-separated list of 32-bit + * integers starting at base. + */ +void array_container_printf_as_uint32_array(const array_container_t *v, + uint32_t base); + +/** + * Return the serialized size in bytes of a container having cardinality "card". + */ +static inline int32_t array_container_serialized_size_in_bytes(int32_t card) { + return card * 2 + 2; +} + +/** + * Increase capacity to at least min. + * Whether the existing data needs to be copied over depends on the "preserve" + * parameter. 
If preserve is false, then the new content will be uninitialized, + * otherwise the old content is copied. + */ +void array_container_grow(array_container_t *container, int32_t min, + bool preserve); + +bool array_container_iterate(const array_container_t *cont, uint32_t base, + roaring_iterator iterator, void *ptr); +bool array_container_iterate64(const array_container_t *cont, uint32_t base, + roaring_iterator64 iterator, uint64_t high_bits, + void *ptr); + +/** + * Writes the underlying array to buf, outputs how many bytes were written. + * This is meant to be byte-by-byte compatible with the Java and Go versions of + * Roaring. + * The number of bytes written should be + * array_container_size_in_bytes(container). + * + */ +int32_t array_container_write(const array_container_t *container, char *buf); +/** + * Reads the instance from buf, outputs how many bytes were read. + * This is meant to be byte-by-byte compatible with the Java and Go versions of + * Roaring. + * The number of bytes read should be array_container_size_in_bytes(container). + * You need to provide the (known) cardinality. + */ +int32_t array_container_read(int32_t cardinality, array_container_t *container, + const char *buf); + +/** + * Return the serialized size in bytes of a container (see + * array_container_write): the cardinality times sizeof(uint16_t). + * This is meant to be compatible with the Java and Go versions of Roaring and + * assumes + * that the cardinality of the container is already known. + * + */ +static inline int32_t array_container_size_in_bytes( + const array_container_t *container) { + return container->cardinality * sizeof(uint16_t); +} + +/** + * Return true if the two arrays have the same content. 
+ */ +ALLOW_UNALIGNED +static inline bool array_container_equals( + const array_container_t *container1, + const array_container_t *container2) { + + if (container1->cardinality != container2->cardinality) { + return false; + } + return memequals(container1->array, container2->array, container1->cardinality*2); +} + +/** + * Return true if container1 is a subset of container2. + */ +bool array_container_is_subset(const array_container_t *container1, + const array_container_t *container2); + +/** + * If the element of given rank is in this container, supposing that the first + * element has rank start_rank, then the function returns true and sets element + * accordingly. + * Otherwise, it returns false and advances start_rank by the cardinality of + * this container. + */ +static inline bool array_container_select(const array_container_t *container, + uint32_t *start_rank, uint32_t rank, + uint32_t *element) { + int card = array_container_cardinality(container); + if (*start_rank + card <= rank) { + *start_rank += card; + return false; + } else { + *element = container->array[rank - *start_rank]; + return true; + } +} + +/* Computes the difference of array_1 and array_2 and writes the result + * to out. + * out does not need to be distinct from array_1 + */ +void array_container_andnot(const array_container_t *array_1, + const array_container_t *array_2, + array_container_t *out); + +/* Append pos to the set. Assumes that the value is larger than any preceding + * values. The array is grown by one slot when it is full. */ +static inline void array_container_append(array_container_t *arr, + uint16_t pos) { + const int32_t capacity = arr->capacity; + + if (array_container_full(arr)) { + array_container_grow(arr, capacity + 1, true); + } + + arr->array[arr->cardinality++] = pos; +} + +/** + * Add value to the set if final cardinality doesn't exceed max_cardinality. 
+ * Return code: + * 1 -- value was added + * 0 -- value was already present + * -1 -- value was not added because cardinality would exceed max_cardinality + */ +static inline int array_container_try_add(array_container_t *arr, uint16_t value, + int32_t max_cardinality) { + const int32_t cardinality = arr->cardinality; + + // best case, we can append. + if ((array_container_empty(arr) || arr->array[cardinality - 1] < value) && + cardinality < max_cardinality) { + array_container_append(arr, value); + return 1; + } + + const int32_t loc = binarySearch(arr->array, cardinality, value); + + if (loc >= 0) { + return 0; + } else if (cardinality < max_cardinality) { + if (array_container_full(arr)) { + array_container_grow(arr, arr->capacity + 1, true); + } + const int32_t insert_idx = -loc - 1; + memmove(arr->array + insert_idx + 1, arr->array + insert_idx, + (cardinality - insert_idx) * sizeof(uint16_t)); + arr->array[insert_idx] = value; + arr->cardinality++; + return 1; + } else { + return -1; + } +} + +/* Add value to the set. Returns true if value was not already present. */ +static inline bool array_container_add(array_container_t *arr, uint16_t value) { + return array_container_try_add(arr, value, INT32_MAX) == 1; +} + +/* Remove pos from the set. Returns true if pos was present. */ +static inline bool array_container_remove(array_container_t *arr, + uint16_t pos) { + const int32_t idx = binarySearch(arr->array, arr->cardinality, pos); + const bool is_present = idx >= 0; + if (is_present) { + memmove(arr->array + idx, arr->array + idx + 1, + (arr->cardinality - idx - 1) * sizeof(uint16_t)); + arr->cardinality--; + } + + return is_present; +} + +/* Check whether pos is present. 
*/ +inline bool array_container_contains(const array_container_t *arr, + uint16_t pos) { + // return binarySearch(arr->array, arr->cardinality, pos) >= 0; + // binary search with fallback to linear search for short ranges + int32_t low = 0; + const uint16_t * carr = (const uint16_t *) arr->array; + int32_t high = arr->cardinality - 1; + // while (high - low >= 0) { + while(high >= low + 16) { + int32_t middleIndex = (low + high)>>1; + uint16_t middleValue = carr[middleIndex]; + if (middleValue < pos) { + low = middleIndex + 1; + } else if (middleValue > pos) { + high = middleIndex - 1; + } else { + return true; + } + } + + for (int i=low; i <= high; i++) { + uint16_t v = carr[i]; + if (v == pos) { + return true; + } + if ( v > pos ) return false; + } + return false; + +} + +void array_container_offset(const array_container_t *c, + container_t **loc, container_t **hic, + uint16_t offset); + +/* Check whether a range of values from range_start (included) to range_end (excluded) is present. 
*/ +static inline bool array_container_contains_range(const array_container_t *arr, + uint32_t range_start, uint32_t range_end) { + const int32_t range_count = range_end - range_start; + const uint16_t rs_included = range_start; + const uint16_t re_included = range_end - 1; + + // Empty range is always included + if (range_count <= 0) { + return true; + } + if (range_count > arr->cardinality) { + return false; + } + + const int32_t start = binarySearch(arr->array, arr->cardinality, rs_included); + // If this sorted array contains all items in the range: + // * the start item must be found + // * the last item in range range_count must exist, and be the expected end value + return (start >= 0) && (arr->cardinality >= start + range_count) && + (arr->array[start + range_count - 1] == re_included); +} + +/* Returns the smallest value, or 0 if the container is empty */ +inline uint16_t array_container_minimum(const array_container_t *arr) { + if (arr->cardinality == 0) return 0; + return arr->array[0]; +} + +/* Returns the largest value, or 0 if the container is empty */ +inline uint16_t array_container_maximum(const array_container_t *arr) { + if (arr->cardinality == 0) return 0; + return arr->array[arr->cardinality - 1]; +} + +/* Returns the number of values equal or smaller than x */ +inline int array_container_rank(const array_container_t *arr, uint16_t x) { + const int32_t idx = binarySearch(arr->array, arr->cardinality, x); + const bool is_present = idx >= 0; + if (is_present) { + return idx + 1; + } else { + return -idx - 1; + } +} + +/* Returns the index of x, or -1 if x does not exist */ +inline int array_container_get_index(const array_container_t *arr, uint16_t x) { + const int32_t idx = binarySearch(arr->array, arr->cardinality, x); + const bool is_present = idx >= 0; + if (is_present) { + return idx; + } else { + return -1; + } +} + +/* Returns the index of the first value equal or larger than x, or -1 */ +inline int array_container_index_equalorlarger(const 
array_container_t *arr, 
uint16_t x) { + const int32_t idx = binarySearch(arr->array, arr->cardinality, x); + const bool is_present = idx >= 0; + if (is_present) { + return idx; + } else { + int32_t candidate = - idx - 1; + if(candidate < arr->cardinality) return candidate; + return -1; + } +} + +/* + * Adds all values in range [min,max] using hint: + * nvals_less is the number of array values less than $min + * nvals_greater is the number of array values greater than $max + * The array is grown if needed and the resulting cardinality is + * nvals_less + (max - min + 1) + nvals_greater. + */ +static inline void array_container_add_range_nvals(array_container_t *array, + uint32_t min, uint32_t max, + int32_t nvals_less, + int32_t nvals_greater) { + int32_t union_cardinality = nvals_less + (max - min + 1) + nvals_greater; + if (union_cardinality > array->capacity) { + array_container_grow(array, union_cardinality, true); + } + memmove(&(array->array[union_cardinality - nvals_greater]), + &(array->array[array->cardinality - nvals_greater]), + nvals_greater * sizeof(uint16_t)); + for (uint32_t i = 0; i <= max - min; i++) { + array->array[nvals_less + i] = min + i; + } + array->cardinality = union_cardinality; +} + +/** + * Adds all values in range [min,max]. This function is currently unused + * and left as documentation. + */ +/*static inline void array_container_add_range(array_container_t *array, + uint32_t min, uint32_t max) { + int32_t nvals_greater = count_greater(array->array, array->cardinality, max); + int32_t nvals_less = count_less(array->array, array->cardinality - nvals_greater, min); + array_container_add_range_nvals(array, min, max, nvals_less, nvals_greater); +}*/ + +/* + * Removes all elements array[pos] .. 
array[pos+count-1] + */ +static inline void array_container_remove_range(array_container_t *array, + uint32_t pos, uint32_t count) { + if (count != 0) { + memmove(&(array->array[pos]), &(array->array[pos+count]), + (array->cardinality - pos - count) * sizeof(uint16_t)); + array->cardinality -= count; + } +} + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif + +#endif /* INCLUDE_CONTAINERS_ARRAY_H_ */ +/* end file include/roaring/containers/array.h */ +/* begin file include/roaring/containers/bitset.h */ +/* + * bitset.h + * + */ + +#ifndef INCLUDE_CONTAINERS_BITSET_H_ +#define INCLUDE_CONTAINERS_BITSET_H_ + +#include +#include + + + +#ifdef __cplusplus +extern "C" { namespace roaring { + +// Note: in pure C++ code, you should avoid putting `using` in header files +using api::roaring_iterator; +using api::roaring_iterator64; + +namespace internal { +#endif + + + +enum { + BITSET_CONTAINER_SIZE_IN_WORDS = (1 << 16) / 64, + BITSET_UNKNOWN_CARDINALITY = -1 +}; + +STRUCT_CONTAINER(bitset_container_s) { + int32_t cardinality; + uint64_t *words; +}; + +typedef struct bitset_container_s bitset_container_t; + +#define CAST_bitset(c) CAST(bitset_container_t *, c) // safer downcast +#define const_CAST_bitset(c) CAST(const bitset_container_t *, c) +#define movable_CAST_bitset(c) movable_CAST(bitset_container_t **, c) + +/* Create a new bitset. Return NULL in case of failure. */ +bitset_container_t *bitset_container_create(void); + +/* Free memory. */ +void bitset_container_free(bitset_container_t *bitset); + +/* Clear bitset (sets bits to 0). */ +void bitset_container_clear(bitset_container_t *bitset); + +/* Set all bits to 1. */ +void bitset_container_set_all(bitset_container_t *bitset); + +/* Duplicate bitset */ +bitset_container_t *bitset_container_clone(const bitset_container_t *src); + +/* Set the bits in [begin,end). WARNING: as of April 2016, this method is slow + * and + * should not be used in performance-sensitive code. Ever. 
*/ +void bitset_container_set_range(bitset_container_t *bitset, uint32_t begin, + uint32_t end); + +#if defined(CROARING_ASMBITMANIPOPTIMIZATION) && defined(__AVX2__) +/* Set the bit at position `pos' (variant built on the ASM_* macros). */ +static inline void bitset_container_set(bitset_container_t *bitset, + uint16_t pos) { + uint64_t shift = 6; + uint64_t offset; + uint64_t p = pos; + ASM_SHIFT_RIGHT(p, shift, offset); + uint64_t load = bitset->words[offset]; + ASM_SET_BIT_INC_WAS_CLEAR(load, p, bitset->cardinality); + bitset->words[offset] = load; +} + +/* Unset the bit at position `pos'. Currently unused. Could be used for optimization. */ +/*static inline void bitset_container_unset(bitset_container_t *bitset, + uint16_t pos) { + uint64_t shift = 6; + uint64_t offset; + uint64_t p = pos; + ASM_SHIFT_RIGHT(p, shift, offset); + uint64_t load = bitset->words[offset]; + ASM_CLEAR_BIT_DEC_WAS_SET(load, p, bitset->cardinality); + bitset->words[offset] = load; +}*/ + +/* Add `pos' to `bitset'. Returns true if `pos' was not present. Might be slower + * than bitset_container_set. The result is derived from the change in + * cardinality. */ +static inline bool bitset_container_add(bitset_container_t *bitset, + uint16_t pos) { + uint64_t shift = 6; + uint64_t offset; + uint64_t p = pos; + ASM_SHIFT_RIGHT(p, shift, offset); + uint64_t load = bitset->words[offset]; + // could be possibly slightly further optimized + const int32_t oldcard = bitset->cardinality; + ASM_SET_BIT_INC_WAS_CLEAR(load, p, bitset->cardinality); + bitset->words[offset] = load; + return bitset->cardinality - oldcard; +} + +/* Remove `pos' from `bitset'. Returns true if `pos' was present. Might be + * slower than bitset_container_unset. 
*/ +static inline bool bitset_container_remove(bitset_container_t *bitset, + uint16_t pos) { + uint64_t shift = 6; + uint64_t offset; + uint64_t p = pos; + ASM_SHIFT_RIGHT(p, shift, offset); + uint64_t load = bitset->words[offset]; + // could be possibly slightly further optimized + const int32_t oldcard = bitset->cardinality; + ASM_CLEAR_BIT_DEC_WAS_SET(load, p, bitset->cardinality); + bitset->words[offset] = load; + return oldcard - bitset->cardinality; +} + +/* Get the value of the bit at position `pos'. */ +inline bool bitset_container_get(const bitset_container_t *bitset, + uint16_t pos) { + uint64_t word = bitset->words[pos >> 6]; + const uint64_t p = pos; + ASM_INPLACESHIFT_RIGHT(word, p); + return word & 1; +} + +#else + +/* Set the bit at position `pos'. The cardinality is incremented only when the + * bit was previously clear. */ +static inline void bitset_container_set(bitset_container_t *bitset, + uint16_t pos) { + const uint64_t old_word = bitset->words[pos >> 6]; + const int index = pos & 63; + const uint64_t new_word = old_word | (UINT64_C(1) << index); + bitset->cardinality += (uint32_t)((old_word ^ new_word) >> index); + bitset->words[pos >> 6] = new_word; +} + +/* Unset the bit at position `pos'. Currently unused. */ +/*static inline void bitset_container_unset(bitset_container_t *bitset, + uint16_t pos) { + const uint64_t old_word = bitset->words[pos >> 6]; + const int index = pos & 63; + const uint64_t new_word = old_word & (~(UINT64_C(1) << index)); + bitset->cardinality -= (uint32_t)((old_word ^ new_word) >> index); + bitset->words[pos >> 6] = new_word; +}*/ + +/* Add `pos' to `bitset'. Returns true if `pos' was not present. Might be slower + * than bitset_container_set. 
*/ +static inline bool bitset_container_add(bitset_container_t *bitset, + uint16_t pos) { + const uint64_t old_word = bitset->words[pos >> 6]; + const int index = pos & 63; + const uint64_t new_word = old_word | (UINT64_C(1) << index); + const uint64_t increment = (old_word ^ new_word) >> index; + bitset->cardinality += (uint32_t)increment; + bitset->words[pos >> 6] = new_word; + return increment > 0; +} + +/* Remove `pos' from `bitset'. Returns true if `pos' was present. Might be + * slower than bitset_container_unset. */ +static inline bool bitset_container_remove(bitset_container_t *bitset, + uint16_t pos) { + const uint64_t old_word = bitset->words[pos >> 6]; + const int index = pos & 63; + const uint64_t new_word = old_word & (~(UINT64_C(1) << index)); + const uint64_t increment = (old_word ^ new_word) >> index; + bitset->cardinality -= (uint32_t)increment; + bitset->words[pos >> 6] = new_word; + return increment > 0; +} + +/* Get the value of the bit at position `pos'. */ +inline bool bitset_container_get(const bitset_container_t *bitset, + uint16_t pos) { + const uint64_t word = bitset->words[pos >> 6]; + return (word >> (pos & 63)) & 1; +} + +#endif + +/* +* Check if all bits are set in a range of positions from pos_start (included) to +* pos_end (excluded). 
+*/ +static inline bool bitset_container_get_range(const bitset_container_t *bitset, + uint32_t pos_start, uint32_t pos_end) { + + const uint32_t start = pos_start >> 6; + const uint32_t end = pos_end >> 6; + + const uint64_t first = ~((1ULL << (pos_start & 0x3F)) - 1); + const uint64_t last = (1ULL << (pos_end & 0x3F)) - 1; + + if (start == end) return ((bitset->words[end] & first & last) == (first & last)); + if ((bitset->words[start] & first) != first) return false; + + if ((end < BITSET_CONTAINER_SIZE_IN_WORDS) && ((bitset->words[end] & last) != last)){ + + return false; + } + + for (uint16_t i = start + 1; (i < BITSET_CONTAINER_SIZE_IN_WORDS) && (i < end); ++i){ + + if (bitset->words[i] != UINT64_C(0xFFFFFFFFFFFFFFFF)) return false; + } + + return true; +} + +/* Check whether `pos' is present in `bitset'. Calls bitset_container_get. */ +inline bool bitset_container_contains(const bitset_container_t *bitset, + uint16_t pos) { + return bitset_container_get(bitset, pos); +} + +/* +* Check whether a range of bits from position `pos_start' (included) to `pos_end' (excluded) +* is present in `bitset'. Calls bitset_container_get_range. +*/ +static inline bool bitset_container_contains_range(const bitset_container_t *bitset, + uint32_t pos_start, uint32_t pos_end) { + return bitset_container_get_range(bitset, pos_start, pos_end); +} + +/* Get the number of bits set (returns the stored cardinality field) */ +ALLOW_UNALIGNED +static inline int bitset_container_cardinality( + const bitset_container_t *bitset) { + return bitset->cardinality; +} + + + + +/* Copy one container into another. We assume that they are distinct. */ +void bitset_container_copy(const bitset_container_t *source, + bitset_container_t *dest); + +/* Add all the values [min,max) at a distance k*step from min: min, + * min+step,.... */ +void bitset_container_add_from_range(bitset_container_t *bitset, uint32_t min, + uint32_t max, uint16_t step); + +/* Get the number of bits set (force computation). 
This does not modify bitset. 
+ * To update the cardinality, you should do + * bitset->cardinality = bitset_container_compute_cardinality(bitset).*/ +int bitset_container_compute_cardinality(const bitset_container_t *bitset); + +/* Check whether this bitset is empty, + * it never modifies the bitset struct. + * When the stored cardinality is BITSET_UNKNOWN_CARDINALITY, every word is + * scanned instead of trusting the cardinality field. */ +static inline bool bitset_container_empty( + const bitset_container_t *bitset) { + if (bitset->cardinality == BITSET_UNKNOWN_CARDINALITY) { + for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i ++) { + if((bitset->words[i]) != 0) return false; + } + return true; + } + return bitset->cardinality == 0; +} + + +/* Get whether there is at least one bit set (see bitset_container_empty for the reverse), + the bitset is never modified */ +static inline bool bitset_container_const_nonzero_cardinality( + const bitset_container_t *bitset) { + return !bitset_container_empty(bitset); +} + +/* + * Check whether the two bitsets intersect + */ +bool bitset_container_intersect(const bitset_container_t *src_1, + const bitset_container_t *src_2); + +/* Computes the union of bitsets `src_1' and `src_2' into `dst' and returns the + * cardinality. */ +int bitset_container_or(const bitset_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +/* Computes the union of bitsets `src_1' and `src_2' and returns the cardinality. + */ +int bitset_container_or_justcard(const bitset_container_t *src_1, + const bitset_container_t *src_2); + +/* Computes the union of bitsets `src_1' and `src_2' into `dst' and returns the + * cardinality. Same as bitset_container_or. */ +int bitset_container_union(const bitset_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +/* Computes the union of bitsets `src_1' and `src_2' and return the + * cardinality. Same as bitset_container_or_justcard. 
*/ +int bitset_container_union_justcard(const bitset_container_t *src_1, + const bitset_container_t *src_2); + +/* Computes the union of bitsets `src_1' and `src_2' into `dst', but does + * not update the cardinality. Provided to optimize chained operations. */ +int bitset_container_union_nocard(const bitset_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +/* Computes the union of bitsets `src_1' and `src_2' into `dst', but does not + * update the cardinality. Provided to optimize chained operations. */ +int bitset_container_or_nocard(const bitset_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +/* Computes the intersection of bitsets `src_1' and `src_2' into `dst' and + * return the cardinality. */ +int bitset_container_and(const bitset_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +/* Computes the intersection of bitsets `src_1' and `src_2' and return the + * cardinality. */ +int bitset_container_and_justcard(const bitset_container_t *src_1, + const bitset_container_t *src_2); + +/* Computes the intersection of bitsets `src_1' and `src_2' into `dst' and + * return the cardinality. Same as bitset_container_and. */ +int bitset_container_intersection(const bitset_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +/* Computes the intersection of bitsets `src_1' and `src_2' and return the + * cardinality. Same as bitset_container_and_justcard. */ +int bitset_container_intersection_justcard(const bitset_container_t *src_1, + const bitset_container_t *src_2); + +/* Computes the intersection of bitsets `src_1' and `src_2' into `dst', but does + * not update the cardinality. Provided to optimize chained operations. 
*/ +int bitset_container_intersection_nocard(const bitset_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +/* Computes the intersection of bitsets `src_1' and `src_2' into `dst', but does + * not update the cardinality. Provided to optimize chained operations. */ +int bitset_container_and_nocard(const bitset_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +/* Computes the exclusive or of bitsets `src_1' and `src_2' into `dst' and + * return the cardinality. */ +int bitset_container_xor(const bitset_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +/* Computes the exclusive or of bitsets `src_1' and `src_2' and return the + * cardinality. */ +int bitset_container_xor_justcard(const bitset_container_t *src_1, + const bitset_container_t *src_2); + +/* Computes the exclusive or of bitsets `src_1' and `src_2' into `dst', but does + * not update the cardinality. Provided to optimize chained operations. */ +int bitset_container_xor_nocard(const bitset_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +/* Computes the and not of bitsets `src_1' and `src_2' into `dst' and return the + * cardinality. */ +int bitset_container_andnot(const bitset_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +/* Computes the and not of bitsets `src_1' and `src_2' and return the + * cardinality. */ +int bitset_container_andnot_justcard(const bitset_container_t *src_1, + const bitset_container_t *src_2); + +/* Computes the and not of bitsets `src_1' and `src_2' into `dst', but does + * not update the cardinality. Provided to optimize chained operations. 
*/ +int bitset_container_andnot_nocard(const bitset_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +void bitset_container_offset(const bitset_container_t *c, + container_t **loc, container_t **hic, + uint16_t offset); +/* + * Write out the 16-bit integers contained in this container as a list of 32-bit + * integers using base + * as the starting value (it might be expected that base has zeros in its 16 + * least significant bits). + * The function returns the number of values written. + * The caller is responsible for allocating enough memory in out. + * The out pointer should point to enough memory (the cardinality times 32 + * bits). + */ +int bitset_container_to_uint32_array(uint32_t *out, + const bitset_container_t *bc, + uint32_t base); + +/* + * Print this container using printf (useful for debugging). + */ +void bitset_container_printf(const bitset_container_t *v); + +/* + * Print this container using printf as a comma-separated list of 32-bit + * integers starting at base. + */ +void bitset_container_printf_as_uint32_array(const bitset_container_t *v, + uint32_t base); + +/** + * Return the serialized size in bytes of a container. The value is the + * constant BITSET_CONTAINER_SIZE_IN_WORDS * 8, independent of the content. + */ +static inline int32_t bitset_container_serialized_size_in_bytes(void) { + return BITSET_CONTAINER_SIZE_IN_WORDS * 8; +} + +/** + * Return the number of runs. + */ +int bitset_container_number_of_runs(bitset_container_t *bc); + +bool bitset_container_iterate(const bitset_container_t *cont, uint32_t base, + roaring_iterator iterator, void *ptr); +bool bitset_container_iterate64(const bitset_container_t *cont, uint32_t base, + roaring_iterator64 iterator, uint64_t high_bits, + void *ptr); + +/** + * Writes the underlying array to buf, outputs how many bytes were written. + * This is meant to be byte-by-byte compatible with the Java and Go versions of + * Roaring. 
+ * The number of bytes written should be + * bitset_container_size_in_bytes(container). 
+ */ +int32_t bitset_container_write(const bitset_container_t *container, char *buf); + +/** + * Reads the instance from buf, outputs how many bytes were read. + * This is meant to be byte-by-byte compatible with the Java and Go versions of + * Roaring. + * The number of bytes read should be bitset_container_size_in_bytes(container). + * You need to provide the (known) cardinality. + */ +int32_t bitset_container_read(int32_t cardinality, + bitset_container_t *container, const char *buf); +/** + * Return the serialized size in bytes of a container (see + * bitset_container_write). + * This is meant to be compatible with the Java and Go versions of Roaring and + * assumes + * that the cardinality of the container is already known or can be computed. + * The size is a constant; the container argument is not inspected. + */ +static inline int32_t bitset_container_size_in_bytes( + const bitset_container_t *container) { + (void)container; + return BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t); +} + +/** + * Return true if the two containers have the same content. + */ +bool bitset_container_equals(const bitset_container_t *container1, + const bitset_container_t *container2); + +/** +* Return true if container1 is a subset of container2. +*/ +bool bitset_container_is_subset(const bitset_container_t *container1, + const bitset_container_t *container2); + +/** + * If the element of given rank is in this container, supposing that the first + * element has rank start_rank, then the function returns true and sets element + * accordingly. + * Otherwise, it returns false and updates start_rank. 
+ */ +bool bitset_container_select(const bitset_container_t *container, + uint32_t *start_rank, uint32_t rank, + uint32_t *element); + +/* Returns the smallest value (assumes not empty) */ +uint16_t bitset_container_minimum(const bitset_container_t *container); + +/* Returns the largest value (assumes not empty) */ +uint16_t bitset_container_maximum(const bitset_container_t *container); + +/* Returns the number of values equal or smaller than x */ +int bitset_container_rank(const bitset_container_t *container, uint16_t x); + +/* Returns the index of x, or -1 if x does not exist */ +int bitset_container_get_index(const bitset_container_t *container, uint16_t x); + +/* Returns the index of the first value equal or larger than x, or -1 */ +int bitset_container_index_equalorlarger(const bitset_container_t *container, uint16_t x); + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif + +#endif /* INCLUDE_CONTAINERS_BITSET_H_ */ +/* end file include/roaring/containers/bitset.h */ +/* begin file include/roaring/containers/run.h */ +/* + * run.h + * + */ + +#ifndef INCLUDE_CONTAINERS_RUN_H_ +#define INCLUDE_CONTAINERS_RUN_H_ + +#include +#include +#include +#include + + + +#ifdef __cplusplus +extern "C" { namespace roaring { + +// Note: in pure C++ code, you should avoid putting `using` in header files +using api::roaring_iterator; +using api::roaring_iterator64; + +namespace internal { +#endif + +/* struct rle16_s - run length pair + * + * @value: start position of the run + * @length: length of the run is `length + 1` + * + * An RLE pair {v, l} represents the integers in the closed interval + * [v, v+l], e.g. {3, 2} = [3, 4, 5]. 
+ */ +struct rle16_s { + uint16_t value; + uint16_t length; +}; + +typedef struct rle16_s rle16_t; + +#ifdef __cplusplus + #define MAKE_RLE16(val,len) \ + {(uint16_t)(val), (uint16_t)(len)} // no tagged structs until c++20 +#else + #define MAKE_RLE16(val,len) \ + (rle16_t){.value = (uint16_t)(val), .length = (uint16_t)(len)} +#endif + +/* struct run_container_s - run container bitmap + * + * @n_runs: number of rle16_t pairs in `runs`. + * @capacity: capacity in rle16_t pairs `runs` can hold. + * @runs: pairs of rle16_t. + */ +STRUCT_CONTAINER(run_container_s) { + int32_t n_runs; + int32_t capacity; + rle16_t *runs; +}; + +typedef struct run_container_s run_container_t; + +#define CAST_run(c) CAST(run_container_t *, c) // safer downcast +#define const_CAST_run(c) CAST(const run_container_t *, c) +#define movable_CAST_run(c) movable_CAST(run_container_t **, c) + +/* Create a new run container. Return NULL in case of failure. */ +run_container_t *run_container_create(void); + +/* Create a new run container with given capacity. Return NULL in case of + * failure. */ +run_container_t *run_container_create_given_capacity(int32_t size); + +/* + * Shrink the capacity to the actual size, return the number of bytes saved. + */ +int run_container_shrink_to_fit(run_container_t *src); + +/* Free memory owned by `run'. */ +void run_container_free(run_container_t *run); + +/* Duplicate container */ +run_container_t *run_container_clone(const run_container_t *src); + +/* + * Effectively deletes the run at index `index`, repacking data. 
+ */ +static inline void recoverRoomAtIndex(run_container_t *run, uint16_t index) { + memmove(run->runs + index, run->runs + (1 + index), + (run->n_runs - index - 1) * sizeof(rle16_t)); + run->n_runs--; +} + +/** + * Good old binary search through rle data, comparing ikey against the `value' + * (start) field of each run. Returns the index of the run starting at ikey, + * or -(insertion point + 1) when no run starts there. + */ +inline int32_t interleavedBinarySearch(const rle16_t *array, int32_t lenarray, + uint16_t ikey) { + int32_t low = 0; + int32_t high = lenarray - 1; + while (low <= high) { + int32_t middleIndex = (low + high) >> 1; + uint16_t middleValue = array[middleIndex].value; + if (middleValue < ikey) { + low = middleIndex + 1; + } else if (middleValue > ikey) { + high = middleIndex - 1; + } else { + return middleIndex; + } + } + return -(low + 1); +} + +/* + * Returns index of the run which contains $ikey, or -(insertion point + 1) + * when no run contains it. + */ +static inline int32_t rle16_find_run(const rle16_t *array, int32_t lenarray, + uint16_t ikey) { + int32_t low = 0; + int32_t high = lenarray - 1; + while (low <= high) { + int32_t middleIndex = (low + high) >> 1; + uint16_t min = array[middleIndex].value; + uint16_t max = array[middleIndex].value + array[middleIndex].length; + if (ikey > max) { + low = middleIndex + 1; + } else if (ikey < min) { + high = middleIndex - 1; + } else { + return middleIndex; + } + } + return -(low + 1); +} + + +/** + * Returns number of runs which can't be merged with the key because they + * are less than the key. + * Note that [5,6,7,8] can be merged with the key 9 and won't be counted. 
+ */ +static inline int32_t rle16_count_less(const rle16_t* array, int32_t lenarray, + uint16_t key) { + if (lenarray == 0) return 0; + int32_t low = 0; + int32_t high = lenarray - 1; + while (low <= high) { + int32_t middleIndex = (low + high) >> 1; + uint16_t min_value = array[middleIndex].value; + uint16_t max_value = array[middleIndex].value + array[middleIndex].length; + if (max_value + UINT32_C(1) < key) { // uint32 arithmetic + low = middleIndex + 1; + } else if (key < min_value) { + high = middleIndex - 1; + } else { + return middleIndex; + } + } + return low; +} + +/** + * Counterpart of rle16_count_less: returns the number of runs that start + * above key + 1. A run that contains the key or starts exactly at key + 1 is + * not counted. + */ +static inline int32_t rle16_count_greater(const rle16_t* array, int32_t lenarray, + uint16_t key) { + if (lenarray == 0) return 0; + int32_t low = 0; + int32_t high = lenarray - 1; + while (low <= high) { + int32_t middleIndex = (low + high) >> 1; + uint16_t min_value = array[middleIndex].value; + uint16_t max_value = array[middleIndex].value + array[middleIndex].length; + if (max_value < key) { + low = middleIndex + 1; + } else if (key + UINT32_C(1) < min_value) { // uint32 arithmetic + high = middleIndex - 1; + } else { + return lenarray - (middleIndex + 1); + } + } + return lenarray - low; +} + +/** + * Increase capacity to at least min. Whether the + * existing data needs to be copied over depends on copy. If "copy" is false, + * then the new content will be uninitialized, otherwise a copy is made. + */ +void run_container_grow(run_container_t *run, int32_t min, bool copy); + +/** + * Moves the data so that we can write data at index + */ +static inline void makeRoomAtIndex(run_container_t *run, uint16_t index) { + /* This function calls realloc + memmove sequentially to move by one index. + * Potentially copying twice the array. 
+ */ + if (run->n_runs + 1 > run->capacity) + run_container_grow(run, run->n_runs + 1, true); + memmove(run->runs + 1 + index, run->runs + index, + (run->n_runs - index) * sizeof(rle16_t)); + run->n_runs++; +} + +/* Add `pos' to `run'. 
Returns true if `pos' was not present. */ +bool run_container_add(run_container_t *run, uint16_t pos); + +/* Remove `pos' from `run'. Returns true if `pos' was present. */ +static inline bool run_container_remove(run_container_t *run, uint16_t pos) { + int32_t index = interleavedBinarySearch(run->runs, run->n_runs, pos); + if (index >= 0) { + int32_t le = run->runs[index].length; + if (le == 0) { + recoverRoomAtIndex(run, (uint16_t)index); + } else { + run->runs[index].value++; + run->runs[index].length--; + } + return true; + } + index = -index - 2; // points to preceding value, possibly -1 + if (index >= 0) { // possible match + int32_t offset = pos - run->runs[index].value; + int32_t le = run->runs[index].length; + if (offset < le) { + // need to break in two + run->runs[index].length = (uint16_t)(offset - 1); + // need to insert + uint16_t newvalue = pos + 1; + int32_t newlength = le - offset - 1; + makeRoomAtIndex(run, (uint16_t)(index + 1)); + run->runs[index + 1].value = newvalue; + run->runs[index + 1].length = (uint16_t)newlength; + return true; + + } else if (offset == le) { + run->runs[index].length--; + return true; + } + } + // no match + return false; +} + +/* Check whether `pos' is present in `run'. */ +inline bool run_container_contains(const run_container_t *run, uint16_t pos) { + int32_t index = interleavedBinarySearch(run->runs, run->n_runs, pos); + if (index >= 0) return true; + index = -index - 2; // points to preceding value, possibly -1 + if (index != -1) { // possible match + int32_t offset = pos - run->runs[index].value; + int32_t le = run->runs[index].length; + if (offset <= le) return true; + } + return false; +} + +/* +* Check whether all positions in a range of positions from pos_start (included) +* to pos_end (excluded) is present in `run'. 
+*/ +static inline bool run_container_contains_range(const run_container_t *run, + uint32_t pos_start, uint32_t pos_end) { + uint32_t count = 0; + int32_t index = interleavedBinarySearch(run->runs, run->n_runs, pos_start); + if (index < 0) { + index = -index - 2; + if ((index == -1) || ((pos_start - run->runs[index].value) > run->runs[index].length)){ + return false; + } + } + for (int32_t i = index; i < run->n_runs; ++i) { + const uint32_t stop = run->runs[i].value + run->runs[i].length; + if (run->runs[i].value >= pos_end) break; + if (stop >= pos_end) { + count += (((pos_end - run->runs[i].value) > 0) ? (pos_end - run->runs[i].value) : 0); + break; + } + const uint32_t min = (stop - pos_start) > 0 ? (stop - pos_start) : 0; + count += (min < run->runs[i].length) ? min : run->runs[i].length; + } + return count >= (pos_end - pos_start - 1); +} + +/* Get the cardinality of `run'. Requires an actual computation. */ +int run_container_cardinality(const run_container_t *run); + +/* Card > 0?, see run_container_empty for the reverse */ +static inline bool run_container_nonzero_cardinality( + const run_container_t *run) { + return run->n_runs > 0; // runs never empty +} + +/* Card == 0?, see run_container_nonzero_cardinality for the reverse */ +static inline bool run_container_empty( + const run_container_t *run) { + return run->n_runs == 0; // runs never empty +} + + + +/* Copy one container into another. We assume that they are distinct. */ +void run_container_copy(const run_container_t *src, run_container_t *dst); + +/** + * Append run described by vl to the run container, possibly merging. + * It is assumed that the run would be inserted at the end of the container, no + * check is made. + * It is assumed that the run container has the necessary capacity: caller is + * responsible for checking memory capacity. + * + * + * This is not a safe function, it is meant for performance: use with care. 
+ */ +static inline void run_container_append(run_container_t *run, rle16_t vl, + rle16_t *previousrl) { + const uint32_t previousend = previousrl->value + previousrl->length; + if (vl.value > previousend + 1) { // we add a new one + run->runs[run->n_runs] = vl; + run->n_runs++; + *previousrl = vl; + } else { + uint32_t newend = vl.value + vl.length + UINT32_C(1); + if (newend > previousend) { // we merge + previousrl->length = (uint16_t)(newend - 1 - previousrl->value); + run->runs[run->n_runs - 1] = *previousrl; + } + } +} + +/** + * Like run_container_append but it is assumed that the content of run is empty. + */ +static inline rle16_t run_container_append_first(run_container_t *run, + rle16_t vl) { + run->runs[run->n_runs] = vl; + run->n_runs++; + return vl; +} + +/** + * append a single value given by val to the run container, possibly merging. + * It is assumed that the value would be inserted at the end of the container, + * no check is made. + * It is assumed that the run container has the necessary capacity: caller is + * responsible for checking memory capacity. + * + * This is not a safe function, it is meant for performance: use with care. + */ +static inline void run_container_append_value(run_container_t *run, + uint16_t val, + rle16_t *previousrl) { + const uint32_t previousend = previousrl->value + previousrl->length; + if (val > previousend + 1) { // we add a new one + *previousrl = MAKE_RLE16(val, 0); + run->runs[run->n_runs] = *previousrl; + run->n_runs++; + } else if (val == previousend + 1) { // we merge + previousrl->length++; + run->runs[run->n_runs - 1] = *previousrl; + } +} + +/** + * Like run_container_append_value but it is assumed that the content of run is + * empty. 
+ */ +static inline rle16_t run_container_append_value_first(run_container_t *run, + uint16_t val) { + rle16_t newrle = MAKE_RLE16(val, 0); + run->runs[run->n_runs] = newrle; + run->n_runs++; + return newrle; +} + +/* Check whether the container spans the whole chunk (cardinality = 1<<16). + * This check can be done in constant time (inexpensive); runs[0] is read only after n_runs == 1 is confirmed, so a container without runs is never dereferenced. */ +static inline bool run_container_is_full(const run_container_t *run) { + if (run->n_runs != 1) return false; + return (run->runs[0].value == 0) && (run->runs[0].length == 0xFFFF); +} + +/* Compute the union of `src_1' and `src_2' and write the result to `dst' + * It is assumed that `dst' is distinct from both `src_1' and `src_2'. */ +void run_container_union(const run_container_t *src_1, + const run_container_t *src_2, run_container_t *dst); + +/* Compute the union of `src_1' and `src_2' and write the result to `src_1' */ +void run_container_union_inplace(run_container_t *src_1, + const run_container_t *src_2); + +/* Compute the intersection of src_1 and src_2 and write the result to + * dst. It is assumed that dst is distinct from both src_1 and src_2. */ +void run_container_intersection(const run_container_t *src_1, + const run_container_t *src_2, + run_container_t *dst); + +/* Compute the size of the intersection of src_1 and src_2 . */ +int run_container_intersection_cardinality(const run_container_t *src_1, + const run_container_t *src_2); + +/* Check whether src_1 and src_2 intersect. */ +bool run_container_intersect(const run_container_t *src_1, + const run_container_t *src_2); + +/* Compute the symmetric difference of `src_1' and `src_2' and write the result + * to `dst' + * It is assumed that `dst' is distinct from both `src_1' and `src_2'. 
*/ +void run_container_xor(const run_container_t *src_1, + const run_container_t *src_2, run_container_t *dst); + +/* + * Write out the 16-bit integers contained in this container as a list of 32-bit + * integers using base + * as the starting value (it might be expected that base has zeros in its 16 + * least significant bits). + * The function returns the number of values written. + * The caller is responsible for allocating enough memory in out. + */ +int run_container_to_uint32_array(void *vout, const run_container_t *cont, + uint32_t base); + +/* + * Print this container using printf (useful for debugging). + */ +void run_container_printf(const run_container_t *v); + +/* + * Print this container using printf as a comma-separated list of 32-bit + * integers starting at base. + */ +void run_container_printf_as_uint32_array(const run_container_t *v, + uint32_t base); + +/** + * Return the serialized size in bytes of a container having "num_runs" runs. + */ +static inline int32_t run_container_serialized_size_in_bytes(int32_t num_runs) { + return sizeof(uint16_t) + + sizeof(rle16_t) * num_runs; // each run requires 2 2-byte entries. +} + +bool run_container_iterate(const run_container_t *cont, uint32_t base, + roaring_iterator iterator, void *ptr); +bool run_container_iterate64(const run_container_t *cont, uint32_t base, + roaring_iterator64 iterator, uint64_t high_bits, + void *ptr); + +/** + * Writes the underlying array to buf, outputs how many bytes were written. + * This is meant to be byte-by-byte compatible with the Java and Go versions of + * Roaring. + * The number of bytes written should be run_container_size_in_bytes(container). + */ +int32_t run_container_write(const run_container_t *container, char *buf); + +/** + * Reads the instance from buf, outputs how many bytes were read. + * This is meant to be byte-by-byte compatible with the Java and Go versions of + * Roaring. + * The number of bytes read should be bitset_container_size_in_bytes(container). 
+ * The cardinality parameter is provided for consistency with other containers, + * but + * it might be effectively ignored.. + */ +int32_t run_container_read(int32_t cardinality, run_container_t *container, + const char *buf); + +/** + * Return the serialized size in bytes of a container (see run_container_write). + * This is meant to be compatible with the Java and Go versions of Roaring. + */ +static inline int32_t run_container_size_in_bytes( + const run_container_t *container) { + return run_container_serialized_size_in_bytes(container->n_runs); +} + +/** + * Return true if the two containers have the same content. + */ +ALLOW_UNALIGNED +static inline bool run_container_equals(const run_container_t *container1, + const run_container_t *container2) { + if (container1->n_runs != container2->n_runs) { + return false; + } + return memequals(container1->runs, container2->runs, + container1->n_runs * sizeof(rle16_t)); +} + +/** +* Return true if container1 is a subset of container2. +*/ +bool run_container_is_subset(const run_container_t *container1, + const run_container_t *container2); + +/** + * Used in a start-finish scan that appends segments, for XOR and NOT + */ + +void run_container_smart_append_exclusive(run_container_t *src, + const uint16_t start, + const uint16_t length); + +/** +* The new container consists of a single run [start,stop). +* It is required that stop>start, the caller is responsability for this check. +* It is required that stop <= (1<<16), the caller is responsability for this check. +* The cardinality of the created container is stop - start. 
+* Returns NULL on failure +*/ +static inline run_container_t *run_container_create_range(uint32_t start, + uint32_t stop) { + run_container_t *rc = run_container_create_given_capacity(1); + if (rc) { + rle16_t r; + r.value = (uint16_t)start; + r.length = (uint16_t)(stop - start - 1); + run_container_append_first(rc, r); + } + return rc; +} + +/** + * If the element of given rank is in this container, supposing that the first + * element has rank start_rank, then the function returns true and sets element + * accordingly. + * Otherwise, it returns false and updates start_rank. + */ +bool run_container_select(const run_container_t *container, + uint32_t *start_rank, uint32_t rank, + uint32_t *element); + +/* Compute the difference of src_1 and src_2 and write the result to + * dst. It is assumed that dst is distinct from both src_1 and src_2. */ + +void run_container_andnot(const run_container_t *src_1, + const run_container_t *src_2, run_container_t *dst); + +void run_container_offset(const run_container_t *c, + container_t **loc, container_t **hic, + uint16_t offset); + +/* Returns the smallest value, or 0 when the container has no runs */ +inline uint16_t run_container_minimum(const run_container_t *run) { + if (run->n_runs == 0) return 0; + return run->runs[0].value; +} + +/* Returns the largest value (value + length of the last run), or 0 when the container has no runs */ +inline uint16_t run_container_maximum(const run_container_t *run) { + if (run->n_runs == 0) return 0; + return run->runs[run->n_runs - 1].value + run->runs[run->n_runs - 1].length; +} + +/* Returns the number of values equal or smaller than x */ +int run_container_rank(const run_container_t *arr, uint16_t x); + +/* Returns the index of x, or -1 if x does not exist in the container */ +int run_container_get_index(const run_container_t *arr, uint16_t x); + +/* Returns the index of the first run containing a value at least as large as x, or -1 */ +inline int run_container_index_equalorlarger(const run_container_t *arr, uint16_t x) { + int32_t index = 
interleavedBinarySearch(arr->runs, arr->n_runs, x); + if (index >= 0) return index; + index = -index - 2; // points to preceding run, possibly -1 + if (index != -1) { // possible match + int32_t offset = x - arr->runs[index].value; + int32_t le = arr->runs[index].length; + if (offset <= le) return index; + } + index += 1; + if(index < arr->n_runs) { + return index; + } + return -1; +} + +/* + * Add all values in range [min, max] using hint. + */ +static inline void run_container_add_range_nruns(run_container_t* run, + uint32_t min, uint32_t max, + int32_t nruns_less, + int32_t nruns_greater) { + int32_t nruns_common = run->n_runs - nruns_less - nruns_greater; + if (nruns_common == 0) { + makeRoomAtIndex(run, nruns_less); + run->runs[nruns_less].value = min; + run->runs[nruns_less].length = max - min; + } else { + uint32_t common_min = run->runs[nruns_less].value; + uint32_t common_max = run->runs[nruns_less + nruns_common - 1].value + + run->runs[nruns_less + nruns_common - 1].length; + uint32_t result_min = (common_min < min) ? common_min : min; + uint32_t result_max = (common_max > max) ? common_max : max; + + run->runs[nruns_less].value = result_min; + run->runs[nruns_less].length = result_max - result_min; + + memmove(&(run->runs[nruns_less + 1]), + &(run->runs[run->n_runs - nruns_greater]), + nruns_greater*sizeof(rle16_t)); + run->n_runs = nruns_less + 1 + nruns_greater; + } +} + +/** + * Add all values in range [min, max]. This function is currently unused + * and left as documentation. 
+ */ +/*static inline void run_container_add_range(run_container_t* run, + uint32_t min, uint32_t max) { + int32_t nruns_greater = rle16_count_greater(run->runs, run->n_runs, max); + int32_t nruns_less = rle16_count_less(run->runs, run->n_runs - nruns_greater, min); + run_container_add_range_nruns(run, min, max, nruns_less, nruns_greater); +}*/ + +/** + * Shifts last $count elements either left (distance < 0) or right (distance > 0); n_runs changes by $distance, so a right shift first ensures capacity for n_runs + distance runs + */ +static inline void run_container_shift_tail(run_container_t* run, + int32_t count, int32_t distance) { + if (distance > 0) { + if (run->capacity < run->n_runs + distance) { + run_container_grow(run, run->n_runs + distance, true); + } + } + int32_t srcpos = run->n_runs - count; + int32_t dstpos = srcpos + distance; + memmove(&(run->runs[dstpos]), &(run->runs[srcpos]), sizeof(rle16_t) * count); + run->n_runs += distance; +} + +/** + * Remove all elements in range [min, max] + */ +static inline void run_container_remove_range(run_container_t *run, uint32_t min, uint32_t max) { + int32_t first = rle16_find_run(run->runs, run->n_runs, min); + int32_t last = rle16_find_run(run->runs, run->n_runs, max); + + if (first >= 0 && min > run->runs[first].value && + max < ((uint32_t)run->runs[first].value + (uint32_t)run->runs[first].length)) { + // split this run into two adjacent runs + + // right subinterval + makeRoomAtIndex(run, first+1); + run->runs[first+1].value = max + 1; + run->runs[first+1].length = (run->runs[first].value + run->runs[first].length) - (max + 1); + + // left subinterval + run->runs[first].length = (min - 1) - run->runs[first].value; + + return; + } + + // update left-most partial run + if (first >= 0) { + if (min > run->runs[first].value) { + run->runs[first].length = (min - 1) - run->runs[first].value; + first++; + } + } else { + first = -first-1; + } + + // update right-most run + if (last >= 0) { + uint16_t run_max = run->runs[last].value + run->runs[last].length; + if (run_max > max) { + run->runs[last].value = max + 1; + 
run->runs[last].length = run_max - (max + 1); + last--; + } + } else { + last = (-last-1) - 1; + } + + // remove intermediate runs + if (first <= last) { + run_container_shift_tail(run, run->n_runs - (last+1), -(last-first+1)); + } +} + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif + +#endif /* INCLUDE_CONTAINERS_RUN_H_ */ +/* end file include/roaring/containers/run.h */ +/* begin file include/roaring/containers/convert.h */ +/* + * convert.h + * + */ + +#ifndef INCLUDE_CONTAINERS_CONVERT_H_ +#define INCLUDE_CONTAINERS_CONVERT_H_ + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +/* Convert an array into a bitset. The input container is not freed or modified. + */ +bitset_container_t *bitset_container_from_array(const array_container_t *arr); + +/* Convert a run into a bitset. The input container is not freed or modified. */ +bitset_container_t *bitset_container_from_run(const run_container_t *arr); + +/* Convert a run into an array. The input container is not freed or modified. */ +array_container_t *array_container_from_run(const run_container_t *arr); + +/* Convert a bitset into an array. The input container is not freed or modified. + */ +array_container_t *array_container_from_bitset(const bitset_container_t *bits); + +/* Convert an array into a run. The input container is not freed or modified. + */ +run_container_t *run_container_from_array(const array_container_t *c); + +/* convert a run into either an array or a bitset + * might free the container. This does not free the input run container. */ +container_t *convert_to_bitset_or_array_container( + run_container_t *rc, int32_t card, + uint8_t *resulttype); + +/* convert containers to and from runcontainers, as is most space efficient. + * The container might be freed. 
*/ +container_t *convert_run_optimize( + container_t *c, uint8_t typecode_original, + uint8_t *typecode_after); + +/* converts a run container to either an array or a bitset, IF it saves space. + */ +/* If a conversion occurs, the caller is responsible for freeing the original + * container and + * becomes responsible for freeing the new one. */ +container_t *convert_run_to_efficient_container( + run_container_t *c, uint8_t *typecode_after); + +// like convert_run_to_efficient_container but frees the old result if needed +container_t *convert_run_to_efficient_container_and_free( + run_container_t *c, uint8_t *typecode_after); + +/** + * Create new container which is a union of run container and + * range [min, max]. Caller is responsible for freeing run container. + */ +container_t *container_from_run_range( + const run_container_t *run, + uint32_t min, uint32_t max, + uint8_t *typecode_after); + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif + +#endif /* INCLUDE_CONTAINERS_CONVERT_H_ */ +/* end file include/roaring/containers/convert.h */ +/* begin file include/roaring/containers/mixed_equal.h */ +/* + * mixed_equal.h + * + */ + +#ifndef CONTAINERS_MIXED_EQUAL_H_ +#define CONTAINERS_MIXED_EQUAL_H_ + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +/** + * Return true if the two containers have the same content. + */ +bool array_container_equal_bitset(const array_container_t* container1, + const bitset_container_t* container2); + +/** + * Return true if the two containers have the same content. + */ +bool run_container_equals_array(const run_container_t* container1, + const array_container_t* container2); +/** + * Return true if the two containers have the same content. 
+ */ +bool run_container_equals_bitset(const run_container_t* container1, + const bitset_container_t* container2); + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif + +#endif /* CONTAINERS_MIXED_EQUAL_H_ */ +/* end file include/roaring/containers/mixed_equal.h */ +/* begin file include/roaring/containers/mixed_subset.h */ +/* + * mixed_subset.h + * + */ + +#ifndef CONTAINERS_MIXED_SUBSET_H_ +#define CONTAINERS_MIXED_SUBSET_H_ + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +/** + * Return true if container1 is a subset of container2. + */ +bool array_container_is_subset_bitset(const array_container_t* container1, + const bitset_container_t* container2); + +/** +* Return true if container1 is a subset of container2. + */ +bool run_container_is_subset_array(const run_container_t* container1, + const array_container_t* container2); + +/** +* Return true if container1 is a subset of container2. + */ +bool array_container_is_subset_run(const array_container_t* container1, + const run_container_t* container2); + +/** +* Return true if container1 is a subset of container2. + */ +bool run_container_is_subset_bitset(const run_container_t* container1, + const bitset_container_t* container2); + +/** +* Return true if container1 is a subset of container2. 
+*/ +bool bitset_container_is_subset_run(const bitset_container_t* container1, + const run_container_t* container2); + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif + +#endif /* CONTAINERS_MIXED_SUBSET_H_ */ +/* end file include/roaring/containers/mixed_subset.h */ +/* begin file include/roaring/containers/mixed_andnot.h */ +/* + * mixed_andnot.h + */ +#ifndef INCLUDE_CONTAINERS_MIXED_ANDNOT_H_ +#define INCLUDE_CONTAINERS_MIXED_ANDNOT_H_ + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst, a valid array container that could be the same as src_1.*/ +void array_bitset_container_andnot(const array_container_t *src_1, + const bitset_container_t *src_2, + array_container_t *dst); + +/* Compute the andnot of src_1 and src_2 and write the result to + * src_1 */ + +void array_bitset_container_iandnot(array_container_t *src_1, + const bitset_container_t *src_2); + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst, which does not initially have a valid container. + * Return true for a bitset result; false for array + */ + +bool bitset_array_container_andnot( + const bitset_container_t *src_1, const array_container_t *src_2, + container_t **dst); + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst (which has no container initially). It will modify src_1 + * to be dst if the result is a bitset. Otherwise, it will + * free src_1 and dst will be a new array container. In both + * cases, the caller is responsible for deallocating dst. + * Returns true iff dst is a bitset */ + +bool bitset_array_container_iandnot( + bitset_container_t *src_1, const array_container_t *src_2, + container_t **dst); + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst. Result may be either a bitset or an array container + * (returns "result is bitset"). 
dst does not initially have + * any container, but becomes either a bitset container (return + * result true) or an array container. + */ + +bool run_bitset_container_andnot( + const run_container_t *src_1, const bitset_container_t *src_2, + container_t **dst); + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst. Result may be either a bitset or an array container + * (returns "result is bitset"). dst does not initially have + * any container, but becomes either a bitset container (return + * result true) or an array container. + */ + +bool run_bitset_container_iandnot( + run_container_t *src_1, const bitset_container_t *src_2, + container_t **dst); + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst. Result may be either a bitset or an array container + * (returns "result is bitset"). dst does not initially have + * any container, but becomes either a bitset container (return + * result true) or an array container. + */ + +bool bitset_run_container_andnot( + const bitset_container_t *src_1, const run_container_t *src_2, + container_t **dst); + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst (which has no container initially). It will modify src_1 + * to be dst if the result is a bitset. Otherwise, it will + * free src_1 and dst will be a new array container. In both + * cases, the caller is responsible for deallocating dst. + * Returns true iff dst is a bitset */ + +bool bitset_run_container_iandnot( + bitset_container_t *src_1, const run_container_t *src_2, + container_t **dst); + +/* dst does not indicate a valid container initially. Eventually it + * can become any type of container. + */ + +int run_array_container_andnot( + const run_container_t *src_1, const array_container_t *src_2, + container_t **dst); + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst (which has no container initially). It will modify src_1 + * to be dst if the result is a bitset. 
Otherwise, it will + * free src_1 and dst will be a new array container. In both + * cases, the caller is responsible for deallocating dst. + * Returns true iff dst is a bitset */ + +int run_array_container_iandnot( + run_container_t *src_1, const array_container_t *src_2, + container_t **dst); + +/* dst must be a valid array container, allowed to be src_1 */ + +void array_run_container_andnot(const array_container_t *src_1, + const run_container_t *src_2, + array_container_t *dst); + +/* dst does not indicate a valid container initially. Eventually it + * can become any kind of container. + */ + +void array_run_container_iandnot(array_container_t *src_1, + const run_container_t *src_2); + +/* dst does not indicate a valid container initially. Eventually it + * can become any kind of container. + */ + +int run_run_container_andnot( + const run_container_t *src_1, const run_container_t *src_2, + container_t **dst); + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst (which has no container initially). It will modify src_1 + * to be dst if the result is a bitset. Otherwise, it will + * free src_1 and dst will be a new array container. In both + * cases, the caller is responsible for deallocating dst. + * Returns true iff dst is a bitset */ + +int run_run_container_iandnot( + run_container_t *src_1, const run_container_t *src_2, + container_t **dst); + +/* + * dst is a valid array container and may be the same as src_1 + */ + +void array_array_container_andnot(const array_container_t *src_1, + const array_container_t *src_2, + array_container_t *dst); + +/* inplace array-array andnot will always be able to reuse the space of + * src_1 */ +void array_array_container_iandnot(array_container_t *src_1, + const array_container_t *src_2); + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst (which has no container initially). 
Return value is + * "dst is a bitset" + */ + +bool bitset_bitset_container_andnot( + const bitset_container_t *src_1, const bitset_container_t *src_2, + container_t **dst); + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst (which has no container initially). It will modify src_1 + * to be dst if the result is a bitset. Otherwise, it will + * free src_1 and dst will be a new array container. In both + * cases, the caller is responsible for deallocating dst. + * Returns true iff dst is a bitset */ + +bool bitset_bitset_container_iandnot( + bitset_container_t *src_1, const bitset_container_t *src_2, + container_t **dst); + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif + +#endif +/* end file include/roaring/containers/mixed_andnot.h */ +/* begin file include/roaring/containers/mixed_intersection.h */ +/* + * mixed_intersection.h + * + */ + +#ifndef INCLUDE_CONTAINERS_MIXED_INTERSECTION_H_ +#define INCLUDE_CONTAINERS_MIXED_INTERSECTION_H_ + +/* These functions appear to exclude cases where the + * inputs have the same type and the output is guaranteed + * to have the same type as the inputs. Eg, array intersection + */ + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +/* Compute the intersection of src_1 and src_2 and write the result to + * dst. It is allowed for dst to be equal to src_1. We assume that dst is a + * valid container. */ +void array_bitset_container_intersection(const array_container_t *src_1, + const bitset_container_t *src_2, + array_container_t *dst); + +/* Compute the size of the intersection of src_1 and src_2. */ +int array_bitset_container_intersection_cardinality( + const array_container_t *src_1, const bitset_container_t *src_2); + + + +/* Checking whether src_1 and src_2 intersect. 
*/ +bool array_bitset_container_intersect(const array_container_t *src_1, + const bitset_container_t *src_2); + +/* + * Compute the intersection between src_1 and src_2 and write the result + * to *dst. If the return function is true, the result is a bitset_container_t + * otherwise is a array_container_t. We assume that dst is not pre-allocated. In + * case of failure, *dst will be NULL. + */ +bool bitset_bitset_container_intersection(const bitset_container_t *src_1, + const bitset_container_t *src_2, + container_t **dst); + +/* Compute the intersection between src_1 and src_2 and write the result to + * dst. It is allowed for dst to be equal to src_1. We assume that dst is a + * valid container. */ +void array_run_container_intersection(const array_container_t *src_1, + const run_container_t *src_2, + array_container_t *dst); + +/* Compute the intersection between src_1 and src_2 and write the result to + * *dst. If the result is true then the result is a bitset_container_t + * otherwise is a array_container_t. + * If *dst == src_2, then an in-place intersection is attempted + **/ +bool run_bitset_container_intersection(const run_container_t *src_1, + const bitset_container_t *src_2, + container_t **dst); + +/* Compute the size of the intersection between src_1 and src_2 . */ +int array_run_container_intersection_cardinality(const array_container_t *src_1, + const run_container_t *src_2); + +/* Compute the size of the intersection between src_1 and src_2 + **/ +int run_bitset_container_intersection_cardinality(const run_container_t *src_1, + const bitset_container_t *src_2); + + +/* Check that src_1 and src_2 intersect. */ +bool array_run_container_intersect(const array_container_t *src_1, + const run_container_t *src_2); + +/* Check that src_1 and src_2 intersect. 
+ **/ +bool run_bitset_container_intersect(const run_container_t *src_1, + const bitset_container_t *src_2); + +/* + * Same as bitset_bitset_container_intersection except that if the output is to + * be a + * bitset_container_t, then src_1 is modified and no allocation is made. + * If the output is to be an array_container_t, then caller is responsible + * to free the container. + * In all cases, the result is in *dst. + */ +bool bitset_bitset_container_intersection_inplace( + bitset_container_t *src_1, const bitset_container_t *src_2, + container_t **dst); + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif + +#endif /* INCLUDE_CONTAINERS_MIXED_INTERSECTION_H_ */ +/* end file include/roaring/containers/mixed_intersection.h */ +/* begin file include/roaring/containers/mixed_negation.h */ +/* + * mixed_negation.h + * + */ + +#ifndef INCLUDE_CONTAINERS_MIXED_NEGATION_H_ +#define INCLUDE_CONTAINERS_MIXED_NEGATION_H_ + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +/* Negation across the entire range of the container. + * Compute the negation of src and write the result + * to *dst. The complement of a + * sufficiently sparse set will always be dense and a hence a bitmap + * We assume that dst is pre-allocated and a valid bitset container + * There can be no in-place version. + */ +void array_container_negation(const array_container_t *src, + bitset_container_t *dst); + +/* Negation across the entire range of the container + * Compute the negation of src and write the result + * to *dst. A true return value indicates a bitset result, + * otherwise the result is an array container. + * We assume that dst is not pre-allocated. In + * case of failure, *dst will be NULL. 
+ */ +bool bitset_container_negation( + const bitset_container_t *src, + container_t **dst); + +/* inplace version */ +/* + * Same as bitset_container_negation except that if the output is to + * be a + * bitset_container_t, then src is modified and no allocation is made. + * If the output is to be an array_container_t, then caller is responsible + * to free the container. + * In all cases, the result is in *dst. + */ +bool bitset_container_negation_inplace( + bitset_container_t *src, + container_t **dst); + +/* Negation across the entire range of container + * Compute the negation of src and write the result + * to *dst. + * Return values are the *_TYPECODES as defined * in containers.h + * We assume that dst is not pre-allocated. In + * case of failure, *dst will be NULL. + */ +int run_container_negation(const run_container_t *src, container_t **dst); + +/* + * Same as run_container_negation except that if the output is to + * be a + * run_container_t, and has the capacity to hold the result, + * then src is modified and no allocation is made. + * In all cases, the result is in *dst. + */ +int run_container_negation_inplace(run_container_t *src, container_t **dst); + +/* Negation across a range of the container. + * Compute the negation of src and write the result + * to *dst. Returns true if the result is a bitset container + * and false for an array container. *dst is not preallocated. + */ +bool array_container_negation_range( + const array_container_t *src, + const int range_start, const int range_end, + container_t **dst); + +/* Even when the result would fit, it is unclear how to make an + * inplace version without inefficient copying. Thus this routine + * may be a wrapper for the non-in-place version + */ +bool array_container_negation_range_inplace( + array_container_t *src, + const int range_start, const int range_end, + container_t **dst); + +/* Negation across a range of the container + * Compute the negation of src and write the result + * to *dst. 
A true return value indicates a bitset result, + * otherwise the result is an array container. + * We assume that dst is not pre-allocated. In + * case of failure, *dst will be NULL. + */ +bool bitset_container_negation_range( + const bitset_container_t *src, + const int range_start, const int range_end, + container_t **dst); + +/* inplace version */ +/* + * Same as bitset_container_negation except that if the output is to + * be a + * bitset_container_t, then src is modified and no allocation is made. + * If the output is to be an array_container_t, then caller is responsible + * to free the container. + * In all cases, the result is in *dst. + */ +bool bitset_container_negation_range_inplace( + bitset_container_t *src, + const int range_start, const int range_end, + container_t **dst); + +/* Negation across a range of container + * Compute the negation of src and write the result + * to *dst. Return values are the *_TYPECODES as defined * in containers.h + * We assume that dst is not pre-allocated. In + * case of failure, *dst will be NULL. + */ +int run_container_negation_range( + const run_container_t *src, + const int range_start, const int range_end, + container_t **dst); + +/* + * Same as run_container_negation except that if the output is to + * be a + * run_container_t, and has the capacity to hold the result, + * then src is modified and no allocation is made. + * In all cases, the result is in *dst. 
+ */ +int run_container_negation_range_inplace( + run_container_t *src, + const int range_start, const int range_end, + container_t **dst); + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif + +#endif /* INCLUDE_CONTAINERS_MIXED_NEGATION_H_ */ +/* end file include/roaring/containers/mixed_negation.h */ +/* begin file include/roaring/containers/mixed_union.h */ +/* + * mixed_intersection.h + * + */ + +#ifndef INCLUDE_CONTAINERS_MIXED_UNION_H_ +#define INCLUDE_CONTAINERS_MIXED_UNION_H_ + +/* These functions appear to exclude cases where the + * inputs have the same type and the output is guaranteed + * to have the same type as the inputs. Eg, bitset unions + */ + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +/* Compute the union of src_1 and src_2 and write the result to + * dst. It is allowed for src_2 to be dst. */ +void array_bitset_container_union(const array_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +/* Compute the union of src_1 and src_2 and write the result to + * dst. It is allowed for src_2 to be dst. This version does not + * update the cardinality of dst (it is set to BITSET_UNKNOWN_CARDINALITY). */ +void array_bitset_container_lazy_union(const array_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +/* + * Compute the union between src_1 and src_2 and write the result + * to *dst. If the return function is true, the result is a bitset_container_t + * otherwise is a array_container_t. We assume that dst is not pre-allocated. In + * case of failure, *dst will be NULL. + */ +bool array_array_container_union( + const array_container_t *src_1, const array_container_t *src_2, + container_t **dst); + +/* + * Compute the union between src_1 and src_2 and write the result + * to *dst if it cannot be written to src_1. 
If the return function is true, + * the result is a bitset_container_t + * otherwise is a array_container_t. When the result is an array_container_t, it + * it either written to src_1 (if *dst is null) or to *dst. + * If the result is a bitset_container_t and *dst is null, then there was a failure. + */ +bool array_array_container_inplace_union( + array_container_t *src_1, const array_container_t *src_2, + container_t **dst); + +/* + * Same as array_array_container_union except that it will more eagerly produce + * a bitset. + */ +bool array_array_container_lazy_union( + const array_container_t *src_1, const array_container_t *src_2, + container_t **dst); + +/* + * Same as array_array_container_inplace_union except that it will more eagerly produce + * a bitset. + */ +bool array_array_container_lazy_inplace_union( + array_container_t *src_1, const array_container_t *src_2, + container_t **dst); + +/* Compute the union of src_1 and src_2 and write the result to + * dst. We assume that dst is a + * valid container. The result might need to be further converted to array or + * bitset container, + * the caller is responsible for the eventual conversion. */ +void array_run_container_union(const array_container_t *src_1, + const run_container_t *src_2, + run_container_t *dst); + +/* Compute the union of src_1 and src_2 and write the result to + * src2. The result might need to be further converted to array or + * bitset container, + * the caller is responsible for the eventual conversion. */ +void array_run_container_inplace_union(const array_container_t *src_1, + run_container_t *src_2); + +/* Compute the union of src_1 and src_2 and write the result to + * dst. It is allowed for dst to be src_2. + * If run_container_is_full(src_1) is true, you must not be calling this + *function. 
+ **/ +void run_bitset_container_union(const run_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +/* Compute the union of src_1 and src_2 and write the result to + * dst. It is allowed for dst to be src_2. This version does not + * update the cardinality of dst (it is set to BITSET_UNKNOWN_CARDINALITY). + * If run_container_is_full(src_1) is true, you must not be calling this + * function. + * */ +void run_bitset_container_lazy_union(const run_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif + +#endif /* INCLUDE_CONTAINERS_MIXED_UNION_H_ */ +/* end file include/roaring/containers/mixed_union.h */ +/* begin file include/roaring/containers/mixed_xor.h */ +/* + * mixed_xor.h + * + */ + +#ifndef INCLUDE_CONTAINERS_MIXED_XOR_H_ +#define INCLUDE_CONTAINERS_MIXED_XOR_H_ + +/* These functions appear to exclude cases where the + * inputs have the same type and the output is guaranteed + * to have the same type as the inputs. Eg, bitset unions + */ + +/* + * Java implementation (as of May 2016) for array_run, run_run + * and bitset_run don't do anything different for inplace. + * (They are not truly in place.) + */ + + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +/* Compute the xor of src_1 and src_2 and write the result to + * dst (which has no container initially). + * Result is true iff dst is a bitset */ +bool array_bitset_container_xor( + const array_container_t *src_1, const bitset_container_t *src_2, + container_t **dst); + +/* Compute the xor of src_1 and src_2 and write the result to + * dst. It is allowed for src_2 to be dst. This version does not + * update the cardinality of dst (it is set to BITSET_UNKNOWN_CARDINALITY). 
+ */ + +void array_bitset_container_lazy_xor(const array_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); +/* Compute the xor of src_1 and src_2 and write the result to + * dst (which has no container initially). Return value is + * "dst is a bitset" + */ + +bool bitset_bitset_container_xor( + const bitset_container_t *src_1, const bitset_container_t *src_2, + container_t **dst); + +/* Compute the xor of src_1 and src_2 and write the result to + * dst. Result may be either a bitset or an array container + * (returns "result is bitset"). dst does not initially have + * any container, but becomes either a bitset container (return + * result true) or an array container. + */ + +bool run_bitset_container_xor( + const run_container_t *src_1, const bitset_container_t *src_2, + container_t **dst); + +/* lazy xor. Dst is initialized and may be equal to src_2. + * Result is left as a bitset container, even if actual + * cardinality would dictate an array container. + */ + +void run_bitset_container_lazy_xor(const run_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst); + +/* dst does not indicate a valid container initially. Eventually it + * can become any kind of container. + */ + +int array_run_container_xor( + const array_container_t *src_1, const run_container_t *src_2, + container_t **dst); + +/* dst does not initially have a valid container. Creates either + * an array or a bitset container, indicated by return code + */ + +bool array_array_container_xor( + const array_container_t *src_1, const array_container_t *src_2, + container_t **dst); + +/* dst does not initially have a valid container. Creates either + * an array or a bitset container, indicated by return code. 
+ * A bitset container will not have a valid cardinality and the + * container type might not be correct for the actual cardinality + */ + +bool array_array_container_lazy_xor( + const array_container_t *src_1, const array_container_t *src_2, + container_t **dst); + +/* Dst is a valid run container. (Can it be src_2? Let's say not.) + * Leaves result as run container, even if other options are + * smaller. + */ + +void array_run_container_lazy_xor(const array_container_t *src_1, + const run_container_t *src_2, + run_container_t *dst); + +/* dst does not indicate a valid container initially. Eventually it + * can become any kind of container. + */ + +int run_run_container_xor( + const run_container_t *src_1, const run_container_t *src_2, + container_t **dst); + +/* INPLACE versions (initial implementation may not exploit all inplace + * opportunities (if any...) + */ + +/* Compute the xor of src_1 and src_2 and write the result to + * dst (which has no container initially). It will modify src_1 + * to be dst if the result is a bitset. Otherwise, it will + * free src_1 and dst will be a new array container. In both + * cases, the caller is responsible for deallocating dst. + * Returns true iff dst is a bitset */ + +bool bitset_array_container_ixor( + bitset_container_t *src_1, const array_container_t *src_2, + container_t **dst); + +bool bitset_bitset_container_ixor( + bitset_container_t *src_1, const bitset_container_t *src_2, + container_t **dst); + +bool array_bitset_container_ixor( + array_container_t *src_1, const bitset_container_t *src_2, + container_t **dst); + +/* Compute the xor of src_1 and src_2 and write the result to + * dst. Result may be either a bitset or an array container + * (returns "result is bitset"). dst does not initially have + * any container, but becomes either a bitset container (return + * result true) or an array container. 
+ */ + +bool run_bitset_container_ixor( + run_container_t *src_1, const bitset_container_t *src_2, + container_t **dst); + +bool bitset_run_container_ixor( + bitset_container_t *src_1, const run_container_t *src_2, + container_t **dst); + +/* dst does not indicate a valid container initially. Eventually it + * can become any kind of container. + */ + +int array_run_container_ixor( + array_container_t *src_1, const run_container_t *src_2, + container_t **dst); + +int run_array_container_ixor( + run_container_t *src_1, const array_container_t *src_2, + container_t **dst); + +bool array_array_container_ixor( + array_container_t *src_1, const array_container_t *src_2, + container_t **dst); + +int run_run_container_ixor( + run_container_t *src_1, const run_container_t *src_2, + container_t **dst); + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif + +#endif +/* end file include/roaring/containers/mixed_xor.h */ +/* begin file include/roaring/containers/containers.h */ +#ifndef CONTAINERS_CONTAINERS_H +#define CONTAINERS_CONTAINERS_H + +#include +#include +#include + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +// would enum be possible or better? + +/** + * The switch case statements follow + * BITSET_CONTAINER_TYPE -- ARRAY_CONTAINER_TYPE -- RUN_CONTAINER_TYPE + * so it makes more sense to number them 1, 2, 3 (in the vague hope that the + * compiler might exploit this ordering). + */ + +#define BITSET_CONTAINER_TYPE 1 +#define ARRAY_CONTAINER_TYPE 2 +#define RUN_CONTAINER_TYPE 3 +#define SHARED_CONTAINER_TYPE 4 + +/** + * Macros for pairing container type codes, suitable for switch statements. + * Use PAIR_CONTAINER_TYPES() for the switch, CONTAINER_PAIR() for the cases: + * + * switch (PAIR_CONTAINER_TYPES(type1, type2)) { + * case CONTAINER_PAIR(BITSET,ARRAY): + * ... 
+ * } + */ +#define PAIR_CONTAINER_TYPES(type1,type2) \ + (4 * (type1) + (type2)) + +#define CONTAINER_PAIR(name1,name2) \ + (4 * (name1##_CONTAINER_TYPE) + (name2##_CONTAINER_TYPE)) + +/** + * A shared container is a wrapper around a container + * with reference counting. + */ +STRUCT_CONTAINER(shared_container_s) { + container_t *container; + uint8_t typecode; + croaring_refcount_t counter; // to be managed atomically +}; + +typedef struct shared_container_s shared_container_t; + +#define CAST_shared(c) CAST(shared_container_t *, c) // safer downcast +#define const_CAST_shared(c) CAST(const shared_container_t *, c) +#define movable_CAST_shared(c) movable_CAST(shared_container_t **, c) + +/* + * With copy_on_write = true + * Create a new shared container if the typecode is not SHARED_CONTAINER_TYPE, + * otherwise, increase the count + * If copy_on_write = false, then clone. + * Return NULL in case of failure. + **/ +container_t *get_copy_of_container(container_t *container, uint8_t *typecode, + bool copy_on_write); + +/* Frees a shared container (actually decrement its counter and only frees when + * the counter falls to zero). 
*/ +void shared_container_free(shared_container_t *container); + +/* extract a copy from the shared container, freeing the shared container if +there is just one instance left, +clone instances when the counter is higher than one +*/ +container_t *shared_container_extract_copy(shared_container_t *container, + uint8_t *typecode); + +/* access to container underneath */ +static inline const container_t *container_unwrap_shared( + const container_t *candidate_shared_container, uint8_t *type +){ + if (*type == SHARED_CONTAINER_TYPE) { + *type = const_CAST_shared(candidate_shared_container)->typecode; + assert(*type != SHARED_CONTAINER_TYPE); + return const_CAST_shared(candidate_shared_container)->container; + } else { + return candidate_shared_container; + } +} + + +/* access to container underneath */ +static inline container_t *container_mutable_unwrap_shared( + container_t *c, uint8_t *type +) { + if (*type == SHARED_CONTAINER_TYPE) { // the passed in container is shared + *type = CAST_shared(c)->typecode; + assert(*type != SHARED_CONTAINER_TYPE); + return CAST_shared(c)->container; // return the enclosed container + } else { + return c; // wasn't shared, so return as-is + } +} + +/* access to container underneath and queries its type */ +static inline uint8_t get_container_type( + const container_t *c, uint8_t type +){ + if (type == SHARED_CONTAINER_TYPE) { + return const_CAST_shared(c)->typecode; + } else { + return type; + } +} + +/** + * Copies a container, requires a typecode. This allocates new memory, caller + * is responsible for deallocation. If the container is not shared, then it is + * physically cloned. Sharable containers are not cloneable. 
+ */ +container_t *container_clone(const container_t *container, uint8_t typecode); + +/* access to container underneath, cloning it if needed */ +static inline container_t *get_writable_copy_if_shared( + container_t *c, uint8_t *type +){ + if (*type == SHARED_CONTAINER_TYPE) { // shared, return enclosed container + return shared_container_extract_copy(CAST_shared(c), type); + } else { + return c; // not shared, so return as-is + } +} + +/** + * End of shared container code + */ + +static const char *container_names[] = {"bitset", "array", "run", "shared"}; +static const char *shared_container_names[] = { + "bitset (shared)", "array (shared)", "run (shared)"}; + +// no matter what the initial container was, convert it to a bitset +// if a new container is produced, caller responsible for freeing the previous +// one +// container should not be a shared container +static inline bitset_container_t *container_to_bitset( + container_t *c, uint8_t typecode +){ + bitset_container_t *result = NULL; + switch (typecode) { + case BITSET_CONTAINER_TYPE: + return CAST_bitset(c); // nothing to do + case ARRAY_CONTAINER_TYPE: + result = bitset_container_from_array(CAST_array(c)); + return result; + case RUN_CONTAINER_TYPE: + result = bitset_container_from_run(CAST_run(c)); + return result; + case SHARED_CONTAINER_TYPE: + assert(false); + roaring_unreachable; + } + assert(false); + roaring_unreachable; + return 0; // unreached +} + +/** + * Get the container name from the typecode + * (unused at time of writing) + */ +/*static inline const char *get_container_name(uint8_t typecode) { + switch (typecode) { + case BITSET_CONTAINER_TYPE: + return container_names[0]; + case ARRAY_CONTAINER_TYPE: + return container_names[1]; + case RUN_CONTAINER_TYPE: + return container_names[2]; + case SHARED_CONTAINER_TYPE: + return container_names[3]; + default: + assert(false); + roaring_unreachable; + return "unknown"; + } +}*/ + +static inline const char *get_full_container_name( + const 
container_t *c, uint8_t typecode +){ + switch (typecode) { + case BITSET_CONTAINER_TYPE: + return container_names[0]; + case ARRAY_CONTAINER_TYPE: + return container_names[1]; + case RUN_CONTAINER_TYPE: + return container_names[2]; + case SHARED_CONTAINER_TYPE: + switch (const_CAST_shared(c)->typecode) { + case BITSET_CONTAINER_TYPE: + return shared_container_names[0]; + case ARRAY_CONTAINER_TYPE: + return shared_container_names[1]; + case RUN_CONTAINER_TYPE: + return shared_container_names[2]; + default: + assert(false); + roaring_unreachable; + return "unknown"; + } + break; + default: + assert(false); + roaring_unreachable; + return "unknown"; + } + roaring_unreachable; + return NULL; +} + +/** + * Get the container cardinality (number of elements), requires a typecode + */ +static inline int container_get_cardinality( + const container_t *c, uint8_t typecode +){ + c = container_unwrap_shared(c, &typecode); + switch (typecode) { + case BITSET_CONTAINER_TYPE: + return bitset_container_cardinality(const_CAST_bitset(c)); + case ARRAY_CONTAINER_TYPE: + return array_container_cardinality(const_CAST_array(c)); + case RUN_CONTAINER_TYPE: + return run_container_cardinality(const_CAST_run(c)); + } + assert(false); + roaring_unreachable; + return 0; // unreached +} + + + +// returns true if a container is known to be full. 
Note that a lazy bitset +// container +// might be full without us knowing +static inline bool container_is_full(const container_t *c, uint8_t typecode) { + c = container_unwrap_shared(c, &typecode); + switch (typecode) { + case BITSET_CONTAINER_TYPE: + return bitset_container_cardinality( + const_CAST_bitset(c)) == (1 << 16); + case ARRAY_CONTAINER_TYPE: + return array_container_cardinality( + const_CAST_array(c)) == (1 << 16); + case RUN_CONTAINER_TYPE: + return run_container_is_full(const_CAST_run(c)); + } + assert(false); + roaring_unreachable; + return 0; // unreached +} + +static inline int container_shrink_to_fit( + container_t *c, uint8_t type +){ + c = container_mutable_unwrap_shared(c, &type); + switch (type) { + case BITSET_CONTAINER_TYPE: + return 0; // no shrinking possible + case ARRAY_CONTAINER_TYPE: + return array_container_shrink_to_fit(CAST_array(c)); + case RUN_CONTAINER_TYPE: + return run_container_shrink_to_fit(CAST_run(c)); + } + assert(false); + roaring_unreachable; + return 0; // unreached +} + + +/** + * make a container with a run of ones + */ +/* initially always use a run container, even if an array might be + * marginally + * smaller */ +static inline container_t *container_range_of_ones( + uint32_t range_start, uint32_t range_end, + uint8_t *result_type +){ + assert(range_end >= range_start); + uint64_t cardinality = range_end - range_start + 1; + if(cardinality <= 2) { + *result_type = ARRAY_CONTAINER_TYPE; + return array_container_create_range(range_start, range_end); + } else { + *result_type = RUN_CONTAINER_TYPE; + return run_container_create_range(range_start, range_end); + } +} + + +/* Create a container with all the values between in [min,max) at a + distance k*step from min. 
*/ +static inline container_t *container_from_range( + uint8_t *type, uint32_t min, + uint32_t max, uint16_t step +){ + if (step == 0) return NULL; // being paranoid + if (step == 1) { + return container_range_of_ones(min,max,type); + // Note: the result is not always a run (need to check the cardinality) + //*type = RUN_CONTAINER_TYPE; + //return run_container_create_range(min, max); + } + int size = (max - min + step - 1) / step; + if (size <= DEFAULT_MAX_SIZE) { // array container + *type = ARRAY_CONTAINER_TYPE; + array_container_t *array = array_container_create_given_capacity(size); + array_container_add_from_range(array, min, max, step); + assert(array->cardinality == size); + return array; + } else { // bitset container + *type = BITSET_CONTAINER_TYPE; + bitset_container_t *bitset = bitset_container_create(); + bitset_container_add_from_range(bitset, min, max, step); + assert(bitset->cardinality == size); + return bitset; + } +} + +/** + * "repair" the container after lazy operations. + */ +static inline container_t *container_repair_after_lazy( + container_t *c, uint8_t *type +){ + c = get_writable_copy_if_shared(c, type); // !!! unnecessary cloning + container_t *result = NULL; + switch (*type) { + case BITSET_CONTAINER_TYPE: { + bitset_container_t *bc = CAST_bitset(c); + bc->cardinality = bitset_container_compute_cardinality(bc); + if (bc->cardinality <= DEFAULT_MAX_SIZE) { + result = array_container_from_bitset(bc); + bitset_container_free(bc); + *type = ARRAY_CONTAINER_TYPE; + return result; + } + return c; } + case ARRAY_CONTAINER_TYPE: + return c; // nothing to do + case RUN_CONTAINER_TYPE: + return convert_run_to_efficient_container_and_free( + CAST_run(c), type); + case SHARED_CONTAINER_TYPE: + assert(false); + } + assert(false); + roaring_unreachable; + return 0; // unreached +} + +/** + * Writes the underlying array to buf, outputs how many bytes were written. 
+ * This is meant to be byte-by-byte compatible with the Java and Go versions of + * Roaring. + * The number of bytes written should be + * container_write(container, buf). + * + */ +static inline int32_t container_write( + const container_t *c, uint8_t typecode, + char *buf +){ + c = container_unwrap_shared(c, &typecode); + switch (typecode) { + case BITSET_CONTAINER_TYPE: + return bitset_container_write(const_CAST_bitset(c), buf); + case ARRAY_CONTAINER_TYPE: + return array_container_write(const_CAST_array(c), buf); + case RUN_CONTAINER_TYPE: + return run_container_write(const_CAST_run(c), buf); + } + assert(false); + roaring_unreachable; + return 0; // unreached +} + +/** + * Get the container size in bytes under portable serialization (see + * container_write), requires a + * typecode + */ +static inline int32_t container_size_in_bytes( + const container_t *c, uint8_t typecode +){ + c = container_unwrap_shared(c, &typecode); + switch (typecode) { + case BITSET_CONTAINER_TYPE: + return bitset_container_size_in_bytes(const_CAST_bitset(c)); + case ARRAY_CONTAINER_TYPE: + return array_container_size_in_bytes(const_CAST_array(c)); + case RUN_CONTAINER_TYPE: + return run_container_size_in_bytes(const_CAST_run(c)); + } + assert(false); + roaring_unreachable; + return 0; // unreached +} + +/** + * print the container (useful for debugging), requires a typecode + */ +void container_printf(const container_t *container, uint8_t typecode); + +/** + * print the content of the container as a comma-separated list of 32-bit values + * starting at base, requires a typecode + */ +void container_printf_as_uint32_array(const container_t *container, + uint8_t typecode, uint32_t base); + +/** + * Checks whether a container is not empty, requires a typecode + */ +static inline bool container_nonzero_cardinality( + const container_t *c, uint8_t typecode +){ + c = container_unwrap_shared(c, &typecode); + switch (typecode) { + case BITSET_CONTAINER_TYPE: + return 
bitset_container_const_nonzero_cardinality( + const_CAST_bitset(c)); + case ARRAY_CONTAINER_TYPE: + return array_container_nonzero_cardinality(const_CAST_array(c)); + case RUN_CONTAINER_TYPE: + return run_container_nonzero_cardinality(const_CAST_run(c)); + } + assert(false); + roaring_unreachable; + return 0; // unreached +} + +/** + * Recover memory from a container, requires a typecode + */ +void container_free(container_t *container, uint8_t typecode); + +/** + * Convert a container to an array of values, requires a typecode as well as a + * "base" (most significant values) + * Returns number of ints added. + */ +static inline int container_to_uint32_array( + uint32_t *output, + const container_t *c, uint8_t typecode, + uint32_t base +){ + c = container_unwrap_shared(c, &typecode); + switch (typecode) { + case BITSET_CONTAINER_TYPE: + return bitset_container_to_uint32_array( + output, const_CAST_bitset(c), base); + case ARRAY_CONTAINER_TYPE: + return array_container_to_uint32_array( + output, const_CAST_array(c), base); + case RUN_CONTAINER_TYPE: + return run_container_to_uint32_array( + output, const_CAST_run(c), base); + } + assert(false); + roaring_unreachable; + return 0; // unreached +} + +/** + * Add a value to a container, requires a typecode, fills in new_typecode and + * return (possibly different) container. + * This function may allocate a new container, and caller is responsible for + * memory deallocation + */ +static inline container_t *container_add( + container_t *c, uint16_t val, + uint8_t typecode, // !!! should be second argument? 
+ uint8_t *new_typecode +){ + c = get_writable_copy_if_shared(c, &typecode); + switch (typecode) { + case BITSET_CONTAINER_TYPE: + bitset_container_set(CAST_bitset(c), val); + *new_typecode = BITSET_CONTAINER_TYPE; + return c; + case ARRAY_CONTAINER_TYPE: { + array_container_t *ac = CAST_array(c); + if (array_container_try_add(ac, val, DEFAULT_MAX_SIZE) != -1) { + *new_typecode = ARRAY_CONTAINER_TYPE; + return ac; + } else { + bitset_container_t* bitset = bitset_container_from_array(ac); + bitset_container_add(bitset, val); + *new_typecode = BITSET_CONTAINER_TYPE; + return bitset; + } + } break; + case RUN_CONTAINER_TYPE: + // per Java, no container type adjustments are done (revisit?) + run_container_add(CAST_run(c), val); + *new_typecode = RUN_CONTAINER_TYPE; + return c; + default: + assert(false); + roaring_unreachable; + return NULL; + } +} + +/** + * Remove a value from a container, requires a typecode, fills in new_typecode + * and + * return (possibly different) container. + * This function may allocate a new container, and caller is responsible for + * memory deallocation + */ +static inline container_t *container_remove( + container_t *c, uint16_t val, + uint8_t typecode, // !!! should be second argument? + uint8_t *new_typecode +){ + c = get_writable_copy_if_shared(c, &typecode); + switch (typecode) { + case BITSET_CONTAINER_TYPE: + if (bitset_container_remove(CAST_bitset(c), val)) { + int card = bitset_container_cardinality(CAST_bitset(c)); + if (card <= DEFAULT_MAX_SIZE) { + *new_typecode = ARRAY_CONTAINER_TYPE; + return array_container_from_bitset(CAST_bitset(c)); + } + } + *new_typecode = typecode; + return c; + case ARRAY_CONTAINER_TYPE: + *new_typecode = typecode; + array_container_remove(CAST_array(c), val); + return c; + case RUN_CONTAINER_TYPE: + // per Java, no container type adjustments are done (revisit?) 
+ run_container_remove(CAST_run(c), val); + *new_typecode = RUN_CONTAINER_TYPE; + return c; + default: + assert(false); + roaring_unreachable; + return NULL; + } +} + +/** + * Check whether a value is in a container, requires a typecode + */ +static inline bool container_contains( + const container_t *c, + uint16_t val, + uint8_t typecode // !!! should be second argument? +){ + c = container_unwrap_shared(c, &typecode); + switch (typecode) { + case BITSET_CONTAINER_TYPE: + return bitset_container_get(const_CAST_bitset(c), val); + case ARRAY_CONTAINER_TYPE: + return array_container_contains(const_CAST_array(c), val); + case RUN_CONTAINER_TYPE: + return run_container_contains(const_CAST_run(c), val); + default: + assert(false); + roaring_unreachable; + return false; + } +} + +/** + * Check whether a range of values from range_start (included) to range_end (excluded) + * is in a container, requires a typecode + */ +static inline bool container_contains_range( + const container_t *c, + uint32_t range_start, uint32_t range_end, + uint8_t typecode // !!! should be second argument? +){ + c = container_unwrap_shared(c, &typecode); + switch (typecode) { + case BITSET_CONTAINER_TYPE: + return bitset_container_get_range(const_CAST_bitset(c), + range_start, range_end); + case ARRAY_CONTAINER_TYPE: + return array_container_contains_range(const_CAST_array(c), + range_start, range_end); + case RUN_CONTAINER_TYPE: + return run_container_contains_range(const_CAST_run(c), + range_start, range_end); + default: + assert(false); + roaring_unreachable; + return false; + } +} + +/** + * Returns true if the two containers have the same content. Note that + * two containers having different types can be "equal" in this sense. 
+ */ +static inline bool container_equals( + const container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2 +){ + c1 = container_unwrap_shared(c1, &type1); + c2 = container_unwrap_shared(c2, &type2); + switch (PAIR_CONTAINER_TYPES(type1, type2)) { + case CONTAINER_PAIR(BITSET,BITSET): + return bitset_container_equals(const_CAST_bitset(c1), + const_CAST_bitset(c2)); + + case CONTAINER_PAIR(BITSET,RUN): + return run_container_equals_bitset(const_CAST_run(c2), + const_CAST_bitset(c1)); + + case CONTAINER_PAIR(RUN,BITSET): + return run_container_equals_bitset(const_CAST_run(c1), + const_CAST_bitset(c2)); + + case CONTAINER_PAIR(BITSET,ARRAY): + // java would always return false? + return array_container_equal_bitset(const_CAST_array(c2), + const_CAST_bitset(c1)); + + case CONTAINER_PAIR(ARRAY,BITSET): + // java would always return false? + return array_container_equal_bitset(const_CAST_array(c1), + const_CAST_bitset(c2)); + + case CONTAINER_PAIR(ARRAY,RUN): + return run_container_equals_array(const_CAST_run(c2), + const_CAST_array(c1)); + + case CONTAINER_PAIR(RUN,ARRAY): + return run_container_equals_array(const_CAST_run(c1), + const_CAST_array(c2)); + + case CONTAINER_PAIR(ARRAY,ARRAY): + return array_container_equals(const_CAST_array(c1), + const_CAST_array(c2)); + + case CONTAINER_PAIR(RUN,RUN): + return run_container_equals(const_CAST_run(c1), + const_CAST_run(c2)); + + default: + assert(false); + roaring_unreachable; + return false; + } +} + +/** + * Returns true if the container c1 is a subset of the container c2. Note that + * c1 can be a subset of c2 even if they have a different type. 
+ */ +static inline bool container_is_subset( + const container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2 +){ + /* both operands go through container_unwrap_shared(); the operand order is kept in every case because subset is not symmetric */ + c1 = container_unwrap_shared(c1, &type1); + c2 = container_unwrap_shared(c2, &type2); + switch (PAIR_CONTAINER_TYPES(type1, type2)) { + case CONTAINER_PAIR(BITSET,BITSET): + return bitset_container_is_subset(const_CAST_bitset(c1), + const_CAST_bitset(c2)); + + case CONTAINER_PAIR(BITSET,RUN): + return bitset_container_is_subset_run(const_CAST_bitset(c1), + const_CAST_run(c2)); + + case CONTAINER_PAIR(RUN,BITSET): + return run_container_is_subset_bitset(const_CAST_run(c1), + const_CAST_bitset(c2)); + + case CONTAINER_PAIR(BITSET,ARRAY): + return false; // by construction, size(c1) > size(c2) + + case CONTAINER_PAIR(ARRAY,BITSET): + return array_container_is_subset_bitset(const_CAST_array(c1), + const_CAST_bitset(c2)); + + case CONTAINER_PAIR(ARRAY,RUN): + return array_container_is_subset_run(const_CAST_array(c1), + const_CAST_run(c2)); + + case CONTAINER_PAIR(RUN,ARRAY): + return run_container_is_subset_array(const_CAST_run(c1), + const_CAST_array(c2)); + + case CONTAINER_PAIR(ARRAY,ARRAY): + return array_container_is_subset(const_CAST_array(c1), + const_CAST_array(c2)); + + case CONTAINER_PAIR(RUN,RUN): + return run_container_is_subset(const_CAST_run(c1), + const_CAST_run(c2)); + + default: + /* unknown type pair: trip the assert, otherwise report "not a subset" */ + assert(false); + roaring_unreachable; + return false; + } +} + +// macro-ization possibilities for generic non-inplace binary-op dispatch + +/** + * Compute intersection between two containers, generate a new container (having + * type result_type), requires a typecode. This allocates new memory, caller + * is responsible for deallocation. 
+ */ +static inline container_t *container_and( + const container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2, + uint8_t *result_type +){ + c1 = container_unwrap_shared(c1, &type1); + c2 = container_unwrap_shared(c2, &type2); + container_t *result = NULL; + /* each case below returns a container and sets *result_type, either directly or through the helper it calls */ + switch (PAIR_CONTAINER_TYPES(type1, type2)) { + case CONTAINER_PAIR(BITSET,BITSET): + *result_type = bitset_bitset_container_intersection( + const_CAST_bitset(c1), + const_CAST_bitset(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,ARRAY): + result = array_container_create(); + array_container_intersection(const_CAST_array(c1), + const_CAST_array(c2), + CAST_array(result)); + *result_type = ARRAY_CONTAINER_TYPE; // never bitset + return result; + + case CONTAINER_PAIR(RUN,RUN): + result = run_container_create(); + run_container_intersection(const_CAST_run(c1), + const_CAST_run(c2), + CAST_run(result)); + return convert_run_to_efficient_container_and_free( + CAST_run(result), result_type); + + case CONTAINER_PAIR(BITSET,ARRAY): + result = array_container_create(); + array_bitset_container_intersection(const_CAST_array(c2), + const_CAST_bitset(c1), + CAST_array(result)); + *result_type = ARRAY_CONTAINER_TYPE; // never bitset + return result; + + case CONTAINER_PAIR(ARRAY,BITSET): + result = array_container_create(); + *result_type = ARRAY_CONTAINER_TYPE; // never bitset + array_bitset_container_intersection(const_CAST_array(c1), + const_CAST_bitset(c2), + CAST_array(result)); + return result; + + case CONTAINER_PAIR(BITSET,RUN): + *result_type = run_bitset_container_intersection( + const_CAST_run(c2), + const_CAST_bitset(c1), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(RUN,BITSET): + *result_type = run_bitset_container_intersection( + const_CAST_run(c1), + const_CAST_bitset(c2), &result) + ? 
BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,RUN): + result = array_container_create(); + *result_type = ARRAY_CONTAINER_TYPE; // never bitset + array_run_container_intersection(const_CAST_array(c1), + const_CAST_run(c2), + CAST_array(result)); + return result; + + case CONTAINER_PAIR(RUN,ARRAY): + result = array_container_create(); + *result_type = ARRAY_CONTAINER_TYPE; // never bitset + array_run_container_intersection(const_CAST_array(c2), + const_CAST_run(c1), + CAST_array(result)); + return result; + + default: + assert(false); + roaring_unreachable; + return NULL; + } +} + +/** + * Compute the size of the intersection between two containers. + * No container is created here: after both operands go through + * container_unwrap_shared(), each case forwards to a helper that returns + * the count. An unknown type pair trips the assert and otherwise yields 0. + */ +static inline int container_and_cardinality( + const container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2 +){ + c1 = container_unwrap_shared(c1, &type1); + c2 = container_unwrap_shared(c2, &type2); + switch (PAIR_CONTAINER_TYPES(type1, type2)) { + case CONTAINER_PAIR(BITSET,BITSET): + return bitset_container_and_justcard( + const_CAST_bitset(c1), const_CAST_bitset(c2)); + + case CONTAINER_PAIR(ARRAY,ARRAY): + return array_container_intersection_cardinality( + const_CAST_array(c1), const_CAST_array(c2)); + + case CONTAINER_PAIR(RUN,RUN): + return run_container_intersection_cardinality( + const_CAST_run(c1), const_CAST_run(c2)); + + case CONTAINER_PAIR(BITSET,ARRAY): + return array_bitset_container_intersection_cardinality( + const_CAST_array(c2), const_CAST_bitset(c1)); + + case CONTAINER_PAIR(ARRAY,BITSET): + return array_bitset_container_intersection_cardinality( + const_CAST_array(c1), const_CAST_bitset(c2)); + + case CONTAINER_PAIR(BITSET,RUN): + return run_bitset_container_intersection_cardinality( + const_CAST_run(c2), const_CAST_bitset(c1)); + + case CONTAINER_PAIR(RUN,BITSET): + return run_bitset_container_intersection_cardinality( + const_CAST_run(c1), const_CAST_bitset(c2)); + + case CONTAINER_PAIR(ARRAY,RUN): + return 
array_run_container_intersection_cardinality( + const_CAST_array(c1), const_CAST_run(c2)); + + case CONTAINER_PAIR(RUN,ARRAY): + return array_run_container_intersection_cardinality( + const_CAST_array(c2), const_CAST_run(c1)); + + default: + assert(false); + roaring_unreachable; + return 0; + } +} + +/** + * Check whether two containers intersect. + * Both operands go through container_unwrap_shared() and the (type1, type2) + * pair is dispatched to the matching helper. An unknown type pair trips the + * assert and otherwise returns 0, i.e. false. + */ +static inline bool container_intersect( + const container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2 +){ + c1 = container_unwrap_shared(c1, &type1); + c2 = container_unwrap_shared(c2, &type2); + switch (PAIR_CONTAINER_TYPES(type1, type2)) { + case CONTAINER_PAIR(BITSET,BITSET): + return bitset_container_intersect(const_CAST_bitset(c1), + const_CAST_bitset(c2)); + + case CONTAINER_PAIR(ARRAY,ARRAY): + return array_container_intersect(const_CAST_array(c1), + const_CAST_array(c2)); + + case CONTAINER_PAIR(RUN,RUN): + return run_container_intersect(const_CAST_run(c1), + const_CAST_run(c2)); + + case CONTAINER_PAIR(BITSET,ARRAY): + return array_bitset_container_intersect(const_CAST_array(c2), + const_CAST_bitset(c1)); + + case CONTAINER_PAIR(ARRAY,BITSET): + return array_bitset_container_intersect(const_CAST_array(c1), + const_CAST_bitset(c2)); + + case CONTAINER_PAIR(BITSET,RUN): + return run_bitset_container_intersect(const_CAST_run(c2), + const_CAST_bitset(c1)); + + case CONTAINER_PAIR(RUN,BITSET): + return run_bitset_container_intersect(const_CAST_run(c1), + const_CAST_bitset(c2)); + + case CONTAINER_PAIR(ARRAY,RUN): + return array_run_container_intersect(const_CAST_array(c1), + const_CAST_run(c2)); + + case CONTAINER_PAIR(RUN,ARRAY): + return array_run_container_intersect(const_CAST_array(c2), + const_CAST_run(c1)); + + default: + assert(false); + roaring_unreachable; + return 0; + } +} + +/** + * Compute intersection between two containers, with result in the first + container if possible. If the returned pointer is identical to c1, + then the container has been modified. 
If the returned pointer is different + from c1, then a new container has been created and the caller is responsible + for freeing it. + The type of the first container may change. Returns the modified + (and possibly new) container. +*/ +static inline container_t *container_iand( + container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2, + uint8_t *result_type +){ + /* c1 goes through get_writable_copy_if_shared() and c2 through container_unwrap_shared() before dispatch */ + c1 = get_writable_copy_if_shared(c1, &type1); + c2 = container_unwrap_shared(c2, &type2); + container_t *result = NULL; + switch (PAIR_CONTAINER_TYPES(type1, type2)) { + case CONTAINER_PAIR(BITSET,BITSET): + *result_type = + bitset_bitset_container_intersection_inplace( + CAST_bitset(c1), const_CAST_bitset(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,ARRAY): + array_container_intersection_inplace(CAST_array(c1), + const_CAST_array(c2)); + *result_type = ARRAY_CONTAINER_TYPE; + return c1; + + case CONTAINER_PAIR(RUN,RUN): + result = run_container_create(); + run_container_intersection(const_CAST_run(c1), + const_CAST_run(c2), + CAST_run(result)); + // as of January 2016, Java code used non-in-place intersection for + // two runcontainers + return convert_run_to_efficient_container_and_free( + CAST_run(result), result_type); + + case CONTAINER_PAIR(BITSET,ARRAY): + // c1 is a bitmap so no inplace possible + result = array_container_create(); + array_bitset_container_intersection(const_CAST_array(c2), + const_CAST_bitset(c1), + CAST_array(result)); + *result_type = ARRAY_CONTAINER_TYPE; // never bitset + return result; + + case CONTAINER_PAIR(ARRAY,BITSET): + *result_type = ARRAY_CONTAINER_TYPE; // never bitset + array_bitset_container_intersection( + const_CAST_array(c1), const_CAST_bitset(c2), + CAST_array(c1)); // result is allowed to be same as c1 + return c1; + + case CONTAINER_PAIR(BITSET,RUN): + // will attempt in-place computation + *result_type = run_bitset_container_intersection( + const_CAST_run(c2), + 
const_CAST_bitset(c1), &c1) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return c1; + + case CONTAINER_PAIR(RUN,BITSET): + *result_type = run_bitset_container_intersection( + const_CAST_run(c1), + const_CAST_bitset(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,RUN): + result = array_container_create(); + *result_type = ARRAY_CONTAINER_TYPE; // never bitset + array_run_container_intersection(const_CAST_array(c1), + const_CAST_run(c2), + CAST_array(result)); + return result; + + case CONTAINER_PAIR(RUN,ARRAY): + result = array_container_create(); + *result_type = ARRAY_CONTAINER_TYPE; // never bitset + array_run_container_intersection(const_CAST_array(c2), + const_CAST_run(c1), + CAST_array(result)); + return result; + + default: + assert(false); + roaring_unreachable; + return NULL; + } +} + +/** + * Compute union between two containers, generate a new container (having type + * result_type), requires a typecode. This allocates new memory, caller + * is responsible for deallocation. + * Both operands go through container_unwrap_shared() before the + * (type1, type2) pair is dispatched. + */ +static inline container_t *container_or( + const container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2, + uint8_t *result_type +){ + c1 = container_unwrap_shared(c1, &type1); + c2 = container_unwrap_shared(c2, &type2); + container_t *result = NULL; + switch (PAIR_CONTAINER_TYPES(type1, type2)) { + case CONTAINER_PAIR(BITSET,BITSET): + result = bitset_container_create(); + bitset_container_or(const_CAST_bitset(c1), + const_CAST_bitset(c2), + CAST_bitset(result)); + *result_type = BITSET_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,ARRAY): + *result_type = array_array_container_union( + const_CAST_array(c1), + const_CAST_array(c2), &result) + ? 
BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(RUN,RUN): + result = run_container_create(); + run_container_union(const_CAST_run(c1), + const_CAST_run(c2), + CAST_run(result)); + *result_type = RUN_CONTAINER_TYPE; + // todo: could be optimized since will never convert to array + result = convert_run_to_efficient_container_and_free( + CAST_run(result), result_type); + return result; + + case CONTAINER_PAIR(BITSET,ARRAY): + result = bitset_container_create(); + array_bitset_container_union(const_CAST_array(c2), + const_CAST_bitset(c1), + CAST_bitset(result)); + *result_type = BITSET_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,BITSET): + result = bitset_container_create(); + array_bitset_container_union(const_CAST_array(c1), + const_CAST_bitset(c2), + CAST_bitset(result)); + *result_type = BITSET_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(BITSET,RUN): + /* a full run operand short-circuits: the result is a copy of that run */ + if (run_container_is_full(const_CAST_run(c2))) { + result = run_container_create(); + *result_type = RUN_CONTAINER_TYPE; + run_container_copy(const_CAST_run(c2), + CAST_run(result)); + return result; + } + result = bitset_container_create(); + run_bitset_container_union(const_CAST_run(c2), + const_CAST_bitset(c1), + CAST_bitset(result)); + *result_type = BITSET_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(RUN,BITSET): + if (run_container_is_full(const_CAST_run(c1))) { + result = run_container_create(); + *result_type = RUN_CONTAINER_TYPE; + run_container_copy(const_CAST_run(c1), + CAST_run(result)); + return result; + } + result = bitset_container_create(); + run_bitset_container_union(const_CAST_run(c1), + const_CAST_bitset(c2), + CAST_bitset(result)); + *result_type = BITSET_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,RUN): + result = run_container_create(); + array_run_container_union(const_CAST_array(c1), + const_CAST_run(c2), + CAST_run(result)); + result = convert_run_to_efficient_container_and_free( + 
CAST_run(result), result_type); + return result; + + case CONTAINER_PAIR(RUN,ARRAY): + result = run_container_create(); + array_run_container_union(const_CAST_array(c2), + const_CAST_run(c1), + CAST_run(result)); + result = convert_run_to_efficient_container_and_free( + CAST_run(result), result_type); + return result; + + default: + assert(false); + roaring_unreachable; + return NULL; // unreached + } +} + +/** + * Compute union between two containers, generate a new container (having type + * result_type), requires a typecode. This allocates new memory, caller + * is responsible for deallocation. + * + * This lazy version delays some operations such as the maintenance of the + * cardinality. It requires repair later on the generated containers. + */ +static inline container_t *container_lazy_or( + const container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2, + uint8_t *result_type +){ + c1 = container_unwrap_shared(c1, &type1); + c2 = container_unwrap_shared(c2, &type2); + container_t *result = NULL; + switch (PAIR_CONTAINER_TYPES(type1, type2)) { + case CONTAINER_PAIR(BITSET,BITSET): + result = bitset_container_create(); + bitset_container_or_nocard( + const_CAST_bitset(c1), const_CAST_bitset(c2), + CAST_bitset(result)); // is lazy + *result_type = BITSET_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,ARRAY): + *result_type = array_array_container_lazy_union( + const_CAST_array(c1), + const_CAST_array(c2), &result) + ? 
BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(RUN,RUN): + result = run_container_create(); + run_container_union(const_CAST_run(c1), + const_CAST_run(c2), + CAST_run(result)); + *result_type = RUN_CONTAINER_TYPE; + // we are being lazy (NOTE(review): the conversion helper is still called on the next line; confirm this is intended) + result = convert_run_to_efficient_container_and_free( + CAST_run(result), result_type); + return result; + + case CONTAINER_PAIR(BITSET,ARRAY): + result = bitset_container_create(); + array_bitset_container_lazy_union( + const_CAST_array(c2), const_CAST_bitset(c1), + CAST_bitset(result)); // is lazy + *result_type = BITSET_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,BITSET): + result = bitset_container_create(); + array_bitset_container_lazy_union( + const_CAST_array(c1), const_CAST_bitset(c2), + CAST_bitset(result)); // is lazy + *result_type = BITSET_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(BITSET,RUN): + if (run_container_is_full(const_CAST_run(c2))) { + result = run_container_create(); + *result_type = RUN_CONTAINER_TYPE; + run_container_copy(const_CAST_run(c2), CAST_run(result)); + return result; + } + result = bitset_container_create(); + run_bitset_container_lazy_union( + const_CAST_run(c2), const_CAST_bitset(c1), + CAST_bitset(result)); // is lazy + *result_type = BITSET_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(RUN,BITSET): + if (run_container_is_full(const_CAST_run(c1))) { + result = run_container_create(); + *result_type = RUN_CONTAINER_TYPE; + run_container_copy(const_CAST_run(c1), CAST_run(result)); + return result; + } + result = bitset_container_create(); + run_bitset_container_lazy_union( + const_CAST_run(c1), const_CAST_bitset(c2), + CAST_bitset(result)); // is lazy + *result_type = BITSET_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,RUN): + result = run_container_create(); + array_run_container_union(const_CAST_array(c1), + const_CAST_run(c2), + CAST_run(result)); + *result_type = RUN_CONTAINER_TYPE; + // 
next line skipped since we are lazy + // result = convert_run_to_efficient_container(result, result_type); + return result; + + case CONTAINER_PAIR(RUN,ARRAY): + result = run_container_create(); + array_run_container_union( + const_CAST_array(c2), const_CAST_run(c1), + CAST_run(result)); // TODO make lazy + *result_type = RUN_CONTAINER_TYPE; + // next line skipped since we are lazy + // result = convert_run_to_efficient_container(result, result_type); + return result; + + default: + assert(false); + roaring_unreachable; + return NULL; // unreached + } +} + +/** + * Compute the union between two containers, with result in the first container. + * If the returned pointer is identical to c1, then the container has been + * modified. + * If the returned pointer is different from c1, then a new container has been + * created and the caller is responsible for freeing it. + * The type of the first container may change. Returns the modified + * (and possibly new) container. + * c1 goes through get_writable_copy_if_shared() and c2 through + * container_unwrap_shared() before the (type1, type2) pair is dispatched. +*/ +static inline container_t *container_ior( + container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2, + uint8_t *result_type +){ + c1 = get_writable_copy_if_shared(c1, &type1); + c2 = container_unwrap_shared(c2, &type2); + container_t *result = NULL; + switch (PAIR_CONTAINER_TYPES(type1, type2)) { + case CONTAINER_PAIR(BITSET,BITSET): + bitset_container_or(const_CAST_bitset(c1), + const_CAST_bitset(c2), + CAST_bitset(c1)); +#ifdef OR_BITSET_CONVERSION_TO_FULL + if (CAST_bitset(c1)->cardinality == (1 << 16)) { // we convert + result = run_container_create_range(0, (1 << 16)); + *result_type = RUN_CONTAINER_TYPE; + return result; + } +#endif + *result_type = BITSET_CONTAINER_TYPE; + return c1; + + case CONTAINER_PAIR(ARRAY,ARRAY): + *result_type = array_array_container_inplace_union( + CAST_array(c1), const_CAST_array(c2), &result) + ? 
BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + if((result == NULL) + && (*result_type == ARRAY_CONTAINER_TYPE)) { + return c1; // the computation was done in-place! + } + return result; + + case CONTAINER_PAIR(RUN,RUN): + run_container_union_inplace(CAST_run(c1), const_CAST_run(c2)); + return convert_run_to_efficient_container(CAST_run(c1), + result_type); + + case CONTAINER_PAIR(BITSET,ARRAY): + array_bitset_container_union(const_CAST_array(c2), + const_CAST_bitset(c1), + CAST_bitset(c1)); + *result_type = BITSET_CONTAINER_TYPE; // never array + return c1; + + case CONTAINER_PAIR(ARRAY,BITSET): + // c1 is an array, so no in-place possible + result = bitset_container_create(); + *result_type = BITSET_CONTAINER_TYPE; + array_bitset_container_union(const_CAST_array(c1), + const_CAST_bitset(c2), + CAST_bitset(result)); + return result; + + case CONTAINER_PAIR(BITSET,RUN): + if (run_container_is_full(const_CAST_run(c2))) { + result = run_container_create(); + *result_type = RUN_CONTAINER_TYPE; + run_container_copy(const_CAST_run(c2), CAST_run(result)); + return result; + } + run_bitset_container_union(const_CAST_run(c2), + const_CAST_bitset(c1), + CAST_bitset(c1)); // allowed + *result_type = BITSET_CONTAINER_TYPE; + return c1; + + case CONTAINER_PAIR(RUN,BITSET): + if (run_container_is_full(const_CAST_run(c1))) { + *result_type = RUN_CONTAINER_TYPE; + return c1; + } + result = bitset_container_create(); + run_bitset_container_union(const_CAST_run(c1), + const_CAST_bitset(c2), + CAST_bitset(result)); + *result_type = BITSET_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,RUN): + result = run_container_create(); + array_run_container_union(const_CAST_array(c1), + const_CAST_run(c2), + CAST_run(result)); + result = convert_run_to_efficient_container_and_free( + CAST_run(result), result_type); + return result; + + case CONTAINER_PAIR(RUN,ARRAY): + array_run_container_inplace_union(const_CAST_array(c2), + CAST_run(c1)); + c1 = 
convert_run_to_efficient_container(CAST_run(c1), + result_type); + return c1; + + default: + assert(false); + roaring_unreachable; + return NULL; + } +} + +/** + * Compute the union between two containers, with result in the first container. + * If the returned pointer is identical to c1, then the container has been + * modified. + * If the returned pointer is different from c1, then a new container has been + * created and the caller is responsible for freeing it. + * The type of the first container may change. Returns the modified + * (and possibly new) container + * + * This lazy version delays some operations such as the maintenance of the + * cardinality. It requires repair later on the generated containers. + * + * Unlike container_ior, c1 is not passed through + * get_writable_copy_if_shared() here (that call is commented out); instead + * the function asserts that type1 is not SHARED_CONTAINER_TYPE. +*/ +static inline container_t *container_lazy_ior( + container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2, + uint8_t *result_type +){ + assert(type1 != SHARED_CONTAINER_TYPE); + // c1 = get_writable_copy_if_shared(c1,&type1); + c2 = container_unwrap_shared(c2, &type2); + container_t *result = NULL; + switch (PAIR_CONTAINER_TYPES(type1, type2)) { + case CONTAINER_PAIR(BITSET,BITSET): +#ifdef LAZY_OR_BITSET_CONVERSION_TO_FULL + // if we have two bitsets, we might as well compute the cardinality + bitset_container_or(const_CAST_bitset(c1), + const_CAST_bitset(c2), + CAST_bitset(c1)); + // it is possible that two bitsets can lead to a full container + if (CAST_bitset(c1)->cardinality == (1 << 16)) { // we convert + result = run_container_create_range(0, (1 << 16)); + *result_type = RUN_CONTAINER_TYPE; + return result; + } +#else + bitset_container_or_nocard(const_CAST_bitset(c1), + const_CAST_bitset(c2), + CAST_bitset(c1)); + +#endif + *result_type = BITSET_CONTAINER_TYPE; + return c1; + + case CONTAINER_PAIR(ARRAY,ARRAY): + *result_type = array_array_container_lazy_inplace_union( + CAST_array(c1), + const_CAST_array(c2), &result) + ? 
BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + if((result == NULL) + && (*result_type == ARRAY_CONTAINER_TYPE)) { + return c1; // the computation was done in-place! + } + return result; + + case CONTAINER_PAIR(RUN,RUN): + run_container_union_inplace(CAST_run(c1), + const_CAST_run(c2)); + *result_type = RUN_CONTAINER_TYPE; + return convert_run_to_efficient_container(CAST_run(c1), + result_type); + + case CONTAINER_PAIR(BITSET,ARRAY): + array_bitset_container_lazy_union( + const_CAST_array(c2), const_CAST_bitset(c1), + CAST_bitset(c1)); // is lazy + *result_type = BITSET_CONTAINER_TYPE; // never array + return c1; + + case CONTAINER_PAIR(ARRAY,BITSET): + // c1 is an array, so no in-place possible + result = bitset_container_create(); + *result_type = BITSET_CONTAINER_TYPE; + array_bitset_container_lazy_union( + const_CAST_array(c1), const_CAST_bitset(c2), + CAST_bitset(result)); // is lazy + return result; + + case CONTAINER_PAIR(BITSET,RUN): + if (run_container_is_full(const_CAST_run(c2))) { + result = run_container_create(); + *result_type = RUN_CONTAINER_TYPE; + run_container_copy(const_CAST_run(c2), + CAST_run(result)); + return result; + } + run_bitset_container_lazy_union( + const_CAST_run(c2), const_CAST_bitset(c1), + CAST_bitset(c1)); // allowed // lazy + *result_type = BITSET_CONTAINER_TYPE; + return c1; + + case CONTAINER_PAIR(RUN,BITSET): + if (run_container_is_full(const_CAST_run(c1))) { + *result_type = RUN_CONTAINER_TYPE; + return c1; + } + result = bitset_container_create(); + run_bitset_container_lazy_union( + const_CAST_run(c1), const_CAST_bitset(c2), + CAST_bitset(result)); // lazy + *result_type = BITSET_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,RUN): + result = run_container_create(); + array_run_container_union(const_CAST_array(c1), + const_CAST_run(c2), + CAST_run(result)); + *result_type = RUN_CONTAINER_TYPE; + // next line skipped since we are lazy + // result = convert_run_to_efficient_container_and_free(result, + // 
result_type); + return result; + + case CONTAINER_PAIR(RUN,ARRAY): + array_run_container_inplace_union(const_CAST_array(c2), + CAST_run(c1)); + *result_type = RUN_CONTAINER_TYPE; + // next line skipped since we are lazy + // result = convert_run_to_efficient_container_and_free(result, + // result_type); + return c1; + + default: + assert(false); + roaring_unreachable; + return NULL; + } +} + +/** + * Compute symmetric difference (xor) between two containers, generate a new + * container (having type result_type), requires a typecode. This allocates new + * memory, caller is responsible for deallocation. + * In the RUN,RUN, ARRAY,RUN and RUN,ARRAY cases the helper's return value is + * stored in *result_type as is; in the other cases a true/false return is + * mapped to BITSET_CONTAINER_TYPE / ARRAY_CONTAINER_TYPE. + */ +static inline container_t* container_xor( + const container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2, + uint8_t *result_type +){ + c1 = container_unwrap_shared(c1, &type1); + c2 = container_unwrap_shared(c2, &type2); + container_t *result = NULL; + switch (PAIR_CONTAINER_TYPES(type1, type2)) { + case CONTAINER_PAIR(BITSET,BITSET): + *result_type = bitset_bitset_container_xor( + const_CAST_bitset(c1), + const_CAST_bitset(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,ARRAY): + *result_type = array_array_container_xor( + const_CAST_array(c1), + const_CAST_array(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(RUN,RUN): + *result_type = + run_run_container_xor(const_CAST_run(c1), + const_CAST_run(c2), &result); + return result; + + case CONTAINER_PAIR(BITSET,ARRAY): + *result_type = array_bitset_container_xor( + const_CAST_array(c2), + const_CAST_bitset(c1), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,BITSET): + *result_type = array_bitset_container_xor( + const_CAST_array(c1), + const_CAST_bitset(c2), &result) + ? 
BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(BITSET,RUN): + *result_type = run_bitset_container_xor( + const_CAST_run(c2), + const_CAST_bitset(c1), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(RUN,BITSET): + *result_type = run_bitset_container_xor( + const_CAST_run(c1), + const_CAST_bitset(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,RUN): + *result_type = + array_run_container_xor(const_CAST_array(c1), + const_CAST_run(c2), &result); + return result; + + case CONTAINER_PAIR(RUN,ARRAY): + *result_type = + array_run_container_xor(const_CAST_array(c2), + const_CAST_run(c1), &result); + return result; + + default: + assert(false); + roaring_unreachable; + return NULL; // unreached + } +} + +/* Applies an offset to the non-empty container 'c'. + * The results are stored in new containers returned via 'lo' and 'hi', for the + * low and high halves of the result (where the low half matches the original key + * and the high one corresponds to values for the following key). + * Either one of 'lo' and 'hi' is allowed to be 'NULL', but not both. + * Whenever one of them is not 'NULL', it should point to a 'NULL' container. + * Whenever one of them is 'NULL' the shifted elements for that part will not be + * computed. + * If either of the resulting containers turns out to be empty, the pointed + * container will remain 'NULL'. 
+ */ +static inline void container_add_offset(const container_t *c, uint8_t type, + container_t **lo, container_t **hi, + uint16_t offset) { + /* preconditions checked by the asserts: non-zero offset, non-empty container, at least one of lo/hi supplied, and every supplied slot initially NULL */ + assert(offset != 0); + assert(container_nonzero_cardinality(c, type)); + assert(lo != NULL || hi != NULL); + assert(lo == NULL || *lo == NULL); + assert(hi == NULL || *hi == NULL); + + /* NOTE(review): 'c' is not passed through container_unwrap_shared() here, so a shared typecode reaches the default branch; presumably callers unwrap first - confirm */ + switch (type) { + case BITSET_CONTAINER_TYPE: + bitset_container_offset(const_CAST_bitset(c), lo, hi, offset); + break; + case ARRAY_CONTAINER_TYPE: + array_container_offset(const_CAST_array(c), lo, hi, offset); + break; + case RUN_CONTAINER_TYPE: + run_container_offset(const_CAST_run(c), lo, hi, offset); + break; + default: + assert(false); + roaring_unreachable; + break; + } +} + +/** + * Compute xor between two containers, generate a new container (having type + * result_type), requires a typecode. This allocates new memory, caller + * is responsible for deallocation. + * + * This lazy version delays some operations such as the maintenance of the + * cardinality. It requires repair later on the generated containers. + */ +static inline container_t *container_lazy_xor( + const container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2, + uint8_t *result_type +){ + c1 = container_unwrap_shared(c1, &type1); + c2 = container_unwrap_shared(c2, &type2); + container_t *result = NULL; + switch (PAIR_CONTAINER_TYPES(type1, type2)) { + case CONTAINER_PAIR(BITSET,BITSET): + result = bitset_container_create(); + bitset_container_xor_nocard( + const_CAST_bitset(c1), const_CAST_bitset(c2), + CAST_bitset(result)); // is lazy + *result_type = BITSET_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,ARRAY): + *result_type = array_array_container_lazy_xor( + const_CAST_array(c1), + const_CAST_array(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(RUN,RUN): + // nothing special done yet. 
+ *result_type = + run_run_container_xor(const_CAST_run(c1), + const_CAST_run(c2), &result); + return result; + + case CONTAINER_PAIR(BITSET,ARRAY): + result = bitset_container_create(); + *result_type = BITSET_CONTAINER_TYPE; + array_bitset_container_lazy_xor(const_CAST_array(c2), + const_CAST_bitset(c1), + CAST_bitset(result)); + return result; + + case CONTAINER_PAIR(ARRAY,BITSET): + result = bitset_container_create(); + *result_type = BITSET_CONTAINER_TYPE; + array_bitset_container_lazy_xor(const_CAST_array(c1), + const_CAST_bitset(c2), + CAST_bitset(result)); + return result; + + case CONTAINER_PAIR(BITSET,RUN): + result = bitset_container_create(); + run_bitset_container_lazy_xor(const_CAST_run(c2), + const_CAST_bitset(c1), + CAST_bitset(result)); + *result_type = BITSET_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(RUN,BITSET): + result = bitset_container_create(); + run_bitset_container_lazy_xor(const_CAST_run(c1), + const_CAST_bitset(c2), + CAST_bitset(result)); + *result_type = BITSET_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,RUN): + result = run_container_create(); + array_run_container_lazy_xor(const_CAST_array(c1), + const_CAST_run(c2), + CAST_run(result)); + *result_type = RUN_CONTAINER_TYPE; + // next line skipped since we are lazy + // result = convert_run_to_efficient_container(result, result_type); + return result; + + case CONTAINER_PAIR(RUN,ARRAY): + result = run_container_create(); + array_run_container_lazy_xor(const_CAST_array(c2), + const_CAST_run(c1), + CAST_run(result)); + *result_type = RUN_CONTAINER_TYPE; + // next line skipped since we are lazy + // result = convert_run_to_efficient_container(result, result_type); + return result; + + default: + assert(false); + roaring_unreachable; + return NULL; // unreached + } +} + +/** + * Compute the xor between two containers, with result in the first container. + * If the returned pointer is identical to c1, then the container has been + * modified. 
+ * If the returned pointer is different from c1, then a new container has been + * created and the caller is responsible for freeing it. + * The type of the first container may change. Returns the modified + * (and possibly new) container +*/ +static inline container_t *container_ixor( + container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2, + uint8_t *result_type +){ + /* c1 goes through get_writable_copy_if_shared() and c2 through container_unwrap_shared(); every case returns the container its ixor helper stored in 'result' */ + c1 = get_writable_copy_if_shared(c1, &type1); + c2 = container_unwrap_shared(c2, &type2); + container_t *result = NULL; + switch (PAIR_CONTAINER_TYPES(type1, type2)) { + case CONTAINER_PAIR(BITSET,BITSET): + *result_type = bitset_bitset_container_ixor( + CAST_bitset(c1), const_CAST_bitset(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,ARRAY): + *result_type = array_array_container_ixor( + CAST_array(c1), const_CAST_array(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(RUN,RUN): + *result_type = run_run_container_ixor( + CAST_run(c1), const_CAST_run(c2), &result); + return result; + + case CONTAINER_PAIR(BITSET,ARRAY): + *result_type = bitset_array_container_ixor( + CAST_bitset(c1), const_CAST_array(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,BITSET): + *result_type = array_bitset_container_ixor( + CAST_array(c1), const_CAST_bitset(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(BITSET,RUN): + *result_type = + bitset_run_container_ixor( + CAST_bitset(c1), const_CAST_run(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + + return result; + + case CONTAINER_PAIR(RUN,BITSET): + *result_type = run_bitset_container_ixor( + CAST_run(c1), const_CAST_bitset(c2), &result) + ? 
BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,RUN): + *result_type = array_run_container_ixor( + CAST_array(c1), const_CAST_run(c2), &result); + return result; + + case CONTAINER_PAIR(RUN,ARRAY): + *result_type = run_array_container_ixor( + CAST_run(c1), const_CAST_array(c2), &result); + return result; + + default: + assert(false); + roaring_unreachable; + return NULL; + } +} + +/** + * Compute the xor between two containers, with result in the first container. + * If the returned pointer is identical to c1, then the container has been + * modified. + * If the returned pointer is different from c1, then a new container has been + * created and the caller is responsible for freeing it. + * The type of the first container may change. Returns the modified + * (and possibly new) container + * + * This lazy version delays some operations such as the maintenance of the + * cardinality. It requires repair later on the generated containers. + * + * Only the BITSET,BITSET pair is handled lazily here; every other pair is + * forwarded to container_ixor. c1 must not be a shared container (asserted). +*/ +static inline container_t *container_lazy_ixor( + container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2, + uint8_t *result_type +){ + assert(type1 != SHARED_CONTAINER_TYPE); + // c1 = get_writable_copy_if_shared(c1,&type1); + c2 = container_unwrap_shared(c2, &type2); + switch (PAIR_CONTAINER_TYPES(type1, type2)) { + case CONTAINER_PAIR(BITSET,BITSET): + bitset_container_xor_nocard(CAST_bitset(c1), + const_CAST_bitset(c2), + CAST_bitset(c1)); // is lazy + *result_type = BITSET_CONTAINER_TYPE; + return c1; + + // TODO: other cases being lazy, esp. when we know inplace not likely + // could see the corresponding code for union + default: + // we may have a dirty bitset (without a precomputed cardinality) + // and calling container_ixor on it might be unsafe. 
+ if (type1 == BITSET_CONTAINER_TYPE) { + bitset_container_t *bc = CAST_bitset(c1); + if (bc->cardinality == BITSET_UNKNOWN_CARDINALITY) { + bc->cardinality = bitset_container_compute_cardinality(bc); + } + } + return container_ixor(c1, type1, c2, type2, result_type); + } +} + +/** + * Compute difference (andnot) between two containers, generate a new + * container (having type result_type), requires a typecode. This allocates new + * memory, caller is responsible for deallocation. + */ +static inline container_t *container_andnot( + const container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2, + uint8_t *result_type +){ + c1 = container_unwrap_shared(c1, &type1); + c2 = container_unwrap_shared(c2, &type2); + container_t *result = NULL; + switch (PAIR_CONTAINER_TYPES(type1, type2)) { + case CONTAINER_PAIR(BITSET,BITSET): + *result_type = bitset_bitset_container_andnot( + const_CAST_bitset(c1), + const_CAST_bitset(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,ARRAY): + result = array_container_create(); + array_array_container_andnot(const_CAST_array(c1), + const_CAST_array(c2), + CAST_array(result)); + *result_type = ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(RUN,RUN): + if (run_container_is_full(const_CAST_run(c2))) { + result = array_container_create(); + *result_type = ARRAY_CONTAINER_TYPE; + return result; + } + *result_type = + run_run_container_andnot(const_CAST_run(c1), + const_CAST_run(c2), &result); + return result; + + case CONTAINER_PAIR(BITSET,ARRAY): + *result_type = bitset_array_container_andnot( + const_CAST_bitset(c1), + const_CAST_array(c2), &result) + ? 
BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,BITSET): + result = array_container_create(); + array_bitset_container_andnot(const_CAST_array(c1), + const_CAST_bitset(c2), + CAST_array(result)); + *result_type = ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(BITSET,RUN): + if (run_container_is_full(const_CAST_run(c2))) { + result = array_container_create(); + *result_type = ARRAY_CONTAINER_TYPE; + return result; + } + *result_type = bitset_run_container_andnot( + const_CAST_bitset(c1), + const_CAST_run(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(RUN,BITSET): + *result_type = run_bitset_container_andnot( + const_CAST_run(c1), + const_CAST_bitset(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,RUN): + if (run_container_is_full(const_CAST_run(c2))) { + result = array_container_create(); + *result_type = ARRAY_CONTAINER_TYPE; + return result; + } + result = array_container_create(); + array_run_container_andnot(const_CAST_array(c1), + const_CAST_run(c2), + CAST_array(result)); + *result_type = ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(RUN,ARRAY): + *result_type = run_array_container_andnot( + const_CAST_run(c1), const_CAST_array(c2), + &result); + return result; + + default: + assert(false); + roaring_unreachable; + return NULL; // unreached + } +} + +/** + * Compute the andnot between two containers, with result in the first + * container. + * If the returned pointer is identical to c1, then the container has been + * modified. + * If the returned pointer is different from c1, then a new container has been + * created and the caller is responsible for freeing it. + * The type of the first container may change. 
Returns the modified + * (and possibly new) container +*/ +static inline container_t *container_iandnot( + container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2, + uint8_t *result_type +){ + c1 = get_writable_copy_if_shared(c1, &type1); + c2 = container_unwrap_shared(c2, &type2); + container_t *result = NULL; + switch (PAIR_CONTAINER_TYPES(type1, type2)) { + case CONTAINER_PAIR(BITSET,BITSET): + *result_type = bitset_bitset_container_iandnot( + CAST_bitset(c1), + const_CAST_bitset(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,ARRAY): + array_array_container_iandnot(CAST_array(c1), + const_CAST_array(c2)); + *result_type = ARRAY_CONTAINER_TYPE; + return c1; + + case CONTAINER_PAIR(RUN,RUN): + *result_type = run_run_container_iandnot( + CAST_run(c1), const_CAST_run(c2), &result); + return result; + + case CONTAINER_PAIR(BITSET,ARRAY): + *result_type = bitset_array_container_iandnot( + CAST_bitset(c1), + const_CAST_array(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,BITSET): + *result_type = ARRAY_CONTAINER_TYPE; + array_bitset_container_iandnot(CAST_array(c1), + const_CAST_bitset(c2)); + return c1; + + case CONTAINER_PAIR(BITSET,RUN): + *result_type = bitset_run_container_iandnot( + CAST_bitset(c1), + const_CAST_run(c2), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(RUN,BITSET): + *result_type = run_bitset_container_iandnot( + CAST_run(c1), + const_CAST_bitset(c2), &result) + ? 
BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + + case CONTAINER_PAIR(ARRAY,RUN): + *result_type = ARRAY_CONTAINER_TYPE; + array_run_container_iandnot(CAST_array(c1), + const_CAST_run(c2)); + return c1; + + case CONTAINER_PAIR(RUN,ARRAY): + *result_type = run_array_container_iandnot( + CAST_run(c1), const_CAST_array(c2), &result); + return result; + + default: + assert(false); + roaring_unreachable; + return NULL; + } +} + +/** + * Visit all values x of the container once, passing (base+x,ptr) + * to iterator. You need to specify a container and its type. + * Returns true if the iteration should continue. + */ +static inline bool container_iterate( + const container_t *c, uint8_t type, + uint32_t base, + roaring_iterator iterator, void *ptr +){ + c = container_unwrap_shared(c, &type); + switch (type) { + case BITSET_CONTAINER_TYPE: + return bitset_container_iterate(const_CAST_bitset(c), + base, iterator, ptr); + case ARRAY_CONTAINER_TYPE: + return array_container_iterate(const_CAST_array(c), + base, iterator, ptr); + case RUN_CONTAINER_TYPE: + return run_container_iterate(const_CAST_run(c), + base, iterator, ptr); + default: + assert(false); + roaring_unreachable; + } + assert(false); + roaring_unreachable; + return false; +} + +static inline bool container_iterate64( + const container_t *c, uint8_t type, + uint32_t base, + roaring_iterator64 iterator, + uint64_t high_bits, void *ptr +){ + c = container_unwrap_shared(c, &type); + switch (type) { + case BITSET_CONTAINER_TYPE: + return bitset_container_iterate64(const_CAST_bitset(c), base, + iterator, high_bits, ptr); + case ARRAY_CONTAINER_TYPE: + return array_container_iterate64(const_CAST_array(c), base, + iterator, high_bits, ptr); + case RUN_CONTAINER_TYPE: + return run_container_iterate64(const_CAST_run(c), base, + iterator, high_bits, ptr); + default: + assert(false); + roaring_unreachable; + } + assert(false); + roaring_unreachable; + return false; +} + +static inline container_t 
*container_not( + const container_t *c, uint8_t type, + uint8_t *result_type +){ + c = container_unwrap_shared(c, &type); + container_t *result = NULL; + switch (type) { + case BITSET_CONTAINER_TYPE: + *result_type = bitset_container_negation( + const_CAST_bitset(c), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + case ARRAY_CONTAINER_TYPE: + result = bitset_container_create(); + *result_type = BITSET_CONTAINER_TYPE; + array_container_negation(const_CAST_array(c), + CAST_bitset(result)); + return result; + case RUN_CONTAINER_TYPE: + *result_type = + run_container_negation(const_CAST_run(c), &result); + return result; + + default: + assert(false); + roaring_unreachable; + } + assert(false); + roaring_unreachable; + return NULL; +} + +static inline container_t *container_not_range( + const container_t *c, uint8_t type, + uint32_t range_start, uint32_t range_end, + uint8_t *result_type +){ + c = container_unwrap_shared(c, &type); + container_t *result = NULL; + switch (type) { + case BITSET_CONTAINER_TYPE: + *result_type = + bitset_container_negation_range( + const_CAST_bitset(c), range_start, range_end, &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + case ARRAY_CONTAINER_TYPE: + *result_type = + array_container_negation_range( + const_CAST_array(c), range_start, range_end, &result) + ? 
BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + case RUN_CONTAINER_TYPE: + *result_type = run_container_negation_range( + const_CAST_run(c), range_start, range_end, &result); + return result; + + default: + assert(false); + roaring_unreachable; + } + assert(false); + roaring_unreachable; + return NULL; +} + +static inline container_t *container_inot( + container_t *c, uint8_t type, + uint8_t *result_type +){ + c = get_writable_copy_if_shared(c, &type); + container_t *result = NULL; + switch (type) { + case BITSET_CONTAINER_TYPE: + *result_type = bitset_container_negation_inplace( + CAST_bitset(c), &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + case ARRAY_CONTAINER_TYPE: + // will never be inplace + result = bitset_container_create(); + *result_type = BITSET_CONTAINER_TYPE; + array_container_negation(CAST_array(c), + CAST_bitset(result)); + array_container_free(CAST_array(c)); + return result; + case RUN_CONTAINER_TYPE: + *result_type = + run_container_negation_inplace(CAST_run(c), &result); + return result; + + default: + assert(false); + roaring_unreachable; + } + assert(false); + roaring_unreachable; + return NULL; +} + +static inline container_t *container_inot_range( + container_t *c, uint8_t type, + uint32_t range_start, uint32_t range_end, + uint8_t *result_type +){ + c = get_writable_copy_if_shared(c, &type); + container_t *result = NULL; + switch (type) { + case BITSET_CONTAINER_TYPE: + *result_type = + bitset_container_negation_range_inplace( + CAST_bitset(c), range_start, range_end, &result) + ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + case ARRAY_CONTAINER_TYPE: + *result_type = + array_container_negation_range_inplace( + CAST_array(c), range_start, range_end, &result) + ? 
BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + return result; + case RUN_CONTAINER_TYPE: + *result_type = run_container_negation_range_inplace( + CAST_run(c), range_start, range_end, &result); + return result; + + default: + assert(false); + roaring_unreachable; + } + assert(false); + roaring_unreachable; + return NULL; +} + +/** + * If the element of given rank is in this container, supposing that + * the first + * element has rank start_rank, then the function returns true and + * sets element + * accordingly. + * Otherwise, it returns false and update start_rank. + */ +static inline bool container_select( + const container_t *c, uint8_t type, + uint32_t *start_rank, uint32_t rank, + uint32_t *element +){ + c = container_unwrap_shared(c, &type); + switch (type) { + case BITSET_CONTAINER_TYPE: + return bitset_container_select(const_CAST_bitset(c), + start_rank, rank, element); + case ARRAY_CONTAINER_TYPE: + return array_container_select(const_CAST_array(c), + start_rank, rank, element); + case RUN_CONTAINER_TYPE: + return run_container_select(const_CAST_run(c), + start_rank, rank, element); + default: + assert(false); + roaring_unreachable; + } + assert(false); + roaring_unreachable; + return false; +} + +static inline uint16_t container_maximum( + const container_t *c, uint8_t type +){ + c = container_unwrap_shared(c, &type); + switch (type) { + case BITSET_CONTAINER_TYPE: + return bitset_container_maximum(const_CAST_bitset(c)); + case ARRAY_CONTAINER_TYPE: + return array_container_maximum(const_CAST_array(c)); + case RUN_CONTAINER_TYPE: + return run_container_maximum(const_CAST_run(c)); + default: + assert(false); + roaring_unreachable; + } + assert(false); + roaring_unreachable; + return false; +} + +static inline uint16_t container_minimum( + const container_t *c, uint8_t type +){ + c = container_unwrap_shared(c, &type); + switch (type) { + case BITSET_CONTAINER_TYPE: + return bitset_container_minimum(const_CAST_bitset(c)); + case ARRAY_CONTAINER_TYPE: + 
return array_container_minimum(const_CAST_array(c)); + case RUN_CONTAINER_TYPE: + return run_container_minimum(const_CAST_run(c)); + default: + assert(false); + roaring_unreachable; + } + assert(false); + roaring_unreachable; + return false; +} + +// number of values smaller or equal to x +static inline int container_rank( + const container_t *c, uint8_t type, + uint16_t x +){ + c = container_unwrap_shared(c, &type); + switch (type) { + case BITSET_CONTAINER_TYPE: + return bitset_container_rank(const_CAST_bitset(c), x); + case ARRAY_CONTAINER_TYPE: + return array_container_rank(const_CAST_array(c), x); + case RUN_CONTAINER_TYPE: + return run_container_rank(const_CAST_run(c), x); + default: + assert(false); + roaring_unreachable; + } + assert(false); + roaring_unreachable; + return false; +} + +// return the index of x, if not exsist return -1 +static inline int container_get_index(const container_t *c, uint8_t type, + uint16_t x) { + c = container_unwrap_shared(c, &type); + switch (type) { + case BITSET_CONTAINER_TYPE: + return bitset_container_get_index(const_CAST_bitset(c), x); + case ARRAY_CONTAINER_TYPE: + return array_container_get_index(const_CAST_array(c), x); + case RUN_CONTAINER_TYPE: + return run_container_get_index(const_CAST_run(c), x); + default: + assert(false); + roaring_unreachable; + } + assert(false); + roaring_unreachable; + return false; +} + +/** + * Add all values in range [min, max] to a given container. + * + * If the returned pointer is different from $container, then a new container + * has been created and the caller is responsible for freeing it. + * The type of the first container may change. Returns the modified + * (and possibly new) container. 
+ */ +static inline container_t *container_add_range( + container_t *c, uint8_t type, + uint32_t min, uint32_t max, + uint8_t *result_type +){ + // NB: when selecting new container type, we perform only inexpensive checks + switch (type) { + case BITSET_CONTAINER_TYPE: { + bitset_container_t *bitset = CAST_bitset(c); + + int32_t union_cardinality = 0; + union_cardinality += bitset->cardinality; + union_cardinality += max - min + 1; + union_cardinality -= bitset_lenrange_cardinality(bitset->words, + min, max-min); + + if (union_cardinality == INT32_C(0x10000)) { + *result_type = RUN_CONTAINER_TYPE; + return run_container_create_range(0, INT32_C(0x10000)); + } else { + *result_type = BITSET_CONTAINER_TYPE; + bitset_set_lenrange(bitset->words, min, max - min); + bitset->cardinality = union_cardinality; + return bitset; + } + } + case ARRAY_CONTAINER_TYPE: { + array_container_t *array = CAST_array(c); + + int32_t nvals_greater = count_greater(array->array, array->cardinality, max); + int32_t nvals_less = count_less(array->array, array->cardinality - nvals_greater, min); + int32_t union_cardinality = nvals_less + (max - min + 1) + nvals_greater; + + if (union_cardinality == INT32_C(0x10000)) { + *result_type = RUN_CONTAINER_TYPE; + return run_container_create_range(0, INT32_C(0x10000)); + } else if (union_cardinality <= DEFAULT_MAX_SIZE) { + *result_type = ARRAY_CONTAINER_TYPE; + array_container_add_range_nvals(array, min, max, nvals_less, nvals_greater); + return array; + } else { + *result_type = BITSET_CONTAINER_TYPE; + bitset_container_t *bitset = bitset_container_from_array(array); + bitset_set_lenrange(bitset->words, min, max - min); + bitset->cardinality = union_cardinality; + return bitset; + } + } + case RUN_CONTAINER_TYPE: { + run_container_t *run = CAST_run(c); + + int32_t nruns_greater = rle16_count_greater(run->runs, run->n_runs, max); + int32_t nruns_less = rle16_count_less(run->runs, run->n_runs - nruns_greater, min); + + int32_t run_size_bytes = 
(nruns_less + 1 + nruns_greater) * sizeof(rle16_t); + int32_t bitset_size_bytes = BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t); + + if (run_size_bytes <= bitset_size_bytes) { + run_container_add_range_nruns(run, min, max, nruns_less, nruns_greater); + *result_type = RUN_CONTAINER_TYPE; + return run; + } else { + return container_from_run_range(run, min, max, result_type); + } + } + default: + roaring_unreachable; + } +} + +/* + * Removes all elements in range [min, max]. + * Returns one of: + * - NULL if no elements left + * - pointer to the original container + * - pointer to a newly-allocated container (if it is more efficient) + * + * If the returned pointer is different from $container, then a new container + * has been created and the caller is responsible for freeing the original container. + */ +static inline container_t *container_remove_range( + container_t *c, uint8_t type, + uint32_t min, uint32_t max, + uint8_t *result_type +){ + switch (type) { + case BITSET_CONTAINER_TYPE: { + bitset_container_t *bitset = CAST_bitset(c); + + int32_t result_cardinality = bitset->cardinality - + bitset_lenrange_cardinality(bitset->words, min, max-min); + + if (result_cardinality == 0) { + return NULL; + } else if (result_cardinality <= DEFAULT_MAX_SIZE) { + *result_type = ARRAY_CONTAINER_TYPE; + bitset_reset_range(bitset->words, min, max+1); + bitset->cardinality = result_cardinality; + return array_container_from_bitset(bitset); + } else { + *result_type = BITSET_CONTAINER_TYPE; + bitset_reset_range(bitset->words, min, max+1); + bitset->cardinality = result_cardinality; + return bitset; + } + } + case ARRAY_CONTAINER_TYPE: { + array_container_t *array = CAST_array(c); + + int32_t nvals_greater = count_greater(array->array, array->cardinality, max); + int32_t nvals_less = count_less(array->array, array->cardinality - nvals_greater, min); + int32_t result_cardinality = nvals_less + nvals_greater; + + if (result_cardinality == 0) { + return NULL; + } else { + 
*result_type = ARRAY_CONTAINER_TYPE; + array_container_remove_range(array, nvals_less, + array->cardinality - result_cardinality); + return array; + } + } + case RUN_CONTAINER_TYPE: { + run_container_t *run = CAST_run(c); + + if (run->n_runs == 0) { + return NULL; + } + if (min <= run_container_minimum(run) && max >= run_container_maximum(run)) { + return NULL; + } + + run_container_remove_range(run, min, max); + return convert_run_to_efficient_container(run, result_type); + } + default: + roaring_unreachable; + } +} + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif + +#endif +/* end file include/roaring/containers/containers.h */ +/* begin file include/roaring/roaring_array.h */ +#ifndef INCLUDE_ROARING_ARRAY_H +#define INCLUDE_ROARING_ARRAY_H + +#include +#include +#include + + +#ifdef __cplusplus +extern "C" { namespace roaring { + +// Note: in pure C++ code, you should avoid putting `using` in header files +using api::roaring_array_t; + +namespace internal { +#endif + +enum { + SERIAL_COOKIE_NO_RUNCONTAINER = 12346, + SERIAL_COOKIE = 12347, + FROZEN_COOKIE = 13766, + NO_OFFSET_THRESHOLD = 4 +}; + +/** + * Create a new roaring array + */ +roaring_array_t *ra_create(void); + +/** + * Initialize an existing roaring array with the specified capacity (in number + * of containers) + */ +bool ra_init_with_capacity(roaring_array_t *new_ra, uint32_t cap); + +/** + * Initialize with zero capacity + */ +void ra_init(roaring_array_t *t); + +/** + * Copies this roaring array, we assume that dest is not initialized + */ +bool ra_copy(const roaring_array_t *source, roaring_array_t *dest, + bool copy_on_write); + +/* + * Shrinks the capacity, returns the number of bytes saved. 
+ */ +int ra_shrink_to_fit(roaring_array_t *ra); + +/** + * Copies this roaring array, we assume that dest is initialized + */ +bool ra_overwrite(const roaring_array_t *source, roaring_array_t *dest, + bool copy_on_write); + +/** + * Frees the memory used by a roaring array + */ +void ra_clear(roaring_array_t *r); + +/** + * Frees the memory used by a roaring array, but does not free the containers + */ +void ra_clear_without_containers(roaring_array_t *r); + +/** + * Frees just the containers + */ +void ra_clear_containers(roaring_array_t *ra); + +/** + * Get the index corresponding to a 16-bit key + */ +inline int32_t ra_get_index(const roaring_array_t *ra, uint16_t x) { + if ((ra->size == 0) || ra->keys[ra->size - 1] == x) return ra->size - 1; + return binarySearch(ra->keys, (int32_t)ra->size, x); +} + +/** + * Retrieves the container at index i, filling in the typecode + */ +inline container_t *ra_get_container_at_index( + const roaring_array_t *ra, uint16_t i, uint8_t *typecode +){ + *typecode = ra->typecodes[i]; + return ra->containers[i]; +} + +/** + * Retrieves the key at index i + */ +inline uint16_t ra_get_key_at_index(const roaring_array_t *ra, uint16_t i) { + return ra->keys[i]; +} + +/** + * Add a new key-value pair at index i + */ +void ra_insert_new_key_value_at( + roaring_array_t *ra, int32_t i, uint16_t key, + container_t *c, uint8_t typecode); + +/** + * Append a new key-value pair + */ +void ra_append( + roaring_array_t *ra, uint16_t key, + container_t *c, uint8_t typecode); + +/** + * Append a new key-value pair to ra, cloning (in COW sense) a value from sa + * at index index + */ +void ra_append_copy(roaring_array_t *ra, const roaring_array_t *sa, + uint16_t index, bool copy_on_write); + +/** + * Append new key-value pairs to ra, cloning (in COW sense) values from sa + * at indexes + * [start_index, end_index) + */ +void ra_append_copy_range(roaring_array_t *ra, const roaring_array_t *sa, + int32_t start_index, int32_t end_index, + bool 
copy_on_write); + +/** appends from sa to ra, ending with the greatest key that is + * is less or equal stopping_key + */ +void ra_append_copies_until(roaring_array_t *ra, const roaring_array_t *sa, + uint16_t stopping_key, bool copy_on_write); + +/** appends from sa to ra, starting with the smallest key that is + * is strictly greater than before_start + */ + +void ra_append_copies_after(roaring_array_t *ra, const roaring_array_t *sa, + uint16_t before_start, bool copy_on_write); + +/** + * Move the key-value pairs to ra from sa at indexes + * [start_index, end_index), old array should not be freed + * (use ra_clear_without_containers) + **/ +void ra_append_move_range(roaring_array_t *ra, roaring_array_t *sa, + int32_t start_index, int32_t end_index); +/** + * Append new key-value pairs to ra, from sa at indexes + * [start_index, end_index) + */ +void ra_append_range(roaring_array_t *ra, roaring_array_t *sa, + int32_t start_index, int32_t end_index, + bool copy_on_write); + +/** + * Set the container at the corresponding index using the specified + * typecode. 
+ */ +inline void ra_set_container_at_index( + const roaring_array_t *ra, int32_t i, + container_t *c, uint8_t typecode +){ + assert(i < ra->size); + ra->containers[i] = c; + ra->typecodes[i] = typecode; +} + +container_t *ra_get_container(roaring_array_t *ra, uint16_t x, uint8_t *typecode); + +/** + * If needed, increase the capacity of the array so that it can fit k values + * (at + * least); + */ +bool extend_array(roaring_array_t *ra, int32_t k); + +inline int32_t ra_get_size(const roaring_array_t *ra) { return ra->size; } + +static inline int32_t ra_advance_until(const roaring_array_t *ra, uint16_t x, + int32_t pos) { + return advanceUntil(ra->keys, pos, ra->size, x); +} + +int32_t ra_advance_until_freeing(roaring_array_t *ra, uint16_t x, int32_t pos); + +void ra_downsize(roaring_array_t *ra, int32_t new_length); + +inline void ra_replace_key_and_container_at_index( + roaring_array_t *ra, int32_t i, uint16_t key, + container_t *c, uint8_t typecode +){ + assert(i < ra->size); + + ra->keys[i] = key; + ra->containers[i] = c; + ra->typecodes[i] = typecode; +} + +// write set bits to an array +void ra_to_uint32_array(const roaring_array_t *ra, uint32_t *ans); + +bool ra_range_uint32_array(const roaring_array_t *ra, size_t offset, size_t limit, uint32_t *ans); + +/** + * write a bitmap to a buffer. This is meant to be compatible with + * the + * Java and Go versions. Return the size in bytes of the serialized + * output (which should be ra_portable_size_in_bytes(ra)). + */ +size_t ra_portable_serialize(const roaring_array_t *ra, char *buf); + +/** + * read a bitmap from a serialized version. This is meant to be compatible + * with the Java and Go versions. + * maxbytes indicates how many bytes available from buf. + * When the function returns true, roaring_array_t is populated with the data + * and *readbytes indicates how many bytes were read. In all cases, if the function + * returns true, then maxbytes >= *readbytes. 
+ */ +bool ra_portable_deserialize(roaring_array_t *ra, const char *buf, const size_t maxbytes, size_t * readbytes); + +/** + * Quickly checks whether there is a serialized bitmap at the pointer, + * not exceeding size "maxbytes" in bytes. This function does not allocate + * memory dynamically. + * + * This function returns 0 if and only if no valid bitmap is found. + * Otherwise, it returns how many bytes are occupied by the bitmap data. + */ +size_t ra_portable_deserialize_size(const char *buf, const size_t maxbytes); + +/** + * How many bytes are required to serialize this bitmap (meant to be + * compatible + * with Java and Go versions) + */ +size_t ra_portable_size_in_bytes(const roaring_array_t *ra); + +/** + * return true if it contains at least one run container. + */ +bool ra_has_run_container(const roaring_array_t *ra); + +/** + * Size of the header when serializing (meant to be compatible + * with Java and Go versions) + */ +uint32_t ra_portable_header_size(const roaring_array_t *ra); + +/** + * If the container at the index i is share, unshare it (creating a local + * copy if needed). + */ +static inline void ra_unshare_container_at_index(roaring_array_t *ra, + uint16_t i) { + assert(i < ra->size); + ra->containers[i] = get_writable_copy_if_shared(ra->containers[i], + &ra->typecodes[i]); +} + +/** + * remove at index i, sliding over all entries after i + */ +void ra_remove_at_index(roaring_array_t *ra, int32_t i); + + +/** +* clears all containers, sets the size at 0 and shrinks the memory usage. +*/ +void ra_reset(roaring_array_t *ra); + +/** + * remove at index i, sliding over all entries after i. Free removed container. 
+ */ +void ra_remove_at_index_and_free(roaring_array_t *ra, int32_t i); + +/** + * remove a chunk of indices, sliding over entries after it + */ +// void ra_remove_index_range(roaring_array_t *ra, int32_t begin, int32_t end); + +// used in inplace andNot only, to slide left the containers from +// the mutated RoaringBitmap that are after the largest container of +// the argument RoaringBitmap. It is followed by a call to resize. +// +void ra_copy_range(roaring_array_t *ra, uint32_t begin, uint32_t end, + uint32_t new_begin); + +/** + * Shifts rightmost $count containers to the left (distance < 0) or + * to the right (distance > 0). + * Allocates memory if necessary. + * This function doesn't free or create new containers. + * Caller is responsible for that. + */ +void ra_shift_tail(roaring_array_t *ra, int32_t count, int32_t distance); + +#ifdef __cplusplus +} // namespace internal +} } // extern "C" { namespace roaring { +#endif + +#endif +/* end file include/roaring/roaring_array.h */ +/* begin file src/array_util.c */ +#include +#include +#include +#include +#include +#include + + +#if CROARING_IS_X64 +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." 
+#endif // CROARING_COMPILER_SUPPORTS_AVX512 +#endif + +#ifdef __cplusplus +using namespace ::roaring::internal; +extern "C" { namespace roaring { namespace internal { +#endif + +extern inline int32_t binarySearch(const uint16_t *array, int32_t lenarray, + uint16_t ikey); + +#if CROARING_IS_X64 +// used by intersect_vector16 +ALIGNED(0x1000) +static const uint8_t shuffle_mask16[] = { + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 4, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 6, 7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 6, 7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 4, 5, 6, 7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 6, 7, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, + 6, 7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 4, 5, 6, 7, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 8, 9, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 
0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 8, 9, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 8, 9, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 4, 5, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 8, 9, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 4, 5, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 6, 7, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7, 8, 9, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, + 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 6, 7, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7, 8, 9, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, + 6, 7, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 4, 5, 6, 7, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 10, 11, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 4, 5, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 10, 11, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, + 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 4, 5, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7, + 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 6, 7, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 
0xFF, 0xFF, 0, 1, 2, 3, 6, 7, 10, 11, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7, + 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 4, 5, 6, 7, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 6, 7, 10, 11, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 4, 5, 6, 7, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 8, 9, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 8, 9, 10, 11, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 8, 9, + 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 8, 9, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 8, 9, 10, 11, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, + 8, 9, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 4, 5, 8, 9, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 8, 9, + 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 8, 9, + 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 6, 7, 8, 9, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, 8, 9, 10, 11, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 6, 7, 8, 9, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 4, 5, 6, 7, 8, 9, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 6, 7, 8, 9, + 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 0xFF, 0xFF, 0xFF, 0xFF, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 12, 13, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 12, 13, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 12, 13, + 0xFF, 
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 4, 5, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 12, 13, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 4, 5, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 6, 7, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7, 12, 13, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, + 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 6, 7, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7, 12, 13, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, + 6, 7, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 4, 5, 6, 7, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7, + 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 8, 9, 12, 13, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 8, 9, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 8, 9, 12, 13, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 8, 9, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 4, 5, 8, 9, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 8, 9, 12, 13, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, + 8, 9, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 8, 9, 12, 13, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7, + 8, 9, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 6, 7, 8, 9, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 6, 7, 8, 9, + 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7, + 8, 9, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 
0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 6, 7, 8, 9, + 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, + 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 10, 11, 12, 13, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 10, 11, + 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 10, 11, 12, 13, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, + 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 4, 5, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 10, 11, + 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 10, 11, + 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 6, 7, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, 10, 11, 12, 13, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 6, 7, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 4, 5, 6, 7, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 6, 7, 10, 11, + 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, + 6, 7, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, + 0xFF, 0xFF, 0xFF, 0xFF, 8, 9, 10, 11, 12, 13, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 8, 9, + 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 8, 9, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 8, 9, 10, 11, + 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 8, 9, + 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 8, 9, 10, 11, + 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 4, 5, 8, 9, 10, 
11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, + 6, 7, 8, 9, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7, 8, 9, 10, 11, + 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, + 8, 9, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, + 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 0xFF, 0xFF, 14, 15, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 4, 5, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 14, 15, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 4, 5, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 6, 7, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 6, 7, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 4, 5, 6, 7, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 6, 7, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 4, 5, 6, 7, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 8, 9, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 
0xFF, 0, 1, 8, 9, 14, 15, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 8, 9, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 8, 9, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 8, 9, 14, 15, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, + 8, 9, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 4, 5, 8, 9, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 8, 9, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 8, 9, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 6, 7, 8, 9, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, 8, 9, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 6, 7, 8, 9, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 4, 5, 6, 7, 8, 9, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 6, 7, 8, 9, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, + 6, 7, 8, 9, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 10, 11, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 10, 11, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 10, 11, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 4, 5, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 10, 11, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 4, 5, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 6, 7, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7, 10, 11, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, + 10, 11, 14, 15, 0xFF, 
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7, 10, 11, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, + 6, 7, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 4, 5, 6, 7, 10, 11, 14, 15, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7, + 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 8, 9, 10, 11, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 8, 9, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 8, 9, 10, 11, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 8, 9, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 4, 5, 8, 9, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 8, 9, 10, 11, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, + 8, 9, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 8, 9, 10, 11, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7, + 8, 9, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 6, 7, 8, 9, 10, 11, 14, 15, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 6, 7, 8, 9, + 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7, + 8, 9, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 0xFF, 0xFF, + 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 12, 13, 14, 15, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 12, 13, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 12, 13, 14, 15, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 
4, 5, + 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 4, 5, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 12, 13, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 12, 13, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 6, 7, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, 12, 13, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 6, 7, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 4, 5, 6, 7, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 6, 7, 12, 13, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, + 6, 7, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 8, 9, 12, 13, 14, 15, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 8, 9, + 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 8, 9, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 8, 9, 12, 13, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 8, 9, + 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 4, 5, 8, 9, 12, 13, 14, 15, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 8, 9, 12, 13, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 4, 5, 8, 9, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 6, 7, 8, 9, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7, 8, 9, 12, 13, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, + 8, 9, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7, 8, 9, 12, 13, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, + 6, 7, 8, 9, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 12, 13, 14, 15, 0xFF, 0xFF, 10, 11, 
12, 13, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 10, 11, 12, 13, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 4, 5, 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 10, 11, 12, 13, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, + 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 10, 11, 12, 13, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7, + 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 6, 7, 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 6, 7, 10, 11, + 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7, + 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 6, 7, 10, 11, + 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, + 8, 9, 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 8, 9, 10, 11, 12, 13, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 8, 9, + 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 8, 9, 10, 11, 12, 13, + 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, + 8, 9, 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 8, 9, + 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, + 6, 7, 8, 9, 
10, 11, 12, 13, 14, 15, 0xFF, 0xFF, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15}; + +/** + * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions + * Optimized by D. Lemire on May 3rd 2013 + */ +CROARING_TARGET_AVX2 +int32_t intersect_vector16(const uint16_t *__restrict__ A, size_t s_a, + const uint16_t *__restrict__ B, size_t s_b, + uint16_t *C) { + size_t count = 0; + size_t i_a = 0, i_b = 0; + const int vectorlength = sizeof(__m128i) / sizeof(uint16_t); + const size_t st_a = (s_a / vectorlength) * vectorlength; + const size_t st_b = (s_b / vectorlength) * vectorlength; + __m128i v_a, v_b; + if ((i_a < st_a) && (i_b < st_b)) { + v_a = _mm_lddqu_si128((__m128i *)&A[i_a]); + v_b = _mm_lddqu_si128((__m128i *)&B[i_b]); + while ((A[i_a] == 0) || (B[i_b] == 0)) { + const __m128i res_v = _mm_cmpestrm( + v_b, vectorlength, v_a, vectorlength, + _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); + const int r = _mm_extract_epi32(res_v, 0); + __m128i sm16 = _mm_loadu_si128((const __m128i *)shuffle_mask16 + r); + __m128i p = _mm_shuffle_epi8(v_a, sm16); + _mm_storeu_si128((__m128i *)&C[count], p); // can overflow + count += _mm_popcnt_u32(r); + const uint16_t a_max = A[i_a + vectorlength - 1]; + const uint16_t b_max = B[i_b + vectorlength - 1]; + if (a_max <= b_max) { + i_a += vectorlength; + if (i_a == st_a) break; + v_a = _mm_lddqu_si128((__m128i *)&A[i_a]); + } + if (b_max <= a_max) { + i_b += vectorlength; + if (i_b == st_b) break; + v_b = _mm_lddqu_si128((__m128i *)&B[i_b]); + } + } + if ((i_a < st_a) && (i_b < st_b)) + while (true) { + const __m128i res_v = _mm_cmpistrm( + v_b, v_a, + _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); + const int r = _mm_extract_epi32(res_v, 0); + __m128i sm16 = + _mm_loadu_si128((const __m128i 
*)shuffle_mask16 + r); + __m128i p = _mm_shuffle_epi8(v_a, sm16); + _mm_storeu_si128((__m128i *)&C[count], p); // can overflow + count += _mm_popcnt_u32(r); + const uint16_t a_max = A[i_a + vectorlength - 1]; + const uint16_t b_max = B[i_b + vectorlength - 1]; + if (a_max <= b_max) { + i_a += vectorlength; + if (i_a == st_a) break; + v_a = _mm_lddqu_si128((__m128i *)&A[i_a]); + } + if (b_max <= a_max) { + i_b += vectorlength; + if (i_b == st_b) break; + v_b = _mm_lddqu_si128((__m128i *)&B[i_b]); + } + } + } + // intersect the tail using scalar intersection + while (i_a < s_a && i_b < s_b) { + uint16_t a = A[i_a]; + uint16_t b = B[i_b]; + if (a < b) { + i_a++; + } else if (b < a) { + i_b++; + } else { + C[count] = a; //==b; + count++; + i_a++; + i_b++; + } + } + return (int32_t)count; +} + +ALLOW_UNALIGNED +int array_container_to_uint32_array_vector16(void *vout, const uint16_t* array, size_t cardinality, + uint32_t base) { + int outpos = 0; + uint32_t *out = (uint32_t *)vout; + size_t i = 0; + for ( ;i + sizeof(__m128i)/sizeof(uint16_t) <= cardinality; i += sizeof(__m128i)/sizeof(uint16_t)) { + __m128i vinput = _mm_loadu_si128((const __m128i*) (array + i)); + __m256i voutput = _mm256_add_epi32(_mm256_cvtepu16_epi32(vinput), _mm256_set1_epi32(base)); + _mm256_storeu_si256((__m256i*)(out + outpos), voutput); + outpos += sizeof(__m256i)/sizeof(uint32_t); + } + for ( ; i < cardinality; ++i) { + const uint32_t val = base + array[i]; + memcpy(out + outpos, &val, + sizeof(uint32_t)); // should be compiled as a MOV on x64 + outpos++; + } + return outpos; +} + +int32_t intersect_vector16_inplace(uint16_t *__restrict__ A, size_t s_a, + const uint16_t *__restrict__ B, size_t s_b) { + size_t count = 0; + size_t i_a = 0, i_b = 0; + const int vectorlength = sizeof(__m128i) / sizeof(uint16_t); + const size_t st_a = (s_a / vectorlength) * vectorlength; + const size_t st_b = (s_b / vectorlength) * vectorlength; + __m128i v_a, v_b; + if ((i_a < st_a) && (i_b < st_b)) { + v_a = 
_mm_lddqu_si128((__m128i *)&A[i_a]); + v_b = _mm_lddqu_si128((__m128i *)&B[i_b]); + __m128i tmp[2] = {_mm_setzero_si128()}; + size_t tmp_count = 0; + while ((A[i_a] == 0) || (B[i_b] == 0)) { + const __m128i res_v = _mm_cmpestrm( + v_b, vectorlength, v_a, vectorlength, + _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); + const int r = _mm_extract_epi32(res_v, 0); + __m128i sm16 = _mm_loadu_si128((const __m128i *)shuffle_mask16 + r); + __m128i p = _mm_shuffle_epi8(v_a, sm16); + _mm_storeu_si128((__m128i*)&((uint16_t*)tmp)[tmp_count], p); + tmp_count += _mm_popcnt_u32(r); + const uint16_t a_max = A[i_a + vectorlength - 1]; + const uint16_t b_max = B[i_b + vectorlength - 1]; + if (a_max <= b_max) { + _mm_storeu_si128((__m128i *)&A[count], tmp[0]); + _mm_storeu_si128(tmp, _mm_setzero_si128()); + count += tmp_count; + tmp_count = 0; + i_a += vectorlength; + if (i_a == st_a) break; + v_a = _mm_lddqu_si128((__m128i *)&A[i_a]); + } + if (b_max <= a_max) { + i_b += vectorlength; + if (i_b == st_b) break; + v_b = _mm_lddqu_si128((__m128i *)&B[i_b]); + } + } + if ((i_a < st_a) && (i_b < st_b)) { + while (true) { + const __m128i res_v = _mm_cmpistrm( + v_b, v_a, + _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); + const int r = _mm_extract_epi32(res_v, 0); + __m128i sm16 = _mm_loadu_si128((const __m128i *)shuffle_mask16 + r); + __m128i p = _mm_shuffle_epi8(v_a, sm16); + _mm_storeu_si128((__m128i*)&((uint16_t*)tmp)[tmp_count], p); + tmp_count += _mm_popcnt_u32(r); + const uint16_t a_max = A[i_a + vectorlength - 1]; + const uint16_t b_max = B[i_b + vectorlength - 1]; + if (a_max <= b_max) { + _mm_storeu_si128((__m128i *)&A[count], tmp[0]); + _mm_storeu_si128(tmp, _mm_setzero_si128()); + count += tmp_count; + tmp_count = 0; + i_a += vectorlength; + if (i_a == st_a) break; + v_a = _mm_lddqu_si128((__m128i *)&A[i_a]); + } + if (b_max <= a_max) { + i_b += vectorlength; + if (i_b == st_b) break; + v_b = _mm_lddqu_si128((__m128i *)&B[i_b]); + } + } + } + // tmp_count <= 
8, so this does not affect efficiency so much + for (size_t i = 0; i < tmp_count; i++) { + A[count] = ((uint16_t*)tmp)[i]; + count++; + } + i_a += tmp_count; // We can at least jump pass $tmp_count elements in A + } + // intersect the tail using scalar intersection + while (i_a < s_a && i_b < s_b) { + uint16_t a = A[i_a]; + uint16_t b = B[i_b]; + if (a < b) { + i_a++; + } else if (b < a) { + i_b++; + } else { + A[count] = a; //==b; + count++; + i_a++; + i_b++; + } + } + return (int32_t)count; +} +CROARING_UNTARGET_AVX2 + +CROARING_TARGET_AVX2 +int32_t intersect_vector16_cardinality(const uint16_t *__restrict__ A, + size_t s_a, + const uint16_t *__restrict__ B, + size_t s_b) { + size_t count = 0; + size_t i_a = 0, i_b = 0; + const int vectorlength = sizeof(__m128i) / sizeof(uint16_t); + const size_t st_a = (s_a / vectorlength) * vectorlength; + const size_t st_b = (s_b / vectorlength) * vectorlength; + __m128i v_a, v_b; + if ((i_a < st_a) && (i_b < st_b)) { + v_a = _mm_lddqu_si128((__m128i *)&A[i_a]); + v_b = _mm_lddqu_si128((__m128i *)&B[i_b]); + while ((A[i_a] == 0) || (B[i_b] == 0)) { + const __m128i res_v = _mm_cmpestrm( + v_b, vectorlength, v_a, vectorlength, + _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); + const int r = _mm_extract_epi32(res_v, 0); + count += _mm_popcnt_u32(r); + const uint16_t a_max = A[i_a + vectorlength - 1]; + const uint16_t b_max = B[i_b + vectorlength - 1]; + if (a_max <= b_max) { + i_a += vectorlength; + if (i_a == st_a) break; + v_a = _mm_lddqu_si128((__m128i *)&A[i_a]); + } + if (b_max <= a_max) { + i_b += vectorlength; + if (i_b == st_b) break; + v_b = _mm_lddqu_si128((__m128i *)&B[i_b]); + } + } + if ((i_a < st_a) && (i_b < st_b)) + while (true) { + const __m128i res_v = _mm_cmpistrm( + v_b, v_a, + _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); + const int r = _mm_extract_epi32(res_v, 0); + count += _mm_popcnt_u32(r); + const uint16_t a_max = A[i_a + vectorlength - 1]; + const uint16_t b_max = B[i_b + 
vectorlength - 1]; + if (a_max <= b_max) { + i_a += vectorlength; + if (i_a == st_a) break; + v_a = _mm_lddqu_si128((__m128i *)&A[i_a]); + } + if (b_max <= a_max) { + i_b += vectorlength; + if (i_b == st_b) break; + v_b = _mm_lddqu_si128((__m128i *)&B[i_b]); + } + } + } + // intersect the tail using scalar intersection + while (i_a < s_a && i_b < s_b) { + uint16_t a = A[i_a]; + uint16_t b = B[i_b]; + if (a < b) { + i_a++; + } else if (b < a) { + i_b++; + } else { + count++; + i_a++; + i_b++; + } + } + return (int32_t)count; +} +CROARING_UNTARGET_AVX2 + +CROARING_TARGET_AVX2 +///////// +// Warning: +// This function may not be safe if A == C or B == C. +///////// +int32_t difference_vector16(const uint16_t *__restrict__ A, size_t s_a, + const uint16_t *__restrict__ B, size_t s_b, + uint16_t *C) { + // we handle the degenerate case + if (s_a == 0) return 0; + if (s_b == 0) { + if (A != C) memcpy(C, A, sizeof(uint16_t) * s_a); + return (int32_t)s_a; + } + // handle the leading zeroes, it is messy but it allows us to use the fast + // _mm_cmpistrm instrinsic safely + int32_t count = 0; + if ((A[0] == 0) || (B[0] == 0)) { + if ((A[0] == 0) && (B[0] == 0)) { + A++; + s_a--; + B++; + s_b--; + } else if (A[0] == 0) { + C[count++] = 0; + A++; + s_a--; + } else { + B++; + s_b--; + } + } + // at this point, we have two non-empty arrays, made of non-zero + // increasing values. + size_t i_a = 0, i_b = 0; + const size_t vectorlength = sizeof(__m128i) / sizeof(uint16_t); + const size_t st_a = (s_a / vectorlength) * vectorlength; + const size_t st_b = (s_b / vectorlength) * vectorlength; + if ((i_a < st_a) && (i_b < st_b)) { // this is the vectorized code path + __m128i v_a, v_b; //, v_bmax; + // we load a vector from A and a vector from B + v_a = _mm_lddqu_si128((__m128i *)&A[i_a]); + v_b = _mm_lddqu_si128((__m128i *)&B[i_b]); + // we have a runningmask which indicates which values from A have been + // spotted in B, these don't get written out. 
+ __m128i runningmask_a_found_in_b = _mm_setzero_si128(); + /**** + * start of the main vectorized loop + *****/ + while (true) { + // afoundinb will contain a mask indicate for each entry in A + // whether it is seen + // in B + const __m128i a_found_in_b = + _mm_cmpistrm(v_b, v_a, _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | + _SIDD_BIT_MASK); + runningmask_a_found_in_b = + _mm_or_si128(runningmask_a_found_in_b, a_found_in_b); + // we always compare the last values of A and B + const uint16_t a_max = A[i_a + vectorlength - 1]; + const uint16_t b_max = B[i_b + vectorlength - 1]; + if (a_max <= b_max) { + // Ok. In this code path, we are ready to write our v_a + // because there is no need to read more from B, they will + // all be large values. + const int bitmask_belongs_to_difference = + _mm_extract_epi32(runningmask_a_found_in_b, 0) ^ 0xFF; + /*** next few lines are probably expensive *****/ + __m128i sm16 = _mm_loadu_si128((const __m128i *)shuffle_mask16 + + bitmask_belongs_to_difference); + __m128i p = _mm_shuffle_epi8(v_a, sm16); + _mm_storeu_si128((__m128i *)&C[count], p); // can overflow + count += _mm_popcnt_u32(bitmask_belongs_to_difference); + // we advance a + i_a += vectorlength; + if (i_a == st_a) // no more + break; + runningmask_a_found_in_b = _mm_setzero_si128(); + v_a = _mm_lddqu_si128((__m128i *)&A[i_a]); + } + if (b_max <= a_max) { + // in this code path, the current v_b has become useless + i_b += vectorlength; + if (i_b == st_b) break; + v_b = _mm_lddqu_si128((__m128i *)&B[i_b]); + } + } + // at this point, either we have i_a == st_a, which is the end of the + // vectorized processing, + // or we have i_b == st_b, and we are not done processing the vector... + // so we need to finish it off. + if (i_a < st_a) { // we have unfinished business... 
+ uint16_t buffer[8]; // buffer to do a masked load + memset(buffer, 0, 8 * sizeof(uint16_t)); + memcpy(buffer, B + i_b, (s_b - i_b) * sizeof(uint16_t)); + v_b = _mm_lddqu_si128((__m128i *)buffer); + const __m128i a_found_in_b = + _mm_cmpistrm(v_b, v_a, _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | + _SIDD_BIT_MASK); + runningmask_a_found_in_b = + _mm_or_si128(runningmask_a_found_in_b, a_found_in_b); + const int bitmask_belongs_to_difference = + _mm_extract_epi32(runningmask_a_found_in_b, 0) ^ 0xFF; + __m128i sm16 = _mm_loadu_si128((const __m128i *)shuffle_mask16 + + bitmask_belongs_to_difference); + __m128i p = _mm_shuffle_epi8(v_a, sm16); + _mm_storeu_si128((__m128i *)&C[count], p); // can overflow + count += _mm_popcnt_u32(bitmask_belongs_to_difference); + i_a += vectorlength; + } + // at this point we should have i_a == st_a and i_b == st_b + } + // do the tail using scalar code + while (i_a < s_a && i_b < s_b) { + uint16_t a = A[i_a]; + uint16_t b = B[i_b]; + if (b < a) { + i_b++; + } else if (a < b) { + C[count] = a; + count++; + i_a++; + } else { //== + i_a++; + i_b++; + } + } + if (i_a < s_a) { + if(C == A) { + assert((size_t)count <= i_a); + if((size_t)count < i_a) { + memmove(C + count, A + i_a, sizeof(uint16_t) * (s_a - i_a)); + } + } else { + for(size_t i = 0; i < (s_a - i_a); i++) { + C[count + i] = A[i + i_a]; + } + } + count += (int32_t)(s_a - i_a); + } + return count; +} +CROARING_UNTARGET_AVX2 +#endif // CROARING_IS_X64 + + + +/** +* Branchless binary search going after 4 values at once. +* Assumes that array is sorted. +* You have that array[*index1] >= target1, array[*index12] >= target2, ... +* except when *index1 = n, in which case you know that all values in array are +* smaller than target1, and so forth. +* It has logarithmic complexity. 
+*/ +static void binarySearch4(const uint16_t *array, int32_t n, uint16_t target1, + uint16_t target2, uint16_t target3, uint16_t target4, + int32_t *index1, int32_t *index2, int32_t *index3, + int32_t *index4) { + const uint16_t *base1 = array; + const uint16_t *base2 = array; + const uint16_t *base3 = array; + const uint16_t *base4 = array; + if (n == 0) + return; + while (n > 1) { + int32_t half = n >> 1; + base1 = (base1[half] < target1) ? &base1[half] : base1; + base2 = (base2[half] < target2) ? &base2[half] : base2; + base3 = (base3[half] < target3) ? &base3[half] : base3; + base4 = (base4[half] < target4) ? &base4[half] : base4; + n -= half; + } + *index1 = (int32_t)((*base1 < target1) + base1 - array); + *index2 = (int32_t)((*base2 < target2) + base2 - array); + *index3 = (int32_t)((*base3 < target3) + base3 - array); + *index4 = (int32_t)((*base4 < target4) + base4 - array); +} + +/** +* Branchless binary search going after 2 values at once. +* Assumes that array is sorted. +* You have that array[*index1] >= target1, array[*index12] >= target2. +* except when *index1 = n, in which case you know that all values in array are +* smaller than target1, and so forth. +* It has logarithmic complexity. +*/ +static void binarySearch2(const uint16_t *array, int32_t n, uint16_t target1, + uint16_t target2, int32_t *index1, int32_t *index2) { + const uint16_t *base1 = array; + const uint16_t *base2 = array; + if (n == 0) + return; + while (n > 1) { + int32_t half = n >> 1; + base1 = (base1[half] < target1) ? &base1[half] : base1; + base2 = (base2[half] < target2) ? &base2[half] : base2; + n -= half; + } + *index1 = (int32_t)((*base1 < target1) + base1 - array); + *index2 = (int32_t)((*base2 < target2) + base2 - array); +} + +/* Computes the intersection between one small and one large set of uint16_t. + * Stores the result into buffer and return the number of elements. + * Processes the small set in blocks of 4 values calling binarySearch4 + * and binarySearch2. 
This approach can be slightly superior to a conventional + * galloping search in some instances. + */ +int32_t intersect_skewed_uint16(const uint16_t *small, size_t size_s, + const uint16_t *large, size_t size_l, + uint16_t *buffer) { + size_t pos = 0, idx_l = 0, idx_s = 0; + + if (0 == size_s) { + return 0; + } + int32_t index1 = 0, index2 = 0, index3 = 0, index4 = 0; + while ((idx_s + 4 <= size_s) && (idx_l < size_l)) { + uint16_t target1 = small[idx_s]; + uint16_t target2 = small[idx_s + 1]; + uint16_t target3 = small[idx_s + 2]; + uint16_t target4 = small[idx_s + 3]; + binarySearch4(large + idx_l, (int32_t)(size_l - idx_l), target1, target2, target3, + target4, &index1, &index2, &index3, &index4); + if ((index1 + idx_l < size_l) && (large[idx_l + index1] == target1)) { + buffer[pos++] = target1; + } + if ((index2 + idx_l < size_l) && (large[idx_l + index2] == target2)) { + buffer[pos++] = target2; + } + if ((index3 + idx_l < size_l) && (large[idx_l + index3] == target3)) { + buffer[pos++] = target3; + } + if ((index4 + idx_l < size_l) && (large[idx_l + index4] == target4)) { + buffer[pos++] = target4; + } + idx_s += 4; + idx_l += index4; + } + if ((idx_s + 2 <= size_s) && (idx_l < size_l)) { + uint16_t target1 = small[idx_s]; + uint16_t target2 = small[idx_s + 1]; + binarySearch2(large + idx_l, (int32_t)(size_l - idx_l), target1, target2, &index1, + &index2); + if ((index1 + idx_l < size_l) && (large[idx_l + index1] == target1)) { + buffer[pos++] = target1; + } + if ((index2 + idx_l < size_l) && (large[idx_l + index2] == target2)) { + buffer[pos++] = target2; + } + idx_s += 2; + idx_l += index2; + } + if ((idx_s < size_s) && (idx_l < size_l)) { + uint16_t val_s = small[idx_s]; + int32_t index = binarySearch(large + idx_l, (int32_t)(size_l - idx_l), val_s); + if (index >= 0) + buffer[pos++] = val_s; + } + return (int32_t)pos; +} + + + +// TODO: this could be accelerated, possibly, by using binarySearch4 as above. 
/*
 * Counts the values common to a small and a large sorted set of uint16_t
 * without materialising them.
 *
 * small, size_s  sorted small set and its length
 * large, size_l  sorted large set and its length
 * returns        number of values present in both sets; 0 when either set is
 *                empty
 */
int32_t intersect_skewed_uint16_cardinality(const uint16_t *small,
                                            size_t size_s,
                                            const uint16_t *large,
                                            size_t size_l) {
    size_t pos = 0, idx_l = 0, idx_s = 0;

    // Both arrays are dereferenced at index 0 right below, so neither may be
    // empty.
    if (0 == size_s || 0 == size_l) {
        return 0;
    }

    uint16_t val_l = large[idx_l], val_s = small[idx_s];

    while (true) {
        if (val_l < val_s) {
            // advanceUntil is defined elsewhere in this file; a result equal
            // to size_l is treated as "large is exhausted".
            idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
            if (idx_l == size_l) break;
            val_l = large[idx_l];
        } else if (val_s < val_l) {
            idx_s++;
            if (idx_s == size_s) break;
            val_s = small[idx_s];
        } else {
            // Match: count it, then move both cursors forward.
            pos++;
            idx_s++;
            if (idx_s == size_s) break;
            val_s = small[idx_s];
            idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
            if (idx_l == size_l) break;
            val_l = large[idx_l];
        }
    }

    return (int32_t)pos;
}

/*
 * Tells whether a small and a large sorted set of uint16_t share at least one
 * value, stopping at the first match.
 *
 * returns  true when a common value exists; false otherwise, including when
 *          either set is empty
 */
bool intersect_skewed_uint16_nonempty(const uint16_t *small, size_t size_s,
                                      const uint16_t *large, size_t size_l) {
    size_t idx_l = 0, idx_s = 0;

    // Both arrays are dereferenced at index 0 right below, so neither may be
    // empty.
    if (0 == size_s || 0 == size_l) {
        return false;
    }

    uint16_t val_l = large[idx_l], val_s = small[idx_s];

    while (true) {
        if (val_l < val_s) {
            idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
            if (idx_l == size_l) break;
            val_l = large[idx_l];
        } else if (val_s < val_l) {
            idx_s++;
            if (idx_s == size_s) break;
            val_s = small[idx_s];
        } else {
            return true;
        }
    }

    return false;
}

/**
 * Generic intersection function.
// */
// The line above only terminates the "Generic intersection function" comment
// that is opened just before this block.

/*
 * Writes to out the values present in both sorted arrays A and B and returns
 * how many were written. Returns 0 when either array is empty.
 */
int32_t intersect_uint16(const uint16_t *A, const size_t lenA,
                         const uint16_t *B, const size_t lenB, uint16_t *out) {
    if (lenA == 0 || lenB == 0) return 0;
    const uint16_t *const stopA = A + lenA;
    const uint16_t *const stopB = B + lenB;
    uint16_t *cursor = out;

    // Classic two-pointer merge: advance whichever side holds the smaller
    // value, emit on equality.
    while (A != stopA && B != stopB) {
        const uint16_t left = *A;
        const uint16_t right = *B;
        if (left < right) {
            ++A;
        } else if (right < left) {
            ++B;
        } else {
            *cursor++ = left;
            ++A;
            ++B;
        }
    }
    return (int32_t)(cursor - out);
}

/*
 * Same walk as intersect_uint16, but only counts the common values.
 */
int32_t intersect_uint16_cardinality(const uint16_t *A, const size_t lenA,
                                     const uint16_t *B, const size_t lenB) {
    if (lenA == 0 || lenB == 0) return 0;
    const uint16_t *const stopA = A + lenA;
    const uint16_t *const stopB = B + lenB;
    int32_t matches = 0;

    while (A != stopA && B != stopB) {
        const uint16_t left = *A;
        const uint16_t right = *B;
        if (left < right) {
            ++A;
        } else if (right < left) {
            ++B;
        } else {
            ++matches;
            ++A;
            ++B;
        }
    }
    return matches;
}


/*
 * Returns true as soon as one value is found in both sorted arrays, false if
 * either array runs out first (or is empty to begin with).
 */
bool intersect_uint16_nonempty(const uint16_t *A, const size_t lenA,
                               const uint16_t *B, const size_t lenB) {
    if (lenA == 0 || lenB == 0) return 0;
    const uint16_t *const stopA = A + lenA;
    const uint16_t *const stopB = B + lenB;

    while (A != stopA && B != stopB) {
        const uint16_t left = *A;
        const uint16_t right = *B;
        if (left < right) {
            ++A;
        } else if (right < left) {
            ++B;
        } else {
            return true;
        }
    }
    return false;
}



/**
 * Generic intersection function (32-bit values): writes the values common to
 * the sorted arrays A and B to out and returns how many were written.
 */
size_t intersection_uint32(const uint32_t *A, const size_t lenA,
                           const uint32_t *B, const size_t lenB,
                           uint32_t *out) {
    if (lenA == 0 || lenB == 0) return 0;
    const uint32_t *const stopA = A + lenA;
    const uint32_t *const stopB = B + lenB;
    uint32_t *cursor = out;

    while (A != stopA && B != stopB) {
        const uint32_t left = *A;
        const uint32_t right = *B;
        if (left < right) {
            ++A;
        } else if (right < left) {
            ++B;
        } else {
            *cursor++ = left;
            ++A;
            ++B;
        }
    }
    return (size_t)(cursor - out);
}

/*
 * Counts the values common to the sorted 32-bit arrays A and B.
 */
size_t intersection_uint32_card(const uint32_t *A, const size_t lenA,
                                const uint32_t *B, const size_t lenB) {
    if (lenA == 0 || lenB == 0) return 0;
    const uint32_t *const stopA = A + lenA;
    const uint32_t *const stopB = B + lenB;
    size_t matches = 0;

    while (A != stopA && B != stopB) {
        const uint32_t left = *A;
        const uint32_t right = *B;
        if (left < right) {
            ++A;
        } else if (right < left) {
            ++B;
        } else {
            ++matches;
            ++A;
            ++B;
        }
    }
    return matches;
}

// can one vectorize the computation of the union? (Update: Yes! See
// union_vector16).
+ +size_t union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2, + size_t size_2, uint16_t *buffer) { + size_t pos = 0, idx_1 = 0, idx_2 = 0; + + if (0 == size_2) { + memmove(buffer, set_1, size_1 * sizeof(uint16_t)); + return size_1; + } + if (0 == size_1) { + memmove(buffer, set_2, size_2 * sizeof(uint16_t)); + return size_2; + } + + uint16_t val_1 = set_1[idx_1], val_2 = set_2[idx_2]; + + while (true) { + if (val_1 < val_2) { + buffer[pos++] = val_1; + ++idx_1; + if (idx_1 >= size_1) break; + val_1 = set_1[idx_1]; + } else if (val_2 < val_1) { + buffer[pos++] = val_2; + ++idx_2; + if (idx_2 >= size_2) break; + val_2 = set_2[idx_2]; + } else { + buffer[pos++] = val_1; + ++idx_1; + ++idx_2; + if (idx_1 >= size_1 || idx_2 >= size_2) break; + val_1 = set_1[idx_1]; + val_2 = set_2[idx_2]; + } + } + + if (idx_1 < size_1) { + const size_t n_elems = size_1 - idx_1; + memmove(buffer + pos, set_1 + idx_1, n_elems * sizeof(uint16_t)); + pos += n_elems; + } else if (idx_2 < size_2) { + const size_t n_elems = size_2 - idx_2; + memmove(buffer + pos, set_2 + idx_2, n_elems * sizeof(uint16_t)); + pos += n_elems; + } + + return pos; +} + +int difference_uint16(const uint16_t *a1, int length1, const uint16_t *a2, + int length2, uint16_t *a_out) { + int out_card = 0; + int k1 = 0, k2 = 0; + if (length1 == 0) return 0; + if (length2 == 0) { + if (a1 != a_out) memcpy(a_out, a1, sizeof(uint16_t) * length1); + return length1; + } + uint16_t s1 = a1[k1]; + uint16_t s2 = a2[k2]; + while (true) { + if (s1 < s2) { + a_out[out_card++] = s1; + ++k1; + if (k1 >= length1) { + break; + } + s1 = a1[k1]; + } else if (s1 == s2) { + ++k1; + ++k2; + if (k1 >= length1) { + break; + } + if (k2 >= length2) { + memmove(a_out + out_card, a1 + k1, + sizeof(uint16_t) * (length1 - k1)); + return out_card + length1 - k1; + } + s1 = a1[k1]; + s2 = a2[k2]; + } else { // if (val1>val2) + ++k2; + if (k2 >= length2) { + memmove(a_out + out_card, a1 + k1, + sizeof(uint16_t) * (length1 - k1)); + 
return out_card + length1 - k1; + } + s2 = a2[k2]; + } + } + return out_card; +} + +int32_t xor_uint16(const uint16_t *array_1, int32_t card_1, + const uint16_t *array_2, int32_t card_2, uint16_t *out) { + int32_t pos1 = 0, pos2 = 0, pos_out = 0; + while (pos1 < card_1 && pos2 < card_2) { + const uint16_t v1 = array_1[pos1]; + const uint16_t v2 = array_2[pos2]; + if (v1 == v2) { + ++pos1; + ++pos2; + continue; + } + if (v1 < v2) { + out[pos_out++] = v1; + ++pos1; + } else { + out[pos_out++] = v2; + ++pos2; + } + } + if (pos1 < card_1) { + const size_t n_elems = card_1 - pos1; + memcpy(out + pos_out, array_1 + pos1, n_elems * sizeof(uint16_t)); + pos_out += (int32_t)n_elems; + } else if (pos2 < card_2) { + const size_t n_elems = card_2 - pos2; + memcpy(out + pos_out, array_2 + pos2, n_elems * sizeof(uint16_t)); + pos_out += (int32_t)n_elems; + } + return pos_out; +} + +#if CROARING_IS_X64 + +/*** + * start of the SIMD 16-bit union code + * + */ +CROARING_TARGET_AVX2 + +// Assuming that vInput1 and vInput2 are sorted, produces a sorted output going +// from vecMin all the way to vecMax +// developed originally for merge sort using SIMD instructions. +// Standard merge. 
// See, e.g., Inoue and Taura, SIMD- and Cache-Friendly
// Algorithm for Sorting an Array of Structures
//
// vInput1, vInput2: the two vectors of eight 16-bit values to merge.
// vecMin, vecMax:   outputs (see the description above this function).
// Both inputs are read only by the first two statements, so an output may
// alias an input; union_vector16 passes the same vector as vInput2 and vecMax.
static inline void sse_merge(const __m128i *vInput1,
                             const __m128i *vInput2,  // input 1 & 2
                             __m128i *vecMin, __m128i *vecMax) {  // output
    __m128i vecTmp;
    // Lane-wise unsigned minimum and maximum of the two inputs.
    vecTmp = _mm_min_epu16(*vInput1, *vInput2);
    *vecMax = _mm_max_epu16(*vInput1, *vInput2);
    // Seven identical rounds follow: rotate the current minima by one 16-bit
    // lane (alignr of a register with itself by 2 bytes), then split again
    // into lane-wise minimum and maximum against vecMax.
    vecTmp = _mm_alignr_epi8(vecTmp, vecTmp, 2);
    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
    // Eighth and last one-lane rotation of the minima.
    *vecMin = _mm_alignr_epi8(*vecMin, *vecMin, 2);
}
CROARING_UNTARGET_AVX2
// used by store_unique, generated by simdunion.py
// Read in 16-byte rows: store_unique loads row M as a byte-shuffle key, where
// M is its 8-bit duplicate mask.
static uint8_t uniqshuf[] = {
    0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb,
    0xc, 0xd, 0xe, 0xf, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9,
    0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5,
    0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF,
    0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf,
    0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9,
    0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7,
    0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
    0x0, 0x1, 0x6,
0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, + 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, + 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x8, 0x9, + 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x8, 0x9, + 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, + 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0xa, 0xb, + 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0xa, 0xb, 0xc, 0xd, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, + 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x4, 0x5, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0xa, 0xb, 0xc, 0xd, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 
0xFF, 0x2, 0x3, 0xa, 0xb, + 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, + 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, + 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7, + 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, + 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, + 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x4, 0x5, 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8, 0x9, 0xc, 0xd, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xc, 0xd, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xc, 0xd, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, + 0x6, 0x7, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x4, 0x5, 0x6, 0x7, 0xc, 0xd, 0xe, 
0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0xc, 0xd, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7, + 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x6, 0x7, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0x4, 0x5, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0xc, 0xd, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0xc, 0xd, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0xc, 0xd, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, + 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0x8, 0x9, + 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, + 0xe, 0xf, 0xFF, 
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, + 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xa, 0xb, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x8, 0x9, + 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7, + 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7, 0xa, 0xb, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7, + 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x6, 0x7, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xa, 0xb, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, + 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x4, 0x5, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xa, 0xb, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 
0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, + 0x6, 0x7, 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7, + 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0x8, 0x9, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x8, 0x9, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x8, 0x9, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, + 0x6, 0x7, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x6, 0x7, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0x6, 0x7, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 
0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, + 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x4, 0x5, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0xe, 0xf, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0xe, 0xf, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, + 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, + 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, + 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, + 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 
0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8, 0x9, 0xa, 0xb, + 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, + 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, + 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0xa, 0xb, + 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7, + 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0x4, 0x5, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0xa, 0xb, 0xc, 0xd, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0xa, 0xb, + 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0xa, 0xb, + 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, + 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 
0xFF, + 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0x8, 0x9, + 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xc, 0xd, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, + 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x4, 0x5, 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xc, 0xd, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x8, 0x9, + 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0x6, 0x7, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0xc, 0xd, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7, + 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7, 0xc, 0xd, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7, + 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x6, 0x7, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 
0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xc, 0xd, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, + 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x4, 0x5, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xc, 0xd, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, + 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, + 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, + 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x8, 0x9, + 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x8, 0x9, + 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, + 0x6, 0x7, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x6, 0x7, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0x6, 0x7, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7, 0xa, 0xb, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0xa, 0xb, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0xa, 0xb, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, + 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x4, 0x5, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0xa, 0xb, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0xa, 0xb, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7, + 0x8, 
0x9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7, + 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x6, 0x7, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, + 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x4, 0x5, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8, 0x9, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, + 0x6, 0x7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x4, 0x5, 0x6, 0x7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x6, 0x7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x2, 0x3, 0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x1, 0x2, 0x3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF}; +CROARING_TARGET_AVX2 +// write vector new, while omitting repeated values assuming that previously +// written vector was "old" +static inline int store_unique(__m128i old, __m128i newval, uint16_t *output) { + __m128i vecTmp = _mm_alignr_epi8(newval, old, 16 - 2); + // lots of high latency instructions follow (optimize?) + int M = _mm_movemask_epi8( + _mm_packs_epi16(_mm_cmpeq_epi16(vecTmp, newval), _mm_setzero_si128())); + int numberofnewvalues = 8 - _mm_popcnt_u32(M); + __m128i key = _mm_lddqu_si128((const __m128i *)uniqshuf + M); + __m128i val = _mm_shuffle_epi8(newval, key); + _mm_storeu_si128((__m128i *)output, val); + return numberofnewvalues; +} +CROARING_UNTARGET_AVX2 + +// working in-place, this function overwrites the repeated values +// could be avoided? +static inline uint32_t unique(uint16_t *out, uint32_t len) { + uint32_t pos = 1; + for (uint32_t i = 1; i < len; ++i) { + if (out[i] != out[i - 1]) { + out[pos++] = out[i]; + } + } + return pos; +} + +// use with qsort, could be avoided +static int uint16_compare(const void *a, const void *b) { + return (*(uint16_t *)a - *(uint16_t *)b); +} + +CROARING_TARGET_AVX2 +// a one-pass SSE union algorithm +// This function may not be safe if array1 == output or array2 == output. 
+uint32_t union_vector16(const uint16_t *__restrict__ array1, uint32_t length1, + const uint16_t *__restrict__ array2, uint32_t length2, + uint16_t *__restrict__ output) { + if ((length1 < 8) || (length2 < 8)) { + return (uint32_t)union_uint16(array1, length1, array2, length2, output); + } + __m128i vA, vB, V, vecMin, vecMax; + __m128i laststore; + uint16_t *initoutput = output; + uint32_t len1 = length1 / 8; + uint32_t len2 = length2 / 8; + uint32_t pos1 = 0; + uint32_t pos2 = 0; + // we start the machine + vA = _mm_lddqu_si128((const __m128i *)array1 + pos1); + pos1++; + vB = _mm_lddqu_si128((const __m128i *)array2 + pos2); + pos2++; + sse_merge(&vA, &vB, &vecMin, &vecMax); + laststore = _mm_set1_epi16(-1); + output += store_unique(laststore, vecMin, output); + laststore = vecMin; + if ((pos1 < len1) && (pos2 < len2)) { + uint16_t curA, curB; + curA = array1[8 * pos1]; + curB = array2[8 * pos2]; + while (true) { + if (curA <= curB) { + V = _mm_lddqu_si128((const __m128i *)array1 + pos1); + pos1++; + if (pos1 < len1) { + curA = array1[8 * pos1]; + } else { + break; + } + } else { + V = _mm_lddqu_si128((const __m128i *)array2 + pos2); + pos2++; + if (pos2 < len2) { + curB = array2[8 * pos2]; + } else { + break; + } + } + sse_merge(&V, &vecMax, &vecMin, &vecMax); + output += store_unique(laststore, vecMin, output); + laststore = vecMin; + } + sse_merge(&V, &vecMax, &vecMin, &vecMax); + output += store_unique(laststore, vecMin, output); + laststore = vecMin; + } + // we finish the rest off using a scalar algorithm + // could be improved? 
/*
 * In-place "xor" compaction of out[0..len): a value that is immediately
 * followed by an equal value cancels out (both copies are dropped), every
 * other value is kept. Returns the number of values left at the front of
 * `out`.
 *
 * Only adjacent pairs are detected, so the input is expected to be sorted.
 * NOTE(review): the cancel step simply steps `pos` back by one, which is only
 * sound when no value occurs more than twice in a row - presumably guaranteed
 * by the callers, which merge two duplicate-free arrays; confirm before
 * reusing this elsewhere.
 *
 * An empty range returns 0 (the original returned 1 and documented
 * "assumes len > 0").
 */
static inline uint32_t unique_xor(uint16_t *out, uint32_t len) {
    if (len == 0) {
        return 0;
    }
    uint32_t pos = 1;
    for (uint32_t i = 1; i < len; ++i) {
        if (out[i] != out[i - 1]) {
            out[pos++] = out[i];
        } else {
            pos--;  // identical to the previous value: drop that one as well
        }
    }
    return pos;
}
/*
 * Merge the two ascending arrays set_1[0..size_1) and set_2[0..size_2) into
 * `buffer`, writing a value present in both inputs only once. Returns the
 * number of values written. `buffer` must have room for size_1 + size_2
 * values.
 */
size_t union_uint32(const uint32_t *set_1, size_t size_1, const uint32_t *set_2,
                    size_t size_2, uint32_t *buffer) {
    // One side empty: the union is a plain copy of the other side.
    if (size_2 == 0) {
        memmove(buffer, set_1, size_1 * sizeof(uint32_t));
        return size_1;
    }
    if (size_1 == 0) {
        memmove(buffer, set_2, size_2 * sizeof(uint32_t));
        return size_2;
    }

    size_t i = 0, j = 0, out = 0;
    // The current head of each input is cached and only re-read after that
    // input has been advanced.
    uint32_t a = set_1[0];
    uint32_t b = set_2[0];
    for (;;) {
        if (a == b) {
            buffer[out++] = a;
            ++i;
            ++j;
            if (i >= size_1 || j >= size_2) break;
            a = set_1[i];
            b = set_2[j];
        } else if (a < b) {
            buffer[out++] = a;
            ++i;
            if (i >= size_1) break;
            a = set_1[i];
        } else {
            buffer[out++] = b;
            ++j;
            if (j >= size_2) break;
            b = set_2[j];
        }
    }

    // At most one input still has values left; append them unchanged.
    if (i < size_1) {
        const size_t remaining = size_1 - i;
        memmove(buffer + out, set_1 + i, remaining * sizeof(uint32_t));
        out += remaining;
    } else if (j < size_2) {
        const size_t remaining = size_2 - j;
        memmove(buffer + out, set_2 + j, remaining * sizeof(uint32_t));
        out += remaining;
    }
    return out;
}

/*
 * Cardinality of the union of two ascending arrays, computed with the same
 * merge walk as union_uint32() but without writing any output.
 */
size_t union_uint32_card(const uint32_t *set_1, size_t size_1,
                         const uint32_t *set_2, size_t size_2) {
    size_t i = 0, j = 0, card = 0;
    while (i < size_1 && j < size_2) {
        const uint32_t a = set_1[i];
        const uint32_t b = set_2[j];
        // Advance whichever side holds the smaller head; both on a tie.
        if (a <= b) ++i;
        if (b <= a) ++j;
        ++card;
    }
    // Whatever is left on either side (at most one side is non-empty) counts
    // in full.
    return card + (size_1 - i) + (size_2 - j);
}
first + if (size_1 < size_2) { + return union_vector16(set_1, (uint32_t)size_1, + set_2, (uint32_t)size_2, buffer); + } else { + return union_vector16(set_2, (uint32_t)size_2, + set_1, (uint32_t)size_1, buffer); + } + } else { + // compute union with smallest array first + if (size_1 < size_2) { + return union_uint16( + set_1, size_1, set_2, size_2, buffer); + } else { + return union_uint16( + set_2, size_2, set_1, size_1, buffer); + } + } +#else + // compute union with smallest array first + if (size_1 < size_2) { + return union_uint16( + set_1, size_1, set_2, size_2, buffer); + } else { + return union_uint16( + set_2, size_2, set_1, size_1, buffer); + } +#endif +} +#if CROARING_IS_X64 +#if CROARING_COMPILER_SUPPORTS_AVX512 +CROARING_TARGET_AVX512 +static inline bool _avx512_memequals(const void *s1, const void *s2, size_t n) { + const uint8_t *ptr1 = (const uint8_t *)s1; + const uint8_t *ptr2 = (const uint8_t *)s2; + const uint8_t *end1 = ptr1 + n; + const uint8_t *end8 = ptr1 + ((n >> 3) << 3); + const uint8_t *end32 = ptr1 + ((n >> 5) << 5); + const uint8_t *end64 = ptr1 + ((n >> 6) << 6); + + while (ptr1 < end64){ + __m512i r1 = _mm512_loadu_si512((const __m512i*)ptr1); + __m512i r2 = _mm512_loadu_si512((const __m512i*)ptr2); + + uint64_t mask = _mm512_cmpeq_epi8_mask(r1, r2); + + if (mask != UINT64_MAX) { + return false; + } + + ptr1 += 64; + ptr2 += 64; + + } + + while (ptr1 < end32) { + __m256i r1 = _mm256_loadu_si256((const __m256i*)ptr1); + __m256i r2 = _mm256_loadu_si256((const __m256i*)ptr2); + int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(r1, r2)); + if ((uint32_t)mask != UINT32_MAX) { + return false; + } + ptr1 += 32; + ptr2 += 32; + } + + while (ptr1 < end8) { + uint64_t v1, v2; + memcpy(&v1,ptr1,sizeof(uint64_t)); + memcpy(&v2,ptr2,sizeof(uint64_t)); + if (v1 != v2) { + return false; + } + ptr1 += 8; + ptr2 += 8; + } + + while (ptr1 < end1) { + if (*ptr1 != *ptr2) { + return false; + } + ptr1++; + ptr2++; + } + + return true; +} 
/*
 * Returns true when the n bytes at s1 and s2 are identical.
 * A length of zero compares equal without touching either pointer. On x64
 * builds the widest SIMD routine reported by croaring_hardware_support() is
 * used; every other case falls back to memcmp().
 */
bool memequals(const void *s1, const void *s2, size_t n) {
    if (n == 0) {
        return true;
    }
#if CROARING_IS_X64
    const int features = croaring_hardware_support();
#if CROARING_COMPILER_SUPPORTS_AVX512
    if (features & ROARING_SUPPORTS_AVX512) {
        return _avx512_memequals(s1, s2, n);
    }
#endif  // CROARING_COMPILER_SUPPORTS_AVX512
    if (features & ROARING_SUPPORTS_AVX2) {
        return _avx2_memequals(s1, s2, n);
    }
#endif  // CROARING_IS_X64
    return memcmp(s1, s2, n) == 0;
}
voutput = _mm512_add_epi32(_mm512_cvtepu16_epi32(vinput), _mm512_set1_epi32(base)); + _mm512_storeu_si512((__m512i*)(out + outpos), voutput); + outpos += sizeof(__m512i)/sizeof(uint32_t); + } + for ( ; i < cardinality; ++i) { + const uint32_t val = base + array[i]; + memcpy(out + outpos, &val, + sizeof(uint32_t)); // should be compiled as a MOV on x64 + outpos++; + } + return outpos; +} +CROARING_UNTARGET_AVX512 +#endif // #if CROARING_COMPILER_SUPPORTS_AVX512 +#endif // #if CROARING_IS_X64 + + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif +/* end file src/array_util.c */ +/* begin file src/bitset.c */ +#include +#include +#include +#include +#include + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +/* Create a new bitset. Return NULL in case of failure. */ +bitset_t *bitset_create(void) { + bitset_t *bitset = NULL; + /* Allocate the bitset itself. */ + if ((bitset = (bitset_t *)roaring_malloc(sizeof(bitset_t))) == NULL) { + return NULL; + } + bitset->array = NULL; + bitset->arraysize = 0; + bitset->capacity = 0; + return bitset; +} + +/* Create a new bitset able to contain size bits. Return NULL in case of + * failure. */ +bitset_t *bitset_create_with_capacity(size_t size) { + bitset_t *bitset = NULL; + /* Allocate the bitset itself. */ + if ((bitset = (bitset_t *)roaring_malloc(sizeof(bitset_t))) == NULL) { + return NULL; + } + bitset->arraysize = + (size + sizeof(uint64_t) * 8 - 1) / (sizeof(uint64_t) * 8); + bitset->capacity = bitset->arraysize; + if ((bitset->array = + (uint64_t *)roaring_calloc(bitset->arraysize, sizeof(uint64_t))) == NULL) { + roaring_free(bitset); + return NULL; + } + return bitset; +} + +/* Create a copy */ +bitset_t *bitset_copy(const bitset_t *bitset) { + bitset_t *copy = NULL; + /* Allocate the bitset itself. 
*/ + if ((copy = (bitset_t *)roaring_malloc(sizeof(bitset_t))) == NULL) { + return NULL; + } + memcpy(copy, bitset, sizeof(bitset_t)); + copy->capacity = copy->arraysize; + if ((copy->array = (uint64_t *)roaring_malloc(sizeof(uint64_t) * + bitset->arraysize)) == NULL) { + roaring_free(copy); + return NULL; + } + memcpy(copy->array, bitset->array, sizeof(uint64_t) * bitset->arraysize); + return copy; +} + +void bitset_clear(bitset_t *bitset) { + memset(bitset->array, 0, sizeof(uint64_t) * bitset->arraysize); +} + +void bitset_fill(bitset_t *bitset) { + memset(bitset->array, 0xff, sizeof(uint64_t) * bitset->arraysize); +} + +void bitset_shift_left(bitset_t *bitset, size_t s) { + size_t extra_words = s / 64; + int inword_shift = s % 64; + size_t as = bitset->arraysize; + if (inword_shift == 0) { + bitset_resize(bitset, as + extra_words, false); + // could be done with a memmove + for (size_t i = as + extra_words; i > extra_words; i--) { + bitset->array[i - 1] = bitset->array[i - 1 - extra_words]; + } + } else { + bitset_resize(bitset, as + extra_words + 1, true); + bitset->array[as + extra_words] = + bitset->array[as - 1] >> (64 - inword_shift); + for (size_t i = as + extra_words; i >= extra_words + 2; i--) { + bitset->array[i - 1] = + (bitset->array[i - 1 - extra_words] << inword_shift) | + (bitset->array[i - 2 - extra_words] >> (64 - inword_shift)); + } + bitset->array[extra_words] = bitset->array[0] << inword_shift; + } + for (size_t i = 0; i < extra_words; i++) { + bitset->array[i] = 0; + } +} + +void bitset_shift_right(bitset_t *bitset, size_t s) { + size_t extra_words = s / 64; + int inword_shift = s % 64; + size_t as = bitset->arraysize; + if (inword_shift == 0) { + // could be done with a memmove + for (size_t i = 0; i < as - extra_words; i++) { + bitset->array[i] = bitset->array[i + extra_words]; + } + bitset_resize(bitset, as - extra_words, false); + + } else { + for (size_t i = 0; i + extra_words + 1 < as; i++) { + bitset->array[i] = + (bitset->array[i + 
extra_words] >> inword_shift) | + (bitset->array[i + extra_words + 1] << (64 - inword_shift)); + } + bitset->array[as - extra_words - 1] = + (bitset->array[as - 1] >> inword_shift); + bitset_resize(bitset, as - extra_words, false); + } +} + +/* Free memory. */ +void bitset_free(bitset_t *bitset) { + if(bitset == NULL) { return; } + roaring_free(bitset->array); + roaring_free(bitset); +} + +/* Resize the bitset so that it can support newarraysize * 64 bits. Return true + * in case of success, false for failure. */ +bool bitset_resize(bitset_t *bitset, size_t newarraysize, bool padwithzeroes) { + if(newarraysize > SIZE_MAX/64) { return false; } + size_t smallest = + newarraysize < bitset->arraysize ? newarraysize : bitset->arraysize; + if (bitset->capacity < newarraysize) { + uint64_t *newarray; + size_t newcapacity = bitset->capacity; + if(newcapacity == 0) { newcapacity = 1; } + while(newcapacity < newarraysize) { newcapacity *= 2; } + if ((newarray = (uint64_t *) roaring_realloc(bitset->array, sizeof(uint64_t) * newcapacity)) == NULL) { + return false; + } + bitset->capacity = newcapacity; + bitset->array = newarray; + } + if (padwithzeroes && (newarraysize > smallest)) + memset(bitset->array + smallest, 0, + sizeof(uint64_t) * (newarraysize - smallest)); + bitset->arraysize = newarraysize; + return true; // success! 
+} + +size_t bitset_count(const bitset_t *bitset) { + size_t card = 0; + size_t k = 0; + for (; k + 7 < bitset->arraysize; k += 8) { + card += roaring_hamming(bitset->array[k]); + card += roaring_hamming(bitset->array[k + 1]); + card += roaring_hamming(bitset->array[k + 2]); + card += roaring_hamming(bitset->array[k + 3]); + card += roaring_hamming(bitset->array[k + 4]); + card += roaring_hamming(bitset->array[k + 5]); + card += roaring_hamming(bitset->array[k + 6]); + card += roaring_hamming(bitset->array[k + 7]); + } + for (; k + 3 < bitset->arraysize; k += 4) { + card += roaring_hamming(bitset->array[k]); + card += roaring_hamming(bitset->array[k + 1]); + card += roaring_hamming(bitset->array[k + 2]); + card += roaring_hamming(bitset->array[k + 3]); + } + for (; k < bitset->arraysize; k++) { + card += roaring_hamming(bitset->array[k]); + } + return card; +} + +bool bitset_inplace_union(bitset_t *CBITSET_RESTRICT b1, + const bitset_t *CBITSET_RESTRICT b2) { + size_t minlength = + b1->arraysize < b2->arraysize ? 
b1->arraysize : b2->arraysize; + for (size_t k = 0; k < minlength; ++k) { + b1->array[k] |= b2->array[k]; + } + if (b2->arraysize > b1->arraysize) { + size_t oldsize = b1->arraysize; + if (!bitset_resize(b1, b2->arraysize, false)) return false; + memcpy(b1->array + oldsize, b2->array + oldsize, + (b2->arraysize - oldsize) * sizeof(uint64_t)); + } + return true; +} + +size_t bitset_minimum(const bitset_t *bitset) { + for (size_t k = 0; k < bitset->arraysize; k++) { + uint64_t w = bitset->array[k]; + if (w != 0) { + return roaring_trailing_zeroes(w) + k * 64; + } + } + return 0; +} + +bool bitset_grow(bitset_t *bitset, size_t newarraysize) { + if(newarraysize < bitset->arraysize) { return false; } + if(newarraysize > SIZE_MAX/64) { return false; } + if (bitset->capacity < newarraysize) { + uint64_t *newarray; + size_t newcapacity = (UINT64_C(0xFFFFFFFFFFFFFFFF) >> roaring_leading_zeroes(newarraysize)) + 1; + while(newcapacity < newarraysize) { newcapacity *= 2; } + if ((newarray = (uint64_t *) roaring_realloc(bitset->array, sizeof(uint64_t) * newcapacity)) == NULL) { + return false; + } + bitset->capacity = newcapacity; + bitset->array = newarray; + } + memset(bitset->array + bitset->arraysize, 0, + sizeof(uint64_t) * (newarraysize - bitset->arraysize)); + bitset->arraysize = newarraysize; + return true; // success! +} + +size_t bitset_maximum(const bitset_t *bitset) { + for (size_t k = bitset->arraysize; k > 0; k--) { + uint64_t w = bitset->array[k - 1]; + if (w != 0) { + return 63 - roaring_leading_zeroes(w) + (k - 1) * 64; + } + } + return 0; +} + +/* Returns true if bitsets share no common elements, false otherwise. + * + * Performs early-out if common element found. */ +bool bitsets_disjoint(const bitset_t *CBITSET_RESTRICT b1, const bitset_t *CBITSET_RESTRICT b2) { + size_t minlength = + b1->arraysize < b2->arraysize ? 
b1->arraysize : b2->arraysize; + + for (size_t k = 0; k < minlength; k++) { + if ((b1->array[k] & b2->array[k]) != 0) return false; + } + return true; +} + +/* Returns true if bitsets contain at least 1 common element, false if they are + * disjoint. + * + * Performs early-out if common element found. */ +bool bitsets_intersect(const bitset_t *CBITSET_RESTRICT b1, const bitset_t *CBITSET_RESTRICT b2) { + size_t minlength = + b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize; + + for (size_t k = 0; k < minlength; k++) { + if ((b1->array[k] & b2->array[k]) != 0) return true; + } + return false; +} + +/* Returns true if b has any bits set in or after b->array[starting_loc]. */ +static bool any_bits_set(const bitset_t *b, size_t starting_loc) { + if (starting_loc >= b->arraysize) { + return false; + } + for (size_t k = starting_loc; k < b->arraysize; k++) { + if (b->array[k] != 0) return true; + } + return false; +} + +/* Returns true if b1 has all of b2's bits set. + * + * Performs early out if a bit is found in b2 that is not found in b1. */ +bool bitset_contains_all(const bitset_t *CBITSET_RESTRICT b1, const bitset_t *CBITSET_RESTRICT b2) { + size_t min_size = b1->arraysize; + if(b1->arraysize > b2->arraysize) { + min_size = b2->arraysize; + } + for (size_t k = 0; k < min_size; k++) { + if ((b1->array[k] & b2->array[k]) != b2->array[k]) { + return false; + } + } + if (b2->arraysize > b1->arraysize) { + /* Need to check if b2 has any bits set beyond b1's array */ + return !any_bits_set(b2, b1->arraysize); + } + return true; +} + +size_t bitset_union_count(const bitset_t *CBITSET_RESTRICT b1, + const bitset_t *CBITSET_RESTRICT b2) { + size_t answer = 0; + size_t minlength = + b1->arraysize < b2->arraysize ? 
b1->arraysize : b2->arraysize; + size_t k = 0; + for (; k + 3 < minlength; k += 4) { + answer += roaring_hamming(b1->array[k] | b2->array[k]); + answer += roaring_hamming(b1->array[k + 1] | b2->array[k + 1]); + answer += roaring_hamming(b1->array[k + 2] | b2->array[k + 2]); + answer += roaring_hamming(b1->array[k + 3] | b2->array[k + 3]); + } + for (; k < minlength; ++k) { + answer += roaring_hamming(b1->array[k] | b2->array[k]); + } + if (b2->arraysize > b1->arraysize) { + // k is equal to b1->arraysize + for (; k + 3 < b2->arraysize; k += 4) { + answer += roaring_hamming(b2->array[k]); + answer += roaring_hamming(b2->array[k + 1]); + answer += roaring_hamming(b2->array[k + 2]); + answer += roaring_hamming(b2->array[k + 3]); + } + for (; k < b2->arraysize; ++k) { + answer += roaring_hamming(b2->array[k]); + } + } else { + // k is equal to b2->arraysize + for (; k + 3 < b1->arraysize; k += 4) { + answer += roaring_hamming(b1->array[k]); + answer += roaring_hamming(b1->array[k + 1]); + answer += roaring_hamming(b1->array[k + 2]); + answer += roaring_hamming(b1->array[k + 3]); + } + for (; k < b1->arraysize; ++k) { + answer += roaring_hamming(b1->array[k]); + } + } + return answer; +} + +void bitset_inplace_intersection(bitset_t *CBITSET_RESTRICT b1, + const bitset_t *CBITSET_RESTRICT b2) { + size_t minlength = + b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize; + size_t k = 0; + for (; k < minlength; ++k) { + b1->array[k] &= b2->array[k]; + } + for (; k < b1->arraysize; ++k) { + b1->array[k] = 0; // memset could, maybe, be a tiny bit faster + } +} + +size_t bitset_intersection_count(const bitset_t *CBITSET_RESTRICT b1, + const bitset_t *CBITSET_RESTRICT b2) { + size_t answer = 0; + size_t minlength = + b1->arraysize < b2->arraysize ? 
b1->arraysize : b2->arraysize; + for (size_t k = 0; k < minlength; ++k) { + answer += roaring_hamming(b1->array[k] & b2->array[k]); + } + return answer; +} + +void bitset_inplace_difference(bitset_t *CBITSET_RESTRICT b1, + const bitset_t *CBITSET_RESTRICT b2) { + size_t minlength = + b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize; + size_t k = 0; + for (; k < minlength; ++k) { + b1->array[k] &= ~(b2->array[k]); + } +} + +size_t bitset_difference_count(const bitset_t *CBITSET_RESTRICT b1, + const bitset_t *CBITSET_RESTRICT b2) { + size_t minlength = + b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize; + size_t k = 0; + size_t answer = 0; + for (; k < minlength; ++k) { + answer += roaring_hamming(b1->array[k] & ~(b2->array[k])); + } + for (; k < b1->arraysize; ++k) { + answer += roaring_hamming(b1->array[k]); + } + return answer; +} + +bool bitset_inplace_symmetric_difference(bitset_t *CBITSET_RESTRICT b1, + const bitset_t *CBITSET_RESTRICT b2) { + size_t minlength = + b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize; + size_t k = 0; + for (; k < minlength; ++k) { + b1->array[k] ^= b2->array[k]; + } + if (b2->arraysize > b1->arraysize) { + size_t oldsize = b1->arraysize; + if (!bitset_resize(b1, b2->arraysize, false)) return false; + memcpy(b1->array + oldsize, b2->array + oldsize, + (b2->arraysize - oldsize) * sizeof(uint64_t)); + } + return true; +} + +size_t bitset_symmetric_difference_count(const bitset_t *CBITSET_RESTRICT b1, + const bitset_t *CBITSET_RESTRICT b2) { + size_t minlength = + b1->arraysize < b2->arraysize ? 
b1->arraysize : b2->arraysize; + size_t k = 0; + size_t answer = 0; + for (; k < minlength; ++k) { + answer += roaring_hamming(b1->array[k] ^ b2->array[k]); + } + if (b2->arraysize > b1->arraysize) { + for (; k < b2->arraysize; ++k) { + answer += roaring_hamming(b2->array[k]); + } + } else { + for (; k < b1->arraysize; ++k) { + answer += roaring_hamming(b1->array[k]); + } + } + return answer; +} + +bool bitset_trim(bitset_t *bitset) { + size_t newsize = bitset->arraysize; + while (newsize > 0) { + if (bitset->array[newsize - 1] == 0) + newsize -= 1; + else + break; + } + if (bitset->capacity == newsize) return true; // nothing to do + uint64_t *newarray; + if ((newarray = (uint64_t *)roaring_realloc( + bitset->array, sizeof(uint64_t) * newsize)) == NULL) { + return false; + } + bitset->array = newarray; + bitset->capacity = newsize; + bitset->arraysize = newsize; + return true; +} + + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif +/* end file src/bitset.c */ +/* begin file src/bitset_util.c */ +#include +#include +#include +#include +#include + + +#if CROARING_IS_X64 +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." 
+#endif // CROARING_COMPILER_SUPPORTS_AVX512 +#endif + +#ifdef __cplusplus +using namespace ::roaring::internal; +extern "C" { namespace roaring { namespace api { +#endif + +#if CROARING_IS_X64 +static uint8_t lengthTable[256] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; +#endif + +#if CROARING_IS_X64 +ALIGNED(32) +static uint32_t vecDecodeTable[256][8] = { + {0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */ + {1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */ + {2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */ + {1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */ + {3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */ + {1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */ + {2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */ + {1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */ + {4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */ + {1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */ + {2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */ + {1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */ + {3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */ + {1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */ + {2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */ + {1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */ + {5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */ + {1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 
(00010001) */ + {2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */ + {1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */ + {3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */ + {1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */ + {2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */ + {1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */ + {4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */ + {1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */ + {2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */ + {1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */ + {3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */ + {1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */ + {2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */ + {1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */ + {6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */ + {1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */ + {2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */ + {1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */ + {3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */ + {1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */ + {2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */ + {1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */ + {4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */ + {1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */ + {2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */ + {1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */ + {3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */ + {1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */ + {2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */ + {1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */ + {5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */ + {1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */ + {2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */ + {1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */ + {3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */ + {1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */ + {2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */ + {1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */ + {4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */ + {1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 
(00111001) */ + {2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */ + {1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */ + {3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */ + {1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */ + {2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */ + {1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */ + {7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */ + {1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */ + {2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */ + {1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */ + {3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */ + {1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */ + {2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */ + {1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */ + {4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */ + {1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */ + {2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */ + {1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */ + {3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */ + {1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */ + {2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */ + {1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */ + {5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */ + {1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */ + {2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */ + {1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */ + {3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */ + {1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */ + {2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */ + {1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */ + {4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */ + {1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */ + {2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */ + {1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */ + {3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */ + {1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */ + {2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */ + {1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */ + {6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */ + {1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 
(01100001) */ + {2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */ + {1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */ + {3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */ + {1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */ + {2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */ + {1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */ + {4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */ + {1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */ + {2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */ + {1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */ + {3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */ + {1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */ + {2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */ + {1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */ + {5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */ + {1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */ + {2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */ + {1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */ + {3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */ + {1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */ + {2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */ + {1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */ + {4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */ + {1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */ + {2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */ + {1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */ + {3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */ + {1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */ + {2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */ + {1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */ + {8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */ + {1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */ + {2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */ + {1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */ + {3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */ + {1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */ + {2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */ + {1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */ + {4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */ + {1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 
(10001001) */ + {2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */ + {1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */ + {3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */ + {1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */ + {2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */ + {1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */ + {5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */ + {1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */ + {2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */ + {1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */ + {3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */ + {1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */ + {2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */ + {1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */ + {4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */ + {1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */ + {2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */ + {1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */ + {3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */ + {1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */ + {2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */ + {1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */ + {6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */ + {1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */ + {2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */ + {1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */ + {3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */ + {1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */ + {2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */ + {1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */ + {4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */ + {1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */ + {2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */ + {1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */ + {3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */ + {1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */ + {2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */ + {1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */ + {5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */ + {1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 
(10110001) */ + {2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */ + {1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */ + {3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */ + {1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */ + {2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */ + {1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */ + {4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */ + {1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */ + {2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */ + {1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */ + {3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */ + {1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */ + {2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */ + {1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */ + {7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */ + {1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */ + {2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */ + {1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */ + {3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */ + {1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */ + {2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */ + {1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */ + {4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */ + {1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */ + {2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */ + {1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */ + {3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */ + {1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */ + {2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */ + {1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */ + {5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */ + {1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */ + {2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */ + {1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */ + {3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */ + {1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */ + {2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */ + {1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */ + {4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */ + {1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 
(11011001) */ + {2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */ + {1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */ + {3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */ + {1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */ + {2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */ + {1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */ + {6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */ + {1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */ + {2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */ + {1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */ + {3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */ + {1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */ + {2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */ + {1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */ + {4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */ + {1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */ + {2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */ + {1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */ + {3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */ + {1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */ + {2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */ + {1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */ + {5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */ + {1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */ + {2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */ + {1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */ + {3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */ + {1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */ + {2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */ + {1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */ + {4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */ + {1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */ + {2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */ + {1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */ + {3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */ + {1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */ + {2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */ + {1, 2, 3, 4, 5, 6, 7, 8} /* 0xFF (11111111) */ +}; + +#endif // #if CROARING_IS_X64 + +#if CROARING_IS_X64 +// same as vecDecodeTable 
but in 16 bits +ALIGNED(32) +static uint16_t vecDecodeTable_uint16[256][8] = { + {0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */ + {1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */ + {2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */ + {1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */ + {3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */ + {1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */ + {2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */ + {1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */ + {4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */ + {1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */ + {2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */ + {1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */ + {3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */ + {1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */ + {2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */ + {1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */ + {5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */ + {1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */ + {2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */ + {1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */ + {3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */ + {1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */ + {2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */ + {1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */ + {4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */ + {1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */ + {2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */ + {1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */ + {3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */ + {1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */ + {2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */ + {1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */ + {6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */ + {1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */ + {2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */ + {1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */ + {3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */ + {1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */ + {2, 3, 6, 0, 0, 0, 
0, 0}, /* 0x26 (00100110) */ + {1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */ + {4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */ + {1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */ + {2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */ + {1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */ + {3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */ + {1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */ + {2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */ + {1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */ + {5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */ + {1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */ + {2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */ + {1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */ + {3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */ + {1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */ + {2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */ + {1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */ + {4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */ + {1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */ + {2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */ + {1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */ + {3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */ + {1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */ + {2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */ + {1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */ + {7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */ + {1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */ + {2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */ + {1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */ + {3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */ + {1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */ + {2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */ + {1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */ + {4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */ + {1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */ + {2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */ + {1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */ + {3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */ + {1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */ + {2, 3, 4, 7, 0, 0, 
0, 0}, /* 0x4E (01001110) */ + {1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */ + {5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */ + {1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */ + {2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */ + {1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */ + {3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */ + {1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */ + {2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */ + {1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */ + {4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */ + {1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */ + {2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */ + {1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */ + {3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */ + {1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */ + {2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */ + {1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */ + {6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */ + {1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */ + {2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */ + {1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */ + {3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */ + {1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */ + {2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */ + {1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */ + {4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */ + {1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */ + {2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */ + {1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */ + {3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */ + {1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */ + {2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */ + {1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */ + {5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */ + {1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */ + {2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */ + {1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */ + {3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */ + {1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */ + {2, 3, 5, 6, 7, 0, 
0, 0}, /* 0x76 (01110110) */ + {1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */ + {4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */ + {1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */ + {2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */ + {1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */ + {3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */ + {1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */ + {2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */ + {1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */ + {8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */ + {1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */ + {2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */ + {1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */ + {3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */ + {1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */ + {2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */ + {1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */ + {4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */ + {1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */ + {2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */ + {1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */ + {3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */ + {1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */ + {2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */ + {1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */ + {5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */ + {1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */ + {2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */ + {1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */ + {3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */ + {1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */ + {2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */ + {1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */ + {4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */ + {1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */ + {2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */ + {1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */ + {3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */ + {1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */ + {2, 3, 4, 5, 8, 0, 
0, 0}, /* 0x9E (10011110) */ + {1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */ + {6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */ + {1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */ + {2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */ + {1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */ + {3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */ + {1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */ + {2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */ + {1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */ + {4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */ + {1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */ + {2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */ + {1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */ + {3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */ + {1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */ + {2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */ + {1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */ + {5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */ + {1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */ + {2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */ + {1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */ + {3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */ + {1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */ + {2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */ + {1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */ + {4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */ + {1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */ + {2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */ + {1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */ + {3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */ + {1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */ + {2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */ + {1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */ + {7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */ + {1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */ + {2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */ + {1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */ + {3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */ + {1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */ + {2, 3, 7, 8, 0, 0, 
0, 0}, /* 0xC6 (11000110) */ + {1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */ + {4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */ + {1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */ + {2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */ + {1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */ + {3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */ + {1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */ + {2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */ + {1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */ + {5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */ + {1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */ + {2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */ + {1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */ + {3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */ + {1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */ + {2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */ + {1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */ + {4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */ + {1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */ + {2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */ + {1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */ + {3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */ + {1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */ + {2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */ + {1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */ + {6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */ + {1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */ + {2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */ + {1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */ + {3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */ + {1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */ + {2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */ + {1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */ + {4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */ + {1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */ + {2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */ + {1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */ + {3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */ + {1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */ + {2, 3, 4, 6, 7, 8, 
0, 0}, /* 0xEE (11101110) */ + {1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */ + {5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */ + {1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */ + {2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */ + {1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */ + {3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */ + {1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */ + {2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */ + {1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */ + {4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */ + {1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */ + {2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */ + {1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */ + {3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */ + {1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */ + {2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */ + {1, 2, 3, 4, 5, 6, 7, 8} /* 0xFF (11111111) */ +}; + +#endif + +#if CROARING_IS_X64 +#if CROARING_COMPILER_SUPPORTS_AVX512 +CROARING_TARGET_AVX512 +const uint8_t vbmi2_table[64] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; +size_t bitset_extract_setbits_avx512(const uint64_t *words, size_t length, uint32_t *vout, + size_t outcapacity, uint32_t base) { + uint32_t *out = (uint32_t *)vout; + uint32_t *initout = out; + uint32_t *safeout = out + outcapacity; + __m512i base_v = _mm512_set1_epi32(base); + __m512i index_table = _mm512_loadu_si512(vbmi2_table); + size_t i = 0; + + for (; (i < length) && ((out + 64) < safeout); i += 1) + { + uint64_t v = words[i]; + __m512i vec = _mm512_maskz_compress_epi8(v, index_table); + + uint8_t advance = roaring_hamming(v); + + __m512i vbase = _mm512_add_epi32(base_v, _mm512_set1_epi32(i * 64)); + __m512i r1 = _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32(vec,0)); + __m512i r2 = _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32(vec,1)); + __m512i r3 = 
_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32(vec,2)); + __m512i r4 = _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32(vec,3)); + + r1 = _mm512_add_epi32(r1, vbase); + r2 = _mm512_add_epi32(r2, vbase); + r3 = _mm512_add_epi32(r3, vbase); + r4 = _mm512_add_epi32(r4, vbase); + _mm512_storeu_si512((__m512i *)out, r1); + _mm512_storeu_si512((__m512i *)(out + 16), r2); + _mm512_storeu_si512((__m512i *)(out + 32), r3); + _mm512_storeu_si512((__m512i *)(out + 48), r4); + + out += advance; + + } + + base += i * 64; + + for (; (i < length) && (out < safeout); ++i) { + uint64_t w = words[i]; + while ((w != 0) && (out < safeout)) { + uint64_t t = w & (~w + 1); // on x64, should compile to BLSI (careful: the Intel compiler seems to fail) + int r = roaring_trailing_zeroes(w); // on x64, should compile to TZCNT + uint32_t val = r + base; + memcpy(out, &val, + sizeof(uint32_t)); // should be compiled as a MOV on x64 + out++; + w ^= t; + } + base += 64; + } + + + return out - initout; + +} + +// Reference: https://lemire.me/blog/2022/05/10/faster-bitset-decoding-using-intel-avx-512/ +size_t bitset_extract_setbits_avx512_uint16(const uint64_t *array, size_t length, + uint16_t *vout, size_t capacity, uint16_t base) { + uint16_t *out = (uint16_t *)vout; + uint16_t *initout = out; + uint16_t *safeout = vout + capacity; + + __m512i base_v = _mm512_set1_epi16(base); + __m512i index_table = _mm512_loadu_si512(vbmi2_table); + size_t i = 0; + + for (; (i < length) && ((out + 64) < safeout); i++) + { + uint64_t v = array[i]; + __m512i vec = _mm512_maskz_compress_epi8(v, index_table); + + uint8_t advance = roaring_hamming(v); + + __m512i vbase = _mm512_add_epi16(base_v, _mm512_set1_epi16(i * 64)); + __m512i r1 = _mm512_cvtepi8_epi16(_mm512_extracti32x8_epi32(vec,0)); + __m512i r2 = _mm512_cvtepi8_epi16(_mm512_extracti32x8_epi32(vec,1)); + + r1 = _mm512_add_epi16(r1, vbase); + r2 = _mm512_add_epi16(r2, vbase); + + _mm512_storeu_si512((__m512i *)out, r1); + _mm512_storeu_si512((__m512i *)(out + 
32), r2); + out += advance; + + } + + base += i * 64; + + for (; (i < length) && (out < safeout); ++i) { + uint64_t w = array[i]; + while ((w != 0) && (out < safeout)) { + uint64_t t = w & (~w + 1); // on x64, should compile to BLSI (careful: the Intel compiler seems to fail) + int r = roaring_trailing_zeroes(w); // on x64, should compile to TZCNT + uint32_t val = r + base; + memcpy(out, &val, + sizeof(uint16_t)); + out++; + w ^= t; + } + base += 64; + } + + return out - initout; +} +CROARING_UNTARGET_AVX512 +#endif + +CROARING_TARGET_AVX2 +size_t bitset_extract_setbits_avx2(const uint64_t *words, size_t length, + uint32_t *out, size_t outcapacity, + uint32_t base) { + uint32_t *initout = out; + __m256i baseVec = _mm256_set1_epi32(base - 1); + __m256i incVec = _mm256_set1_epi32(64); + __m256i add8 = _mm256_set1_epi32(8); + uint32_t *safeout = out + outcapacity; + size_t i = 0; + for (; (i < length) && (out + 64 <= safeout); ++i) { + uint64_t w = words[i]; + if (w == 0) { + baseVec = _mm256_add_epi32(baseVec, incVec); + } else { + for (int k = 0; k < 4; ++k) { + uint8_t byteA = (uint8_t)w; + uint8_t byteB = (uint8_t)(w >> 8); + w >>= 16; + __m256i vecA = + _mm256_loadu_si256((const __m256i *)vecDecodeTable[byteA]); + __m256i vecB = + _mm256_loadu_si256((const __m256i *)vecDecodeTable[byteB]); + uint8_t advanceA = lengthTable[byteA]; + uint8_t advanceB = lengthTable[byteB]; + vecA = _mm256_add_epi32(baseVec, vecA); + baseVec = _mm256_add_epi32(baseVec, add8); + vecB = _mm256_add_epi32(baseVec, vecB); + baseVec = _mm256_add_epi32(baseVec, add8); + _mm256_storeu_si256((__m256i *)out, vecA); + out += advanceA; + _mm256_storeu_si256((__m256i *)out, vecB); + out += advanceB; + } + } + } + base += i * 64; + for (; (i < length) && (out < safeout); ++i) { + uint64_t w = words[i]; + while ((w != 0) && (out < safeout)) { + uint64_t t = w & (~w + 1); // on x64, should compile to BLSI (careful: the Intel compiler seems to fail) + int r = roaring_trailing_zeroes(w); // on x64, 
should compile to TZCNT + uint32_t val = r + base; + memcpy(out, &val, + sizeof(uint32_t)); // should be compiled as a MOV on x64 + out++; + w ^= t; + } + base += 64; + } + return out - initout; +} +CROARING_UNTARGET_AVX2 +#endif // CROARING_IS_X64 + +size_t bitset_extract_setbits(const uint64_t *words, size_t length, + uint32_t *out, uint32_t base) { + int outpos = 0; + for (size_t i = 0; i < length; ++i) { + uint64_t w = words[i]; + while (w != 0) { + uint64_t t = w & (~w + 1); // on x64, should compile to BLSI (careful: the Intel compiler seems to fail) + int r = roaring_trailing_zeroes(w); // on x64, should compile to TZCNT + uint32_t val = r + base; + memcpy(out + outpos, &val, + sizeof(uint32_t)); // should be compiled as a MOV on x64 + outpos++; + w ^= t; + } + base += 64; + } + return outpos; +} + +size_t bitset_extract_intersection_setbits_uint16(const uint64_t * __restrict__ words1, + const uint64_t * __restrict__ words2, + size_t length, uint16_t *out, + uint16_t base) { + int outpos = 0; + for (size_t i = 0; i < length; ++i) { + uint64_t w = words1[i] & words2[i]; + while (w != 0) { + uint64_t t = w & (~w + 1); + int r = roaring_trailing_zeroes(w); + out[outpos++] = r + base; + w ^= t; + } + base += 64; + } + return outpos; +} + +#if CROARING_IS_X64 +/* + * Given a bitset containing "length" 64-bit words, write out the position + * of all the set bits to "out" as 16-bit integers, values start at "base" (can + *be set to zero). + * + * The "out" pointer should be sufficient to store the actual number of bits + *set. + * + * Returns how many values were actually decoded. + * + * This function uses SSE decoding. 
+ */ +CROARING_TARGET_AVX2 +size_t bitset_extract_setbits_sse_uint16(const uint64_t *words, size_t length, + uint16_t *out, size_t outcapacity, + uint16_t base) { + uint16_t *initout = out; + __m128i baseVec = _mm_set1_epi16(base - 1); + __m128i incVec = _mm_set1_epi16(64); + __m128i add8 = _mm_set1_epi16(8); + uint16_t *safeout = out + outcapacity; + const int numberofbytes = 2; // process two bytes at a time + size_t i = 0; + for (; (i < length) && (out + numberofbytes * 8 <= safeout); ++i) { + uint64_t w = words[i]; + if (w == 0) { + baseVec = _mm_add_epi16(baseVec, incVec); + } else { + for (int k = 0; k < 4; ++k) { + uint8_t byteA = (uint8_t)w; + uint8_t byteB = (uint8_t)(w >> 8); + w >>= 16; + __m128i vecA = _mm_loadu_si128( + (const __m128i *)vecDecodeTable_uint16[byteA]); + __m128i vecB = _mm_loadu_si128( + (const __m128i *)vecDecodeTable_uint16[byteB]); + uint8_t advanceA = lengthTable[byteA]; + uint8_t advanceB = lengthTable[byteB]; + vecA = _mm_add_epi16(baseVec, vecA); + baseVec = _mm_add_epi16(baseVec, add8); + vecB = _mm_add_epi16(baseVec, vecB); + baseVec = _mm_add_epi16(baseVec, add8); + _mm_storeu_si128((__m128i *)out, vecA); + out += advanceA; + _mm_storeu_si128((__m128i *)out, vecB); + out += advanceB; + } + } + } + base += (uint16_t)(i * 64); + for (; (i < length) && (out < safeout); ++i) { + uint64_t w = words[i]; + while ((w != 0) && (out < safeout)) { + uint64_t t = w & (~w + 1); + int r = roaring_trailing_zeroes(w); + *out = r + base; + out++; + w ^= t; + } + base += 64; + } + return out - initout; +} +CROARING_UNTARGET_AVX2 +#endif + +/* + * Given a bitset containing "length" 64-bit words, write out the position + * of all the set bits to "out", values start at "base" (can be set to zero). + * + * The "out" pointer should be sufficient to store the actual number of bits + *set. + * + * Returns how many values were actually decoded. 
+ */ +size_t bitset_extract_setbits_uint16(const uint64_t *words, size_t length, + uint16_t *out, uint16_t base) { + int outpos = 0; + for (size_t i = 0; i < length; ++i) { + uint64_t w = words[i]; + while (w != 0) { + uint64_t t = w & (~w + 1); + int r = roaring_trailing_zeroes(w); + out[outpos++] = r + base; + w ^= t; + } + base += 64; + } + return outpos; +} + +#if defined(CROARING_ASMBITMANIPOPTIMIZATION) && defined(CROARING_IS_X64) + +static inline uint64_t _asm_bitset_set_list_withcard(uint64_t *words, uint64_t card, + const uint16_t *list, uint64_t length) { + uint64_t offset, load, pos; + uint64_t shift = 6; + const uint16_t *end = list + length; + if (!length) return card; + // TODO: could unroll for performance, see bitset_set_list + // bts is not available as an intrinsic in GCC + __asm volatile( + "1:\n" + "movzwq (%[list]), %[pos]\n" + "shrx %[shift], %[pos], %[offset]\n" + "mov (%[words],%[offset],8), %[load]\n" + "bts %[pos], %[load]\n" + "mov %[load], (%[words],%[offset],8)\n" + "sbb $-1, %[card]\n" + "add $2, %[list]\n" + "cmp %[list], %[end]\n" + "jnz 1b" + : [card] "+&r"(card), [list] "+&r"(list), [load] "=&r"(load), + [pos] "=&r"(pos), [offset] "=&r"(offset) + : [end] "r"(end), [words] "r"(words), [shift] "r"(shift)); + return card; +} + +static inline void _asm_bitset_set_list(uint64_t *words, const uint16_t *list, uint64_t length) { + uint64_t pos; + const uint16_t *end = list + length; + + uint64_t shift = 6; + uint64_t offset; + uint64_t load; + for (; list + 3 < end; list += 4) { + pos = list[0]; + __asm volatile( + "shrx %[shift], %[pos], %[offset]\n" + "mov (%[words],%[offset],8), %[load]\n" + "bts %[pos], %[load]\n" + "mov %[load], (%[words],%[offset],8)" + : [load] "=&r"(load), [offset] "=&r"(offset) + : [words] "r"(words), [shift] "r"(shift), [pos] "r"(pos)); + pos = list[1]; + __asm volatile( + "shrx %[shift], %[pos], %[offset]\n" + "mov (%[words],%[offset],8), %[load]\n" + "bts %[pos], %[load]\n" + "mov %[load], 
(%[words],%[offset],8)" + : [load] "=&r"(load), [offset] "=&r"(offset) + : [words] "r"(words), [shift] "r"(shift), [pos] "r"(pos)); + pos = list[2]; + __asm volatile( + "shrx %[shift], %[pos], %[offset]\n" + "mov (%[words],%[offset],8), %[load]\n" + "bts %[pos], %[load]\n" + "mov %[load], (%[words],%[offset],8)" + : [load] "=&r"(load), [offset] "=&r"(offset) + : [words] "r"(words), [shift] "r"(shift), [pos] "r"(pos)); + pos = list[3]; + __asm volatile( + "shrx %[shift], %[pos], %[offset]\n" + "mov (%[words],%[offset],8), %[load]\n" + "bts %[pos], %[load]\n" + "mov %[load], (%[words],%[offset],8)" + : [load] "=&r"(load), [offset] "=&r"(offset) + : [words] "r"(words), [shift] "r"(shift), [pos] "r"(pos)); + } + + while (list != end) { + pos = list[0]; + __asm volatile( + "shrx %[shift], %[pos], %[offset]\n" + "mov (%[words],%[offset],8), %[load]\n" + "bts %[pos], %[load]\n" + "mov %[load], (%[words],%[offset],8)" + : [load] "=&r"(load), [offset] "=&r"(offset) + : [words] "r"(words), [shift] "r"(shift), [pos] "r"(pos)); + list++; + } +} + +static inline uint64_t _asm_bitset_clear_list(uint64_t *words, uint64_t card, const uint16_t *list, + uint64_t length) { + uint64_t offset, load, pos; + uint64_t shift = 6; + const uint16_t *end = list + length; + if (!length) return card; + // btr is not available as an intrinsic in GCC + __asm volatile( + "1:\n" + "movzwq (%[list]), %[pos]\n" + "shrx %[shift], %[pos], %[offset]\n" + "mov (%[words],%[offset],8), %[load]\n" + "btr %[pos], %[load]\n" + "mov %[load], (%[words],%[offset],8)\n" + "sbb $0, %[card]\n" + "add $2, %[list]\n" + "cmp %[list], %[end]\n" + "jnz 1b" + : [card] "+&r"(card), [list] "+&r"(list), [load] "=&r"(load), + [pos] "=&r"(pos), [offset] "=&r"(offset) + : [end] "r"(end), [words] "r"(words), [shift] "r"(shift) + : + /* clobbers */ "memory"); + return card; +} + +static inline uint64_t _scalar_bitset_clear_list(uint64_t *words, uint64_t card, const uint16_t *list, + uint64_t length) { + uint64_t offset, load, 
newload, pos, index; + const uint16_t *end = list + length; + while (list != end) { + pos = *(const uint16_t *)list; + offset = pos >> 6; + index = pos % 64; + load = words[offset]; + newload = load & ~(UINT64_C(1) << index); + card -= (load ^ newload) >> index; + words[offset] = newload; + list++; + } + return card; +} + +static inline uint64_t _scalar_bitset_set_list_withcard(uint64_t *words, uint64_t card, + const uint16_t *list, uint64_t length) { + uint64_t offset, load, newload, pos, index; + const uint16_t *end = list + length; + while (list != end) { + pos = *list; + offset = pos >> 6; + index = pos % 64; + load = words[offset]; + newload = load | (UINT64_C(1) << index); + card += (load ^ newload) >> index; + words[offset] = newload; + list++; + } + return card; +} + +static inline void _scalar_bitset_set_list(uint64_t *words, const uint16_t *list, uint64_t length) { + uint64_t offset, load, newload, pos, index; + const uint16_t *end = list + length; + while (list != end) { + pos = *list; + offset = pos >> 6; + index = pos % 64; + load = words[offset]; + newload = load | (UINT64_C(1) << index); + words[offset] = newload; + list++; + } +} + +uint64_t bitset_clear_list(uint64_t *words, uint64_t card, const uint16_t *list, + uint64_t length) { + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { + return _asm_bitset_clear_list(words, card, list, length); + } else { + return _scalar_bitset_clear_list(words, card, list, length); + } +} + +uint64_t bitset_set_list_withcard(uint64_t *words, uint64_t card, + const uint16_t *list, uint64_t length) { + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { + return _asm_bitset_set_list_withcard(words, card, list, length); + } else { + return _scalar_bitset_set_list_withcard(words, card, list, length); + } +} + +void bitset_set_list(uint64_t *words, const uint16_t *list, uint64_t length) { + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { + _asm_bitset_set_list(words, list, length); + } else 
{ + _scalar_bitset_set_list(words, list, length); + } +} +#else +uint64_t bitset_clear_list(uint64_t *words, uint64_t card, const uint16_t *list, + uint64_t length) { + uint64_t offset, load, newload, pos, index; + const uint16_t *end = list + length; + while (list != end) { + pos = *(const uint16_t *)list; + offset = pos >> 6; + index = pos % 64; + load = words[offset]; + newload = load & ~(UINT64_C(1) << index); + card -= (load ^ newload) >> index; + words[offset] = newload; + list++; + } + return card; +} + +uint64_t bitset_set_list_withcard(uint64_t *words, uint64_t card, + const uint16_t *list, uint64_t length) { + uint64_t offset, load, newload, pos, index; + const uint16_t *end = list + length; + while (list != end) { + pos = *list; + offset = pos >> 6; + index = pos % 64; + load = words[offset]; + newload = load | (UINT64_C(1) << index); + card += (load ^ newload) >> index; + words[offset] = newload; + list++; + } + return card; +} + +void bitset_set_list(uint64_t *words, const uint16_t *list, uint64_t length) { + uint64_t offset, load, newload, pos, index; + const uint16_t *end = list + length; + while (list != end) { + pos = *list; + offset = pos >> 6; + index = pos % 64; + load = words[offset]; + newload = load | (UINT64_C(1) << index); + words[offset] = newload; + list++; + } +} + +#endif + +/* flip specified bits */ +/* TODO: consider whether worthwhile to make an asm version */ + +uint64_t bitset_flip_list_withcard(uint64_t *words, uint64_t card, + const uint16_t *list, uint64_t length) { + uint64_t offset, load, newload, pos, index; + const uint16_t *end = list + length; + while (list != end) { + pos = *list; + offset = pos >> 6; + index = pos % 64; + load = words[offset]; + newload = load ^ (UINT64_C(1) << index); + // todo: is a branch here all that bad? 
+ card += + (1 - 2 * (((UINT64_C(1) << index) & load) >> index)); // +1 or -1 + words[offset] = newload; + list++; + } + return card; +} + +void bitset_flip_list(uint64_t *words, const uint16_t *list, uint64_t length) { + uint64_t offset, load, newload, pos, index; + const uint16_t *end = list + length; + while (list != end) { + pos = *list; + offset = pos >> 6; + index = pos % 64; + load = words[offset]; + newload = load ^ (UINT64_C(1) << index); + words[offset] = newload; + list++; + } +} + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace api { +#endif +/* end file src/bitset_util.c */ +/* begin file src/containers/array.c */ +/* + * array.c + * + */ + +#include +#include +#include + +#if CROARING_IS_X64 +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." +#endif // CROARING_COMPILER_SUPPORTS_AVX512 +#endif + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +extern inline uint16_t array_container_minimum(const array_container_t *arr); +extern inline uint16_t array_container_maximum(const array_container_t *arr); +extern inline int array_container_index_equalorlarger(const array_container_t *arr, uint16_t x); + +extern inline int array_container_rank(const array_container_t *arr, + uint16_t x); +extern inline int array_container_get_index(const array_container_t *arr, + uint16_t x); +extern inline bool array_container_contains(const array_container_t *arr, + uint16_t pos); +extern inline int array_container_cardinality(const array_container_t *array); +extern inline bool array_container_nonzero_cardinality(const array_container_t *array); +extern inline int32_t array_container_serialized_size_in_bytes(int32_t card); +extern inline bool array_container_empty(const array_container_t *array); +extern inline bool array_container_full(const array_container_t *array); + +/* Create a new array with capacity size. Return NULL in case of failure. 
*/ +array_container_t *array_container_create_given_capacity(int32_t size) { + array_container_t *container; + + if ((container = (array_container_t *)roaring_malloc(sizeof(array_container_t))) == + NULL) { + return NULL; + } + + if( size <= 0 ) { // we don't want to rely on malloc(0) + container->array = NULL; + } else if ((container->array = (uint16_t *)roaring_malloc(sizeof(uint16_t) * size)) == + NULL) { + roaring_free(container); + return NULL; + } + + container->capacity = size; + container->cardinality = 0; + + return container; +} + +/* Create a new array. Return NULL in case of failure. */ +array_container_t *array_container_create(void) { + return array_container_create_given_capacity(ARRAY_DEFAULT_INIT_SIZE); +} + +/* Create a new array containing all values in [min,max). */ +array_container_t * array_container_create_range(uint32_t min, uint32_t max) { + array_container_t * answer = array_container_create_given_capacity(max - min + 1); + if(answer == NULL) return answer; + answer->cardinality = 0; + for(uint32_t k = min; k < max; k++) { + answer->array[answer->cardinality++] = k; + } + return answer; +} + +/* Duplicate container */ +array_container_t *array_container_clone(const array_container_t *src) { + array_container_t *newcontainer = + array_container_create_given_capacity(src->capacity); + if (newcontainer == NULL) return NULL; + + newcontainer->cardinality = src->cardinality; + + memcpy(newcontainer->array, src->array, + src->cardinality * sizeof(uint16_t)); + + return newcontainer; +} + +void array_container_offset(const array_container_t *c, + container_t **loc, container_t **hic, + uint16_t offset) { + array_container_t *lo = NULL, *hi = NULL; + int top, lo_cap, hi_cap; + + top = (1 << 16) - offset; + + lo_cap = count_less(c->array, c->cardinality, top); + if (loc && lo_cap) { + lo = array_container_create_given_capacity(lo_cap); + for (int i = 0; i < lo_cap; ++i) { + array_container_add(lo, c->array[i] + offset); + } + *loc = 
(container_t*)lo; + } + + hi_cap = c->cardinality - lo_cap; + if (hic && hi_cap) { + hi = array_container_create_given_capacity(hi_cap); + for (int i = lo_cap; i < c->cardinality; ++i) { + array_container_add(hi, c->array[i] + offset); + } + *hic = (container_t*)hi; + } +} + +int array_container_shrink_to_fit(array_container_t *src) { + if (src->cardinality == src->capacity) return 0; // nothing to do + int savings = src->capacity - src->cardinality; + src->capacity = src->cardinality; + if( src->capacity == 0) { // we do not want to rely on realloc for zero allocs + roaring_free(src->array); + src->array = NULL; + } else { + uint16_t *oldarray = src->array; + src->array = + (uint16_t *)roaring_realloc(oldarray, src->capacity * sizeof(uint16_t)); + if (src->array == NULL) roaring_free(oldarray); // should never happen? + } + return savings; +} + +/* Free memory. */ +void array_container_free(array_container_t *arr) { + if(arr->array != NULL) {// Jon Strabala reports that some tools complain otherwise + roaring_free(arr->array); + arr->array = NULL; // pedantic + } + roaring_free(arr); +} + +static inline int32_t grow_capacity(int32_t capacity) { + return (capacity <= 0) ? ARRAY_DEFAULT_INIT_SIZE + : capacity < 64 ? capacity * 2 + : capacity < 1024 ? capacity * 3 / 2 + : capacity * 5 / 4; +} + +static inline int32_t clamp(int32_t val, int32_t min, int32_t max) { + return ((val < min) ? min : (val > max) ? max : val); +} + +void array_container_grow(array_container_t *container, int32_t min, + bool preserve) { + + int32_t max = (min <= DEFAULT_MAX_SIZE ? 
DEFAULT_MAX_SIZE : 65536); + int32_t new_capacity = clamp(grow_capacity(container->capacity), min, max); + + container->capacity = new_capacity; + uint16_t *array = container->array; + + if (preserve) { + container->array = + (uint16_t *)roaring_realloc(array, new_capacity * sizeof(uint16_t)); + if (container->array == NULL) roaring_free(array); + } else { + // Jon Strabala reports that some tools complain otherwise + if (array != NULL) { + roaring_free(array); + } + container->array = (uint16_t *)roaring_malloc(new_capacity * sizeof(uint16_t)); + } + + // handle the case where realloc fails + if (container->array == NULL) { + fprintf(stderr, "could not allocate memory\n"); + } + assert(container->array != NULL); +} + +/* Copy one container into another. We assume that they are distinct. */ +void array_container_copy(const array_container_t *src, + array_container_t *dst) { + const int32_t cardinality = src->cardinality; + if (cardinality > dst->capacity) { + array_container_grow(dst, cardinality, false); + } + + dst->cardinality = cardinality; + memcpy(dst->array, src->array, cardinality * sizeof(uint16_t)); +} + +void array_container_add_from_range(array_container_t *arr, uint32_t min, + uint32_t max, uint16_t step) { + for (uint32_t value = min; value < max; value += step) { + array_container_append(arr, value); + } +} + +/* Computes the union of array1 and array2 and write the result to arrayout. + * It is assumed that arrayout is distinct from both array1 and array2. 
+ */ +void array_container_union(const array_container_t *array_1, + const array_container_t *array_2, + array_container_t *out) { + const int32_t card_1 = array_1->cardinality, card_2 = array_2->cardinality; + const int32_t max_cardinality = card_1 + card_2; + + if (out->capacity < max_cardinality) { + array_container_grow(out, max_cardinality, false); + } + out->cardinality = (int32_t)fast_union_uint16(array_1->array, card_1, + array_2->array, card_2, out->array); + +} + +/* Computes the difference of array1 and array2 and write the result + * to array out. + * Array out does not need to be distinct from array_1 + */ +void array_container_andnot(const array_container_t *array_1, + const array_container_t *array_2, + array_container_t *out) { + if (out->capacity < array_1->cardinality) + array_container_grow(out, array_1->cardinality, false); +#if CROARING_IS_X64 + if(( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) && (out != array_1) && (out != array_2)) { + out->cardinality = + difference_vector16(array_1->array, array_1->cardinality, + array_2->array, array_2->cardinality, out->array); + } else { + out->cardinality = + difference_uint16(array_1->array, array_1->cardinality, array_2->array, + array_2->cardinality, out->array); + } +#else + out->cardinality = + difference_uint16(array_1->array, array_1->cardinality, array_2->array, + array_2->cardinality, out->array); +#endif +} + +/* Computes the symmetric difference of array1 and array2 and write the + * result + * to arrayout. + * It is assumed that arrayout is distinct from both array1 and array2. 
+ */ +void array_container_xor(const array_container_t *array_1, + const array_container_t *array_2, + array_container_t *out) { + const int32_t card_1 = array_1->cardinality, card_2 = array_2->cardinality; + const int32_t max_cardinality = card_1 + card_2; + if (out->capacity < max_cardinality) { + array_container_grow(out, max_cardinality, false); + } + +#if CROARING_IS_X64 + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { + out->cardinality = + xor_vector16(array_1->array, array_1->cardinality, array_2->array, + array_2->cardinality, out->array); + } else { + out->cardinality = + xor_uint16(array_1->array, array_1->cardinality, array_2->array, + array_2->cardinality, out->array); + } +#else + out->cardinality = + xor_uint16(array_1->array, array_1->cardinality, array_2->array, + array_2->cardinality, out->array); +#endif +} + +static inline int32_t minimum_int32(int32_t a, int32_t b) { + return (a < b) ? a : b; +} + +/* computes the intersection of array1 and array2 and write the result to + * arrayout. + * It is assumed that arrayout is distinct from both array1 and array2. 
+ * */ +void array_container_intersection(const array_container_t *array1, + const array_container_t *array2, + array_container_t *out) { + int32_t card_1 = array1->cardinality, card_2 = array2->cardinality, + min_card = minimum_int32(card_1, card_2); + const int threshold = 64; // subject to tuning +#if CROARING_IS_X64 + if (out->capacity < min_card) { + array_container_grow(out, min_card + sizeof(__m128i) / sizeof(uint16_t), + false); + } +#else + if (out->capacity < min_card) { + array_container_grow(out, min_card, false); + } +#endif + + if (card_1 * threshold < card_2) { + out->cardinality = intersect_skewed_uint16( + array1->array, card_1, array2->array, card_2, out->array); + } else if (card_2 * threshold < card_1) { + out->cardinality = intersect_skewed_uint16( + array2->array, card_2, array1->array, card_1, out->array); + } else { +#if CROARING_IS_X64 + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { + out->cardinality = intersect_vector16( + array1->array, card_1, array2->array, card_2, out->array); + } else { + out->cardinality = intersect_uint16(array1->array, card_1, + array2->array, card_2, out->array); + } +#else + out->cardinality = intersect_uint16(array1->array, card_1, + array2->array, card_2, out->array); +#endif + } +} + +/* computes the size of the intersection of array1 and array2 + * */ +int array_container_intersection_cardinality(const array_container_t *array1, + const array_container_t *array2) { + int32_t card_1 = array1->cardinality, card_2 = array2->cardinality; + const int threshold = 64; // subject to tuning + if (card_1 * threshold < card_2) { + return intersect_skewed_uint16_cardinality(array1->array, card_1, + array2->array, card_2); + } else if (card_2 * threshold < card_1) { + return intersect_skewed_uint16_cardinality(array2->array, card_2, + array1->array, card_1); + } else { +#if CROARING_IS_X64 + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { + return intersect_vector16_cardinality(array1->array, 
card_1, + array2->array, card_2); + } else { + return intersect_uint16_cardinality(array1->array, card_1, + array2->array, card_2); + } +#else + return intersect_uint16_cardinality(array1->array, card_1, + array2->array, card_2); +#endif + } +} + +bool array_container_intersect(const array_container_t *array1, + const array_container_t *array2) { + int32_t card_1 = array1->cardinality, card_2 = array2->cardinality; + const int threshold = 64; // subject to tuning + if (card_1 * threshold < card_2) { + return intersect_skewed_uint16_nonempty( + array1->array, card_1, array2->array, card_2); + } else if (card_2 * threshold < card_1) { + return intersect_skewed_uint16_nonempty( + array2->array, card_2, array1->array, card_1); + } else { + // we do not bother vectorizing + return intersect_uint16_nonempty(array1->array, card_1, + array2->array, card_2); + } +} + +/* computes the intersection of array1 and array2 and write the result to + * array1. + * */ +void array_container_intersection_inplace(array_container_t *src_1, + const array_container_t *src_2) { + int32_t card_1 = src_1->cardinality, card_2 = src_2->cardinality; + const int threshold = 64; // subject to tuning + if (card_1 * threshold < card_2) { + src_1->cardinality = intersect_skewed_uint16( + src_1->array, card_1, src_2->array, card_2, src_1->array); + } else if (card_2 * threshold < card_1) { + src_1->cardinality = intersect_skewed_uint16( + src_2->array, card_2, src_1->array, card_1, src_1->array); + } else { +#if CROARING_IS_X64 + if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) { + src_1->cardinality = intersect_vector16_inplace( + src_1->array, card_1, src_2->array, card_2); + } else { + src_1->cardinality = intersect_uint16( + src_1->array, card_1, src_2->array, card_2, src_1->array); + } +#else + src_1->cardinality = intersect_uint16( + src_1->array, card_1, src_2->array, card_2, src_1->array); +#endif + } +} + +ALLOW_UNALIGNED +int array_container_to_uint32_array(void *vout, const 
array_container_t *cont, + uint32_t base) { + +#if CROARING_IS_X64 + int support = croaring_hardware_support(); +#if CROARING_COMPILER_SUPPORTS_AVX512 + if (support & ROARING_SUPPORTS_AVX512) { + return avx512_array_container_to_uint32_array(vout, cont->array, cont->cardinality, base); + } +#endif + if (support & ROARING_SUPPORTS_AVX2) { + return array_container_to_uint32_array_vector16(vout, cont->array, cont->cardinality, base); + } +#endif // CROARING_IS_X64 + int outpos = 0; + uint32_t *out = (uint32_t *)vout; + size_t i = 0; + for ( ; i < (size_t)cont->cardinality; ++i) { + const uint32_t val = base + cont->array[i]; + memcpy(out + outpos, &val, + sizeof(uint32_t)); // should be compiled as a MOV on x64 + outpos++; + } + return outpos; +} + +void array_container_printf(const array_container_t *v) { + if (v->cardinality == 0) { + printf("{}"); + return; + } + printf("{"); + printf("%d", v->array[0]); + for (int i = 1; i < v->cardinality; ++i) { + printf(",%d", v->array[i]); + } + printf("}"); +} + +void array_container_printf_as_uint32_array(const array_container_t *v, + uint32_t base) { + if (v->cardinality == 0) { + return; + } + printf("%u", v->array[0] + base); + for (int i = 1; i < v->cardinality; ++i) { + printf(",%u", v->array[i] + base); + } +} + +/* Compute the number of runs */ +int32_t array_container_number_of_runs(const array_container_t *ac) { + // Can SIMD work here? + int32_t nr_runs = 0; + int32_t prev = -2; + for (const uint16_t *p = ac->array; p != ac->array + ac->cardinality; ++p) { + if (*p != prev + 1) nr_runs++; + prev = *p; + } + return nr_runs; +} + +/** + * Writes the underlying array to buf, outputs how many bytes were written. + * The number of bytes written should be + * array_container_size_in_bytes(container). 
+ * + */ +int32_t array_container_write(const array_container_t *container, char *buf) { + memcpy(buf, container->array, container->cardinality * sizeof(uint16_t)); + return array_container_size_in_bytes(container); +} + +bool array_container_is_subset(const array_container_t *container1, + const array_container_t *container2) { + if (container1->cardinality > container2->cardinality) { + return false; + } + int i1 = 0, i2 = 0; + while (i1 < container1->cardinality && i2 < container2->cardinality) { + if (container1->array[i1] == container2->array[i2]) { + i1++; + i2++; + } else if (container1->array[i1] > container2->array[i2]) { + i2++; + } else { // container1->array[i1] < container2->array[i2] + return false; + } + } + if (i1 == container1->cardinality) { + return true; + } else { + return false; + } +} + +int32_t array_container_read(int32_t cardinality, array_container_t *container, + const char *buf) { + if (container->capacity < cardinality) { + array_container_grow(container, cardinality, false); + } + container->cardinality = cardinality; + memcpy(container->array, buf, container->cardinality * sizeof(uint16_t)); + + return array_container_size_in_bytes(container); +} + +bool array_container_iterate(const array_container_t *cont, uint32_t base, + roaring_iterator iterator, void *ptr) { + for (int i = 0; i < cont->cardinality; i++) + if (!iterator(cont->array[i] + base, ptr)) return false; + return true; +} + +bool array_container_iterate64(const array_container_t *cont, uint32_t base, + roaring_iterator64 iterator, uint64_t high_bits, + void *ptr) { + for (int i = 0; i < cont->cardinality; i++) + if (!iterator(high_bits | (uint64_t)(cont->array[i] + base), ptr)) + return false; + return true; +} + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif +/* end file src/containers/array.c */ +/* begin file src/containers/bitset.c */ +/* + * bitset.c + * + */ +#ifndef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200809L 
+#endif +#include +#include +#include +#include + + +#if CROARING_IS_X64 +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." +#endif // CROARING_COMPILER_SUPPORTS_AVX512 +#endif + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +extern inline int bitset_container_cardinality(const bitset_container_t *bitset); +extern inline void bitset_container_set(bitset_container_t *bitset, uint16_t pos); +// unused at this time: +//extern inline void bitset_container_unset(bitset_container_t *bitset, uint16_t pos); +extern inline bool bitset_container_get(const bitset_container_t *bitset, + uint16_t pos); +extern inline int32_t bitset_container_serialized_size_in_bytes(void); +extern inline bool bitset_container_add(bitset_container_t *bitset, uint16_t pos); +extern inline bool bitset_container_remove(bitset_container_t *bitset, uint16_t pos); +extern inline bool bitset_container_contains(const bitset_container_t *bitset, + uint16_t pos); + +void bitset_container_clear(bitset_container_t *bitset) { + memset(bitset->words, 0, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS); + bitset->cardinality = 0; +} + +void bitset_container_set_all(bitset_container_t *bitset) { + memset(bitset->words, INT64_C(-1), + sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS); + bitset->cardinality = (1 << 16); +} + + + +/* Create a new bitset. Return NULL in case of failure. 
*/ +bitset_container_t *bitset_container_create(void) { + bitset_container_t *bitset = + (bitset_container_t *)roaring_malloc(sizeof(bitset_container_t)); + + if (!bitset) { + return NULL; + } + + size_t align_size = 32; +#if CROARING_IS_X64 + int support = croaring_hardware_support(); + if ( support & ROARING_SUPPORTS_AVX512 ) { + // sizeof(__m512i) == 64 + align_size = 64; + } + else { + // sizeof(__m256i) == 32 + align_size = 32; + } +#endif + bitset->words = (uint64_t *)roaring_aligned_malloc( + align_size, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS); + if (!bitset->words) { + roaring_free(bitset); + return NULL; + } + bitset_container_clear(bitset); + return bitset; +} + +/* Copy one container into another. We assume that they are distinct. */ +void bitset_container_copy(const bitset_container_t *source, + bitset_container_t *dest) { + dest->cardinality = source->cardinality; + memcpy(dest->words, source->words, + sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS); +} + +void bitset_container_add_from_range(bitset_container_t *bitset, uint32_t min, + uint32_t max, uint16_t step) { + if (step == 0) return; // refuse to crash + if ((64 % step) == 0) { // step divides 64 + uint64_t mask = 0; // construct the repeated mask + for (uint32_t value = (min % step); value < 64; value += step) { + mask |= ((uint64_t)1 << value); + } + uint32_t firstword = min / 64; + uint32_t endword = (max - 1) / 64; + bitset->cardinality = (max - min + step - 1) / step; + if (firstword == endword) { + bitset->words[firstword] |= + mask & (((~UINT64_C(0)) << (min % 64)) & + ((~UINT64_C(0)) >> ((~max + 1) % 64))); + return; + } + bitset->words[firstword] = mask & ((~UINT64_C(0)) << (min % 64)); + for (uint32_t i = firstword + 1; i < endword; i++) + bitset->words[i] = mask; + bitset->words[endword] = mask & ((~UINT64_C(0)) >> ((~max + 1) % 64)); + } else { + for (uint32_t value = min; value < max; value += step) { + bitset_container_add(bitset, value); + } + } +} + +/* Free memory. 
*/ +void bitset_container_free(bitset_container_t *bitset) { + if(bitset->words != NULL) {// Jon Strabala reports that some tools complain otherwise + roaring_aligned_free(bitset->words); + bitset->words = NULL; // pedantic + } + roaring_free(bitset); +} + +/* duplicate container. */ +bitset_container_t *bitset_container_clone(const bitset_container_t *src) { + bitset_container_t *bitset = + (bitset_container_t *)roaring_malloc(sizeof(bitset_container_t)); + + if (!bitset) { + return NULL; + } + + size_t align_size = 32; +#if CROARING_IS_X64 + if ( croaring_hardware_support() & ROARING_SUPPORTS_AVX512 ) { + // sizeof(__m512i) == 64 + align_size = 64; + } + else { + // sizeof(__m256i) == 32 + align_size = 32; + } +#endif + bitset->words = (uint64_t *)roaring_aligned_malloc( + align_size, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS); + if (!bitset->words) { + roaring_free(bitset); + return NULL; + } + bitset->cardinality = src->cardinality; + memcpy(bitset->words, src->words, + sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS); + return bitset; +} + +void bitset_container_offset(const bitset_container_t *c, + container_t **loc, container_t **hic, + uint16_t offset) { + bitset_container_t *bc = NULL; + uint64_t val; + uint16_t b, i, end; + + b = offset >> 6; + i = offset % 64; + end = 1024 - b; + + if (loc != NULL) { + bc = bitset_container_create(); + if (i == 0) { + memcpy(bc->words+b, c->words, 8*end); + } else { + bc->words[b] = c->words[0] << i; + for (uint32_t k = 1; k < end; ++k) { + val = c->words[k] << i; + val |= c->words[k-1] >> (64 - i); + bc->words[b+k] = val; + } + } + + bc->cardinality = bitset_container_compute_cardinality(bc); + if (bc->cardinality != 0) { + *loc = bc; + } + if (bc->cardinality == c->cardinality) { + return; + } + } + + if (hic == NULL) { + // Both hic and loc can't be NULL, so bc is never NULL here + if (bc->cardinality == 0) { + bitset_container_free(bc); + } + return; + } + + if (bc == NULL || bc->cardinality != 0) { + bc = 
bitset_container_create(); + } + + if (i == 0) { + memcpy(bc->words, c->words+end, 8*b); + } else { + for (uint32_t k = end; k < 1024; ++k) { + val = c->words[k] << i; + val |= c->words[k-1] >> (64 - i); + bc->words[k-end] = val; + } + bc->words[b] = c->words[1023] >> (64 - i); + } + + bc->cardinality = bitset_container_compute_cardinality(bc); + if (bc->cardinality == 0) { + bitset_container_free(bc); + return; + } + *hic = bc; +} + +void bitset_container_set_range(bitset_container_t *bitset, uint32_t begin, + uint32_t end) { + bitset_set_range(bitset->words, begin, end); + bitset->cardinality = + bitset_container_compute_cardinality(bitset); // could be smarter +} + + +bool bitset_container_intersect(const bitset_container_t *src_1, + const bitset_container_t *src_2) { + // could vectorize, but this is probably already quite fast in practice + const uint64_t * __restrict__ words_1 = src_1->words; + const uint64_t * __restrict__ words_2 = src_2->words; + for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i ++) { + if((words_1[i] & words_2[i]) != 0) return true; + } + return false; +} + + +#if CROARING_IS_X64 +#ifndef WORDS_IN_AVX2_REG +#define WORDS_IN_AVX2_REG sizeof(__m256i) / sizeof(uint64_t) +#endif +#ifndef WORDS_IN_AVX512_REG +#define WORDS_IN_AVX512_REG sizeof(__m512i) / sizeof(uint64_t) +#endif +/* Get the number of bits set (force computation) */ +static inline int _scalar_bitset_container_compute_cardinality(const bitset_container_t *bitset) { + const uint64_t *words = bitset->words; + int32_t sum = 0; + for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 4) { + sum += roaring_hamming(words[i]); + sum += roaring_hamming(words[i + 1]); + sum += roaring_hamming(words[i + 2]); + sum += roaring_hamming(words[i + 3]); + } + return sum; +} +/* Get the number of bits set (force computation) */ +int bitset_container_compute_cardinality(const bitset_container_t *bitset) { + int support = croaring_hardware_support(); +#if CROARING_COMPILER_SUPPORTS_AVX512 + 
if( support & ROARING_SUPPORTS_AVX512 ) { + return (int) avx512_vpopcount( + (const __m512i *)bitset->words, + BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX512_REG)); + } else +#endif // CROARING_COMPILER_SUPPORTS_AVX512 + if( support & ROARING_SUPPORTS_AVX2 ) { + return (int) avx2_harley_seal_popcount256( + (const __m256i *)bitset->words, + BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG)); + } else { + return _scalar_bitset_container_compute_cardinality(bitset); + + } +} + +#elif defined(CROARING_USENEON) +int bitset_container_compute_cardinality(const bitset_container_t *bitset) { + uint16x8_t n0 = vdupq_n_u16(0); + uint16x8_t n1 = vdupq_n_u16(0); + uint16x8_t n2 = vdupq_n_u16(0); + uint16x8_t n3 = vdupq_n_u16(0); + for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) { + uint64x2_t c0 = vld1q_u64(&bitset->words[i + 0]); + n0 = vaddq_u16(n0, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c0)))); + uint64x2_t c1 = vld1q_u64(&bitset->words[i + 2]); + n1 = vaddq_u16(n1, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c1)))); + uint64x2_t c2 = vld1q_u64(&bitset->words[i + 4]); + n2 = vaddq_u16(n2, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c2)))); + uint64x2_t c3 = vld1q_u64(&bitset->words[i + 6]); + n3 = vaddq_u16(n3, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c3)))); + } + uint64x2_t n = vdupq_n_u64(0); + n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n0))); + n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n1))); + n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n2))); + n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n3))); + return vgetq_lane_u64(n, 0) + vgetq_lane_u64(n, 1); +} + +#else // CROARING_IS_X64 + +/* Get the number of bits set (force computation) */ +int bitset_container_compute_cardinality(const bitset_container_t *bitset) { + const uint64_t *words = bitset->words; + int32_t sum = 0; + for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 4) { + sum += roaring_hamming(words[i]); + sum += roaring_hamming(words[i + 1]); + sum += roaring_hamming(words[i + 2]); + sum += 
roaring_hamming(words[i + 3]); + } + return sum; +} + +#endif // CROARING_IS_X64 + +#if CROARING_IS_X64 + +#define BITSET_CONTAINER_FN_REPEAT 8 +#ifndef WORDS_IN_AVX512_REG +#define WORDS_IN_AVX512_REG sizeof(__m512i) / sizeof(uint64_t) +#endif // WORDS_IN_AVX512_REG + +/* Computes a binary operation (eg union) on bitset1 and bitset2 and write the + result to bitsetout */ +// clang-format off +#define AVX512_BITSET_CONTAINER_FN1(before, opname, opsymbol, avx_intrinsic, \ + neon_intrinsic, after) \ + static inline int _avx512_bitset_container_##opname##_nocard( \ + const bitset_container_t *src_1, const bitset_container_t *src_2, \ + bitset_container_t *dst) { \ + const uint8_t * __restrict__ words_1 = (const uint8_t *)src_1->words; \ + const uint8_t * __restrict__ words_2 = (const uint8_t *)src_2->words; \ + /* not using the blocking optimization for some reason*/ \ + uint8_t *out = (uint8_t*)dst->words; \ + const int innerloop = 8; \ + for (size_t i = 0; \ + i < BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX512_REG); \ + i+=innerloop) { \ + __m512i A1, A2, AO; \ + A1 = _mm512_loadu_si512((const __m512i *)(words_1)); \ + A2 = _mm512_loadu_si512((const __m512i *)(words_2)); \ + AO = avx_intrinsic(A2, A1); \ + _mm512_storeu_si512((__m512i *)out, AO); \ + A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 64)); \ + A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 64)); \ + AO = avx_intrinsic(A2, A1); \ + _mm512_storeu_si512((__m512i *)(out+64), AO); \ + A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 128)); \ + A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 128)); \ + AO = avx_intrinsic(A2, A1); \ + _mm512_storeu_si512((__m512i *)(out+128), AO); \ + A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 192)); \ + A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 192)); \ + AO = avx_intrinsic(A2, A1); \ + _mm512_storeu_si512((__m512i *)(out+192), AO); \ + A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 256)); \ + A2 = _mm512_loadu_si512((const __m512i 
*)(words_2 + 256)); \ + AO = avx_intrinsic(A2, A1); \ + _mm512_storeu_si512((__m512i *)(out+256), AO); \ + A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 320)); \ + A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 320)); \ + AO = avx_intrinsic(A2, A1); \ + _mm512_storeu_si512((__m512i *)(out+320), AO); \ + A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 384)); \ + A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 384)); \ + AO = avx_intrinsic(A2, A1); \ + _mm512_storeu_si512((__m512i *)(out+384), AO); \ + A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 448)); \ + A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 448)); \ + AO = avx_intrinsic(A2, A1); \ + _mm512_storeu_si512((__m512i *)(out+448), AO); \ + out+=512; \ + words_1 += 512; \ + words_2 += 512; \ + } \ + dst->cardinality = BITSET_UNKNOWN_CARDINALITY; \ + return dst->cardinality; \ + } + +#define AVX512_BITSET_CONTAINER_FN2(before, opname, opsymbol, avx_intrinsic, \ + neon_intrinsic, after) \ + /* next, a version that updates cardinality*/ \ + static inline int _avx512_bitset_container_##opname(const bitset_container_t *src_1, \ + const bitset_container_t *src_2, \ + bitset_container_t *dst) { \ + const __m512i * __restrict__ words_1 = (const __m512i *) src_1->words; \ + const __m512i * __restrict__ words_2 = (const __m512i *) src_2->words; \ + __m512i *out = (__m512i *) dst->words; \ + dst->cardinality = (int32_t)avx512_harley_seal_popcount512andstore_##opname(words_2,\ + words_1, out,BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX512_REG)); \ + return dst->cardinality; \ + } + +#define AVX512_BITSET_CONTAINER_FN3(before, opname, opsymbol, avx_intrinsic, \ + neon_intrinsic, after) \ + /* next, a version that just computes the cardinality*/ \ + static inline int _avx512_bitset_container_##opname##_justcard( \ + const bitset_container_t *src_1, const bitset_container_t *src_2) { \ + const __m512i * __restrict__ data1 = (const __m512i *) src_1->words; \ + const __m512i * __restrict__ data2 
= (const __m512i *) src_2->words; \ + return (int)avx512_harley_seal_popcount512_##opname(data2, \ + data1, BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX512_REG)); \ + } + + +// we duplicate the function because other containers use the "or" term, makes API more consistent +#if CROARING_COMPILER_SUPPORTS_AVX512 +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, or, |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, union, |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 + +// we duplicate the function because other containers use the "intersection" term, makes API more consistent +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, and, &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, intersection, &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 + +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, xor, ^, _mm512_xor_si512, veorq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, andnot, &~, _mm512_andnot_si512, vbicq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 + +// we duplicate the function because other containers use the "or" term, makes API more consistent +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, or, |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, union, |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 + +// we duplicate the function because other containers use the "intersection" term, makes API more consistent 
+CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, and, &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, intersection, &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 + +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, xor, ^, _mm512_xor_si512, veorq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, andnot, &~, _mm512_andnot_si512, vbicq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 + +// we duplicate the function because other containers use the "or" term, makes API more consistent +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, or, |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, union, |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 + +// we duplicate the function because other containers use the "intersection" term, makes API more consistent +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, and, &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, intersection, &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 + +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, xor, ^, _mm512_xor_si512, veorq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 +CROARING_TARGET_AVX512 +AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, andnot, &~, _mm512_andnot_si512, vbicq_u64, CROARING_UNTARGET_AVX512) +CROARING_UNTARGET_AVX512 +#endif // CROARING_COMPILER_SUPPORTS_AVX512 + +#ifndef WORDS_IN_AVX2_REG +#define 
WORDS_IN_AVX2_REG sizeof(__m256i) / sizeof(uint64_t) +#endif // WORDS_IN_AVX2_REG +#define LOOP_SIZE \ + BITSET_CONTAINER_SIZE_IN_WORDS / \ + ((WORDS_IN_AVX2_REG)*BITSET_CONTAINER_FN_REPEAT) + +/* Computes a binary operation (eg union) on bitset1 and bitset2 and write the + result to bitsetout */ +// clang-format off +#define AVX_BITSET_CONTAINER_FN1(before, opname, opsymbol, avx_intrinsic, \ + neon_intrinsic, after) \ + static inline int _avx2_bitset_container_##opname##_nocard( \ + const bitset_container_t *src_1, const bitset_container_t *src_2, \ + bitset_container_t *dst) { \ + const uint8_t *__restrict__ words_1 = (const uint8_t *)src_1->words; \ + const uint8_t *__restrict__ words_2 = (const uint8_t *)src_2->words; \ + /* not using the blocking optimization for some reason*/ \ + uint8_t *out = (uint8_t *)dst->words; \ + const int innerloop = 8; \ + for (size_t i = 0; \ + i < BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG); \ + i += innerloop) { \ + __m256i A1, A2, AO; \ + A1 = _mm256_lddqu_si256((const __m256i *)(words_1)); \ + A2 = _mm256_lddqu_si256((const __m256i *)(words_2)); \ + AO = avx_intrinsic(A2, A1); \ + _mm256_storeu_si256((__m256i *)out, AO); \ + A1 = _mm256_lddqu_si256((const __m256i *)(words_1 + 32)); \ + A2 = _mm256_lddqu_si256((const __m256i *)(words_2 + 32)); \ + AO = avx_intrinsic(A2, A1); \ + _mm256_storeu_si256((__m256i *)(out + 32), AO); \ + A1 = _mm256_lddqu_si256((const __m256i *)(words_1 + 64)); \ + A2 = _mm256_lddqu_si256((const __m256i *)(words_2 + 64)); \ + AO = avx_intrinsic(A2, A1); \ + _mm256_storeu_si256((__m256i *)(out + 64), AO); \ + A1 = _mm256_lddqu_si256((const __m256i *)(words_1 + 96)); \ + A2 = _mm256_lddqu_si256((const __m256i *)(words_2 + 96)); \ + AO = avx_intrinsic(A2, A1); \ + _mm256_storeu_si256((__m256i *)(out + 96), AO); \ + A1 = _mm256_lddqu_si256((const __m256i *)(words_1 + 128)); \ + A2 = _mm256_lddqu_si256((const __m256i *)(words_2 + 128)); \ + AO = avx_intrinsic(A2, A1); \ + 
_mm256_storeu_si256((__m256i *)(out + 128), AO); \ + A1 = _mm256_lddqu_si256((const __m256i *)(words_1 + 160)); \ + A2 = _mm256_lddqu_si256((const __m256i *)(words_2 + 160)); \ + AO = avx_intrinsic(A2, A1); \ + _mm256_storeu_si256((__m256i *)(out + 160), AO); \ + A1 = _mm256_lddqu_si256((const __m256i *)(words_1 + 192)); \ + A2 = _mm256_lddqu_si256((const __m256i *)(words_2 + 192)); \ + AO = avx_intrinsic(A2, A1); \ + _mm256_storeu_si256((__m256i *)(out + 192), AO); \ + A1 = _mm256_lddqu_si256((const __m256i *)(words_1 + 224)); \ + A2 = _mm256_lddqu_si256((const __m256i *)(words_2 + 224)); \ + AO = avx_intrinsic(A2, A1); \ + _mm256_storeu_si256((__m256i *)(out + 224), AO); \ + out += 256; \ + words_1 += 256; \ + words_2 += 256; \ + } \ + dst->cardinality = BITSET_UNKNOWN_CARDINALITY; \ + return dst->cardinality; \ + } + +#define AVX_BITSET_CONTAINER_FN2(before, opname, opsymbol, avx_intrinsic, \ + neon_intrinsic, after) \ + /* next, a version that updates cardinality*/ \ + static inline int _avx2_bitset_container_##opname(const bitset_container_t *src_1, \ + const bitset_container_t *src_2, \ + bitset_container_t *dst) { \ + const __m256i *__restrict__ words_1 = (const __m256i *)src_1->words; \ + const __m256i *__restrict__ words_2 = (const __m256i *)src_2->words; \ + __m256i *out = (__m256i *)dst->words; \ + dst->cardinality = (int32_t)avx2_harley_seal_popcount256andstore_##opname( \ + words_2, words_1, out, \ + BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG)); \ + return dst->cardinality; \ + } \ + +#define AVX_BITSET_CONTAINER_FN3(before, opname, opsymbol, avx_intrinsic, \ + neon_intrinsic, after) \ + /* next, a version that just computes the cardinality*/ \ + static inline int _avx2_bitset_container_##opname##_justcard( \ + const bitset_container_t *src_1, const bitset_container_t *src_2) { \ + const __m256i *__restrict__ data1 = (const __m256i *)src_1->words; \ + const __m256i *__restrict__ data2 = (const __m256i *)src_2->words; \ + return 
(int)avx2_harley_seal_popcount256_##opname( \ + data2, data1, BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG)); \ + } + + +// we duplicate the function because other containers use the "or" term, makes API more consistent +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, or, |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, union, |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 + +// we duplicate the function because other containers use the "intersection" term, makes API more consistent +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, and, &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, intersection, &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 + +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, xor, ^, _mm256_xor_si256, veorq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, andnot, &~, _mm256_andnot_si256, vbicq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 + +// we duplicate the function because other containers use the "or" term, makes API more consistent +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, or, |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, union, |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 + +// we duplicate the function because other containers use the "intersection" term, makes API more consistent +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, and, &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 +CROARING_TARGET_AVX2 
+AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, intersection, &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 + +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, xor, ^, _mm256_xor_si256, veorq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, andnot, &~, _mm256_andnot_si256, vbicq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 + +// we duplicate the function because other containers use the "or" term, makes API more consistent +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, or, |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, union, |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 + +// we duplicate the function because other containers use the "intersection" term, makes API more consistent +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, and, &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, intersection, &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 + +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, xor, ^, _mm256_xor_si256, veorq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 +CROARING_TARGET_AVX2 +AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, andnot, &~, _mm256_andnot_si256, vbicq_u64, CROARING_UNTARGET_AVX2) +CROARING_UNTARGET_AVX2 + + +#define SCALAR_BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, \ + neon_intrinsic) \ + static inline int _scalar_bitset_container_##opname(const bitset_container_t *src_1, \ + const bitset_container_t *src_2, \ + bitset_container_t *dst) { \ + const uint64_t *__restrict__ words_1 = src_1->words; \ + const uint64_t *__restrict__ words_2 = src_2->words; \ + uint64_t 
*out = dst->words; \ + int32_t sum = 0; \ + for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) { \ + const uint64_t word_1 = (words_1[i])opsymbol(words_2[i]), \ + word_2 = (words_1[i + 1]) opsymbol(words_2[i + 1]); \ + out[i] = word_1; \ + out[i + 1] = word_2; \ + sum += roaring_hamming(word_1); \ + sum += roaring_hamming(word_2); \ + } \ + dst->cardinality = sum; \ + return dst->cardinality; \ + } \ + static inline int _scalar_bitset_container_##opname##_nocard( \ + const bitset_container_t *src_1, const bitset_container_t *src_2, \ + bitset_container_t *dst) { \ + const uint64_t *__restrict__ words_1 = src_1->words; \ + const uint64_t *__restrict__ words_2 = src_2->words; \ + uint64_t *out = dst->words; \ + for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i++) { \ + out[i] = (words_1[i])opsymbol(words_2[i]); \ + } \ + dst->cardinality = BITSET_UNKNOWN_CARDINALITY; \ + return dst->cardinality; \ + } \ + static inline int _scalar_bitset_container_##opname##_justcard( \ + const bitset_container_t *src_1, const bitset_container_t *src_2) { \ + const uint64_t *__restrict__ words_1 = src_1->words; \ + const uint64_t *__restrict__ words_2 = src_2->words; \ + int32_t sum = 0; \ + for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) { \ + const uint64_t word_1 = (words_1[i])opsymbol(words_2[i]), \ + word_2 = (words_1[i + 1]) opsymbol(words_2[i + 1]); \ + sum += roaring_hamming(word_1); \ + sum += roaring_hamming(word_2); \ + } \ + return sum; \ + } + +// we duplicate the function because other containers use the "or" term, makes API more consistent +SCALAR_BITSET_CONTAINER_FN(or, |, _mm256_or_si256, vorrq_u64) +SCALAR_BITSET_CONTAINER_FN(union, |, _mm256_or_si256, vorrq_u64) + +// we duplicate the function because other containers use the "intersection" term, makes API more consistent +SCALAR_BITSET_CONTAINER_FN(and, &, _mm256_and_si256, vandq_u64) +SCALAR_BITSET_CONTAINER_FN(intersection, &, _mm256_and_si256, vandq_u64) + 
+SCALAR_BITSET_CONTAINER_FN(xor, ^, _mm256_xor_si256, veorq_u64) +SCALAR_BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64) + +#if CROARING_COMPILER_SUPPORTS_AVX512 +#define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic) \ + int bitset_container_##opname(const bitset_container_t *src_1, \ + const bitset_container_t *src_2, \ + bitset_container_t *dst) { \ + int support = croaring_hardware_support(); \ + if ( support & ROARING_SUPPORTS_AVX512 ) { \ + return _avx512_bitset_container_##opname(src_1, src_2, dst); \ + } \ + else if ( support & ROARING_SUPPORTS_AVX2 ) { \ + return _avx2_bitset_container_##opname(src_1, src_2, dst); \ + } else { \ + return _scalar_bitset_container_##opname(src_1, src_2, dst); \ + } \ + } \ + int bitset_container_##opname##_nocard(const bitset_container_t *src_1, \ + const bitset_container_t *src_2, \ + bitset_container_t *dst) { \ + int support = croaring_hardware_support(); \ + if ( support & ROARING_SUPPORTS_AVX512 ) { \ + return _avx512_bitset_container_##opname##_nocard(src_1, src_2, dst); \ + } \ + else if ( support & ROARING_SUPPORTS_AVX2 ) { \ + return _avx2_bitset_container_##opname##_nocard(src_1, src_2, dst); \ + } else { \ + return _scalar_bitset_container_##opname##_nocard(src_1, src_2, dst); \ + } \ + } \ + int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \ + const bitset_container_t *src_2) { \ + int support = croaring_hardware_support(); \ + if ( support & ROARING_SUPPORTS_AVX512 ) { \ + return _avx512_bitset_container_##opname##_justcard(src_1, src_2); \ + } \ + else if ( support & ROARING_SUPPORTS_AVX2 ) { \ + return _avx2_bitset_container_##opname##_justcard(src_1, src_2); \ + } else { \ + return _scalar_bitset_container_##opname##_justcard(src_1, src_2); \ + } \ + } + +#else // CROARING_COMPILER_SUPPORTS_AVX512 + + +#define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic) \ + int bitset_container_##opname(const bitset_container_t *src_1, \ + 
const bitset_container_t *src_2, \ + bitset_container_t *dst) { \ + if ( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { \ + return _avx2_bitset_container_##opname(src_1, src_2, dst); \ + } else { \ + return _scalar_bitset_container_##opname(src_1, src_2, dst); \ + } \ + } \ + int bitset_container_##opname##_nocard(const bitset_container_t *src_1, \ + const bitset_container_t *src_2, \ + bitset_container_t *dst) { \ + if ( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { \ + return _avx2_bitset_container_##opname##_nocard(src_1, src_2, dst); \ + } else { \ + return _scalar_bitset_container_##opname##_nocard(src_1, src_2, dst); \ + } \ + } \ + int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \ + const bitset_container_t *src_2) { \ + if ( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { \ + return _avx2_bitset_container_##opname##_justcard(src_1, src_2); \ + } else { \ + return _scalar_bitset_container_##opname##_justcard(src_1, src_2); \ + } \ + } + +#endif // CROARING_COMPILER_SUPPORTS_AVX512 + +#elif defined(CROARING_USENEON) + +#define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic) \ +int bitset_container_##opname(const bitset_container_t *src_1, \ + const bitset_container_t *src_2, \ + bitset_container_t *dst) { \ + const uint64_t * __restrict__ words_1 = src_1->words; \ + const uint64_t * __restrict__ words_2 = src_2->words; \ + uint64_t *out = dst->words; \ + uint16x8_t n0 = vdupq_n_u16(0); \ + uint16x8_t n1 = vdupq_n_u16(0); \ + uint16x8_t n2 = vdupq_n_u16(0); \ + uint16x8_t n3 = vdupq_n_u16(0); \ + for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) { \ + uint64x2_t c0 = neon_intrinsic(vld1q_u64(&words_1[i + 0]), \ + vld1q_u64(&words_2[i + 0])); \ + n0 = vaddq_u16(n0, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c0)))); \ + vst1q_u64(&out[i + 0], c0); \ + uint64x2_t c1 = neon_intrinsic(vld1q_u64(&words_1[i + 2]), \ + vld1q_u64(&words_2[i + 2])); \ + n1 = vaddq_u16(n1, 
vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c1)))); \ + vst1q_u64(&out[i + 2], c1); \ + uint64x2_t c2 = neon_intrinsic(vld1q_u64(&words_1[i + 4]), \ + vld1q_u64(&words_2[i + 4])); \ + n2 = vaddq_u16(n2, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c2)))); \ + vst1q_u64(&out[i + 4], c2); \ + uint64x2_t c3 = neon_intrinsic(vld1q_u64(&words_1[i + 6]), \ + vld1q_u64(&words_2[i + 6])); \ + n3 = vaddq_u16(n3, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c3)))); \ + vst1q_u64(&out[i + 6], c3); \ + } \ + uint64x2_t n = vdupq_n_u64(0); \ + n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n0))); \ + n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n1))); \ + n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n2))); \ + n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n3))); \ + dst->cardinality = vgetq_lane_u64(n, 0) + vgetq_lane_u64(n, 1); \ + return dst->cardinality; \ +} \ +int bitset_container_##opname##_nocard(const bitset_container_t *src_1, \ + const bitset_container_t *src_2, \ + bitset_container_t *dst) { \ + const uint64_t * __restrict__ words_1 = src_1->words; \ + const uint64_t * __restrict__ words_2 = src_2->words; \ + uint64_t *out = dst->words; \ + for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) { \ + vst1q_u64(&out[i + 0], neon_intrinsic(vld1q_u64(&words_1[i + 0]), \ + vld1q_u64(&words_2[i + 0]))); \ + vst1q_u64(&out[i + 2], neon_intrinsic(vld1q_u64(&words_1[i + 2]), \ + vld1q_u64(&words_2[i + 2]))); \ + vst1q_u64(&out[i + 4], neon_intrinsic(vld1q_u64(&words_1[i + 4]), \ + vld1q_u64(&words_2[i + 4]))); \ + vst1q_u64(&out[i + 6], neon_intrinsic(vld1q_u64(&words_1[i + 6]), \ + vld1q_u64(&words_2[i + 6]))); \ + } \ + dst->cardinality = BITSET_UNKNOWN_CARDINALITY; \ + return dst->cardinality; \ +} \ +int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \ + const bitset_container_t *src_2) { \ + const uint64_t * __restrict__ words_1 = src_1->words; \ + const uint64_t * __restrict__ words_2 = src_2->words; \ + uint16x8_t n0 = vdupq_n_u16(0); \ + uint16x8_t n1 = 
vdupq_n_u16(0); \ + uint16x8_t n2 = vdupq_n_u16(0); \ + uint16x8_t n3 = vdupq_n_u16(0); \ + for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) { \ + uint64x2_t c0 = neon_intrinsic(vld1q_u64(&words_1[i + 0]), \ + vld1q_u64(&words_2[i + 0])); \ + n0 = vaddq_u16(n0, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c0)))); \ + uint64x2_t c1 = neon_intrinsic(vld1q_u64(&words_1[i + 2]), \ + vld1q_u64(&words_2[i + 2])); \ + n1 = vaddq_u16(n1, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c1)))); \ + uint64x2_t c2 = neon_intrinsic(vld1q_u64(&words_1[i + 4]), \ + vld1q_u64(&words_2[i + 4])); \ + n2 = vaddq_u16(n2, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c2)))); \ + uint64x2_t c3 = neon_intrinsic(vld1q_u64(&words_1[i + 6]), \ + vld1q_u64(&words_2[i + 6])); \ + n3 = vaddq_u16(n3, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c3)))); \ + } \ + uint64x2_t n = vdupq_n_u64(0); \ + n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n0))); \ + n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n1))); \ + n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n2))); \ + n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n3))); \ + return vgetq_lane_u64(n, 0) + vgetq_lane_u64(n, 1); \ +} + +#else + +#define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic) \ +int bitset_container_##opname(const bitset_container_t *src_1, \ + const bitset_container_t *src_2, \ + bitset_container_t *dst) { \ + const uint64_t * __restrict__ words_1 = src_1->words; \ + const uint64_t * __restrict__ words_2 = src_2->words; \ + uint64_t *out = dst->words; \ + int32_t sum = 0; \ + for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) { \ + const uint64_t word_1 = (words_1[i])opsymbol(words_2[i]), \ + word_2 = (words_1[i + 1])opsymbol(words_2[i + 1]); \ + out[i] = word_1; \ + out[i + 1] = word_2; \ + sum += roaring_hamming(word_1); \ + sum += roaring_hamming(word_2); \ + } \ + dst->cardinality = sum; \ + return dst->cardinality; \ +} \ +int bitset_container_##opname##_nocard(const bitset_container_t *src_1, \ + const 
bitset_container_t *src_2, \ + bitset_container_t *dst) { \ + const uint64_t * __restrict__ words_1 = src_1->words; \ + const uint64_t * __restrict__ words_2 = src_2->words; \ + uint64_t *out = dst->words; \ + for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i++) { \ + out[i] = (words_1[i])opsymbol(words_2[i]); \ + } \ + dst->cardinality = BITSET_UNKNOWN_CARDINALITY; \ + return dst->cardinality; \ +} \ +int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \ + const bitset_container_t *src_2) { \ + /* only sums the population counts of (src_1 op src_2); stores nothing and prints nothing */ \ + const uint64_t * __restrict__ words_1 = src_1->words; \ + const uint64_t * __restrict__ words_2 = src_2->words; \ + int32_t sum = 0; \ + for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) { \ + const uint64_t word_1 = (words_1[i])opsymbol(words_2[i]), \ + word_2 = (words_1[i + 1])opsymbol(words_2[i + 1]); \ + sum += roaring_hamming(word_1); \ + sum += roaring_hamming(word_2); \ + } \ + return sum; \ +} + +#endif // CROARING_IS_X64 + +// we duplicate the function because other containers use the "or" term, makes API more consistent +BITSET_CONTAINER_FN(or, |, _mm256_or_si256, vorrq_u64) +BITSET_CONTAINER_FN(union, |, _mm256_or_si256, vorrq_u64) + +// we duplicate the function because other containers use the "intersection" term, makes API more consistent +BITSET_CONTAINER_FN(and, &, _mm256_and_si256, vandq_u64) +BITSET_CONTAINER_FN(intersection, &, _mm256_and_si256, vandq_u64) + +BITSET_CONTAINER_FN(xor, ^, _mm256_xor_si256, veorq_u64) +BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64) +// clang-format on + + +ALLOW_UNALIGNED +int bitset_container_to_uint32_array( + uint32_t *out, + const bitset_container_t *bc, + uint32_t base +){ +#if CROARING_IS_X64 + int support = croaring_hardware_support(); +#if CROARING_COMPILER_SUPPORTS_AVX512 + if(( support & ROARING_SUPPORTS_AVX512 ) && (bc->cardinality >= 8192)) // heuristic + return (int) bitset_extract_setbits_avx512(bc->words, +
BITSET_CONTAINER_SIZE_IN_WORDS, out, bc->cardinality, base); + else +#endif + if(( support & ROARING_SUPPORTS_AVX2 ) && (bc->cardinality >= 8192)) // heuristic + return (int) bitset_extract_setbits_avx2(bc->words, + BITSET_CONTAINER_SIZE_IN_WORDS, out, bc->cardinality, base); + else + return (int) bitset_extract_setbits(bc->words, + BITSET_CONTAINER_SIZE_IN_WORDS, out, base); +#else + return (int) bitset_extract_setbits(bc->words, + BITSET_CONTAINER_SIZE_IN_WORDS, out, base); +#endif +} + +/* + * Print this container using printf (useful for debugging). + */ +void bitset_container_printf(const bitset_container_t * v) { + printf("{"); + uint32_t base = 0; + bool iamfirst = true;// TODO: rework so that this is not necessary yet still readable + for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i) { + uint64_t w = v->words[i]; + while (w != 0) { + uint64_t t = w & (~w + 1); + int r = roaring_trailing_zeroes(w); + if(iamfirst) {// predicted to be false + printf("%u",base + r); + iamfirst = false; + } else { + printf(",%u",base + r); + } + w ^= t; + } + base += 64; + } + printf("}"); +} + + +/* + * Print this container using printf as a comma-separated list of 32-bit integers starting at base. 
+ */ +void bitset_container_printf_as_uint32_array(const bitset_container_t * v, uint32_t base) { + bool iamfirst = true;// TODO: rework so that this is not necessary yet still readable + for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i) { + uint64_t w = v->words[i]; + while (w != 0) { + uint64_t t = w & (~w + 1); + int r = roaring_trailing_zeroes(w); + if(iamfirst) {// predicted to be false + printf("%u", r + base); + iamfirst = false; + } else { + printf(",%u",r + base); + } + w ^= t; + } + base += 64; + } +} + + +// TODO: use the fast lower bound, also +int bitset_container_number_of_runs(bitset_container_t *bc) { + int num_runs = 0; + uint64_t next_word = bc->words[0]; + + for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS-1; ++i) { + uint64_t word = next_word; + next_word = bc->words[i+1]; + num_runs += roaring_hamming((~word) & (word << 1)) + ( (word >> 63) & ~next_word); + } + + uint64_t word = next_word; + num_runs += roaring_hamming((~word) & (word << 1)); + if((word & 0x8000000000000000ULL) != 0) + num_runs++; + return num_runs; +} + + +int32_t bitset_container_write(const bitset_container_t *container, + char *buf) { + memcpy(buf, container->words, BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t)); + return bitset_container_size_in_bytes(container); +} + + +int32_t bitset_container_read(int32_t cardinality, bitset_container_t *container, + const char *buf) { + container->cardinality = cardinality; + memcpy(container->words, buf, BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t)); + return bitset_container_size_in_bytes(container); +} + +bool bitset_container_iterate(const bitset_container_t *cont, uint32_t base, roaring_iterator iterator, void *ptr) { + for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i ) { + uint64_t w = cont->words[i]; + while (w != 0) { + uint64_t t = w & (~w + 1); + int r = roaring_trailing_zeroes(w); + if(!iterator(r + base, ptr)) return false; + w ^= t; + } + base += 64; + } + return true; +} + +bool 
bitset_container_iterate64(const bitset_container_t *cont, uint32_t base, roaring_iterator64 iterator, uint64_t high_bits, void *ptr) { + for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i ) { + uint64_t w = cont->words[i]; + while (w != 0) { + uint64_t t = w & (~w + 1); + int r = roaring_trailing_zeroes(w); + if(!iterator(high_bits | (uint64_t)(r + base), ptr)) return false; + w ^= t; + } + base += 64; + } + return true; +} + +#if CROARING_IS_X64 +#if CROARING_COMPILER_SUPPORTS_AVX512 +CROARING_TARGET_AVX512 +ALLOW_UNALIGNED +static inline bool _avx512_bitset_container_equals(const bitset_container_t *container1, const bitset_container_t *container2) { + const __m512i *ptr1 = (const __m512i*)container1->words; + const __m512i *ptr2 = (const __m512i*)container2->words; + for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS*sizeof(uint64_t)/64; i++) { + __m512i r1 = _mm512_loadu_si512(ptr1+i); + __m512i r2 = _mm512_loadu_si512(ptr2+i); + __mmask64 mask = _mm512_cmpeq_epi8_mask(r1, r2); + if ((uint64_t)mask != UINT64_MAX) { + return false; + } + } + return true; +} +CROARING_UNTARGET_AVX512 +#endif // CROARING_COMPILER_SUPPORTS_AVX512 +CROARING_TARGET_AVX2 +ALLOW_UNALIGNED +static inline bool _avx2_bitset_container_equals(const bitset_container_t *container1, const bitset_container_t *container2) { + const __m256i *ptr1 = (const __m256i*)container1->words; + const __m256i *ptr2 = (const __m256i*)container2->words; + for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS*sizeof(uint64_t)/32; i++) { + __m256i r1 = _mm256_loadu_si256(ptr1+i); + __m256i r2 = _mm256_loadu_si256(ptr2+i); + int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(r1, r2)); + if ((uint32_t)mask != UINT32_MAX) { + return false; + } + } + return true; +} +CROARING_UNTARGET_AVX2 +#endif // CROARING_IS_X64 + +ALLOW_UNALIGNED +bool bitset_container_equals(const bitset_container_t *container1, const bitset_container_t *container2) { + if((container1->cardinality != BITSET_UNKNOWN_CARDINALITY) && 
(container2->cardinality != BITSET_UNKNOWN_CARDINALITY)) { + if(container1->cardinality != container2->cardinality) { + return false; + } + if (container1->cardinality == INT32_C(0x10000)) { + return true; + } + } +#if CROARING_IS_X64 + int support = croaring_hardware_support(); +#if CROARING_COMPILER_SUPPORTS_AVX512 + if( support & ROARING_SUPPORTS_AVX512 ) { + return _avx512_bitset_container_equals(container1, container2); + } + else +#endif + if( support & ROARING_SUPPORTS_AVX2 ) { + return _avx2_bitset_container_equals(container1, container2); + } +#endif + return memcmp(container1->words, + container2->words, + BITSET_CONTAINER_SIZE_IN_WORDS*sizeof(uint64_t)) == 0; +} + +bool bitset_container_is_subset(const bitset_container_t *container1, + const bitset_container_t *container2) { + if((container1->cardinality != BITSET_UNKNOWN_CARDINALITY) && (container2->cardinality != BITSET_UNKNOWN_CARDINALITY)) { + if(container1->cardinality > container2->cardinality) { + return false; + } + } + for(int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i ) { + if((container1->words[i] & container2->words[i]) != container1->words[i]) { + return false; + } + } + return true; +} + +bool bitset_container_select(const bitset_container_t *container, uint32_t *start_rank, uint32_t rank, uint32_t *element) { + int card = bitset_container_cardinality(container); + if(rank >= *start_rank + card) { + *start_rank += card; + return false; + } + const uint64_t *words = container->words; + int32_t size; + for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 1) { + size = roaring_hamming(words[i]); + if(rank <= *start_rank + size) { + uint64_t w = container->words[i]; + uint16_t base = i*64; + while (w != 0) { + uint64_t t = w & (~w + 1); + int r = roaring_trailing_zeroes(w); + if(*start_rank == rank) { + *element = r+base; + return true; + } + w ^= t; + *start_rank += 1; + } + } + else + *start_rank += size; + } + assert(false); + roaring_unreachable; +} + + +/* Returns the smallest 
value (assumes not empty) */ +uint16_t bitset_container_minimum(const bitset_container_t *container) { + for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i ) { + uint64_t w = container->words[i]; + if (w != 0) { + int r = roaring_trailing_zeroes(w); + return r + i * 64; + } + } + return UINT16_MAX; +} + +/* Returns the largest value (assumes not empty) */ +uint16_t bitset_container_maximum(const bitset_container_t *container) { + for (int32_t i = BITSET_CONTAINER_SIZE_IN_WORDS - 1; i > 0; --i ) { + uint64_t w = container->words[i]; + if (w != 0) { + int r = roaring_leading_zeroes(w); + return i * 64 + 63 - r; + } + } + return 0; +} + +/* Returns the number of values equal or smaller than x */ +int bitset_container_rank(const bitset_container_t *container, uint16_t x) { + // credit: aqrit + int sum = 0; + int i = 0; + for (int end = x / 64; i < end; i++){ + sum += roaring_hamming(container->words[i]); + } + uint64_t lastword = container->words[i]; + uint64_t lastpos = UINT64_C(1) << (x % 64); + uint64_t mask = lastpos + lastpos - 1; // smear right + sum += roaring_hamming(lastword & mask); + return sum; +} + +/* Returns the index of x , if not exsist return -1 */ +int bitset_container_get_index(const bitset_container_t *container, uint16_t x) { + if (bitset_container_get(container, x)) { + // credit: aqrit + int sum = 0; + int i = 0; + for (int end = x / 64; i < end; i++){ + sum += roaring_hamming(container->words[i]); + } + uint64_t lastword = container->words[i]; + uint64_t lastpos = UINT64_C(1) << (x % 64); + uint64_t mask = lastpos + lastpos - 1; // smear right + sum += roaring_hamming(lastword & mask); + return sum - 1; + } else { + return -1; + } +} + +/* Returns the index of the first value equal or larger than x, or -1 */ +int bitset_container_index_equalorlarger(const bitset_container_t *container, uint16_t x) { + uint32_t x32 = x; + uint32_t k = x32 / 64; + uint64_t word = container->words[k]; + const int diff = x32 - k * 64; // in [0,64) + word = 
(word >> diff) << diff; // a mask is faster, but we don't care + while(word == 0) { + k++; + if(k == BITSET_CONTAINER_SIZE_IN_WORDS) return -1; + word = container->words[k]; + } + return k * 64 + roaring_trailing_zeroes(word); +} + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif +/* end file src/containers/bitset.c */ +/* begin file src/containers/containers.c */ + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +extern inline const container_t *container_unwrap_shared( + const container_t *candidate_shared_container, uint8_t *type); + +extern inline container_t *container_mutable_unwrap_shared( + container_t *candidate_shared_container, uint8_t *type); + +extern inline int container_get_cardinality( + const container_t *c, uint8_t typecode); + +extern inline container_t *container_iand( + container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2, + uint8_t *result_type); + +extern inline container_t *container_ior( + container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2, + uint8_t *result_type); + +extern inline container_t *container_ixor( + container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2, + uint8_t *result_type); + +extern inline container_t *container_iandnot( + container_t *c1, uint8_t type1, + const container_t *c2, uint8_t type2, + uint8_t *result_type); + +void container_free(container_t *c, uint8_t type) { + switch (type) { + case BITSET_CONTAINER_TYPE: + bitset_container_free(CAST_bitset(c)); + break; + case ARRAY_CONTAINER_TYPE: + array_container_free(CAST_array(c)); + break; + case RUN_CONTAINER_TYPE: + run_container_free(CAST_run(c)); + break; + case SHARED_CONTAINER_TYPE: + shared_container_free(CAST_shared(c)); + break; + default: + assert(false); + roaring_unreachable; + } +} + +void container_printf(const container_t *c, uint8_t type) { + c = container_unwrap_shared(c, &type); + switch (type) { + case 
BITSET_CONTAINER_TYPE: + bitset_container_printf(const_CAST_bitset(c)); + return; + case ARRAY_CONTAINER_TYPE: + array_container_printf(const_CAST_array(c)); + return; + case RUN_CONTAINER_TYPE: + run_container_printf(const_CAST_run(c)); + return; + default: + roaring_unreachable; + } +} + +void container_printf_as_uint32_array( + const container_t *c, uint8_t typecode, + uint32_t base +){ + c = container_unwrap_shared(c, &typecode); + switch (typecode) { + case BITSET_CONTAINER_TYPE: + bitset_container_printf_as_uint32_array( + const_CAST_bitset(c), base); + return; + case ARRAY_CONTAINER_TYPE: + array_container_printf_as_uint32_array( + const_CAST_array(c), base); + return; + case RUN_CONTAINER_TYPE: + run_container_printf_as_uint32_array( + const_CAST_run(c), base); + return; + default: + roaring_unreachable; + } +} + +extern inline bool container_nonzero_cardinality( + const container_t *c, uint8_t typecode); + +extern inline int container_to_uint32_array( + uint32_t *output, + const container_t *c, uint8_t typecode, + uint32_t base); + +extern inline container_t *container_add( + container_t *c, + uint16_t val, + uint8_t typecode, // !!! 2nd arg? + uint8_t *new_typecode); + +extern inline bool container_contains( + const container_t *c, + uint16_t val, + uint8_t typecode); // !!! 2nd arg? 
extern inline container_t *container_and(
        const container_t *c1, uint8_t type1,
        const container_t *c2, uint8_t type2,
        uint8_t *result_type);

extern inline container_t *container_or(
        const container_t *c1, uint8_t type1,
        const container_t *c2, uint8_t type2,
        uint8_t *result_type);

extern inline container_t *container_xor(
        const container_t *c1, uint8_t type1,
        const container_t *c2, uint8_t type2,
        uint8_t *result_type);

/* Produces a second handle on container c.
 *
 * With copy_on_write set:
 *   - if c is already a shared container, its counter is incremented and
 *     c itself is returned;
 *   - otherwise a new shared_container_t wrapping c is allocated with its
 *     counter set to 2, *typecode becomes SHARED_CONTAINER_TYPE and the
 *     wrapper is returned. NULL is returned if that allocation fails
 *     (in which case *typecode is left untouched).
 *
 * Without copy_on_write, c is unwrapped (updating *typecode) and the
 * result of container_clone() on the underlying container is returned.
 */
container_t *get_copy_of_container(
    container_t *c, uint8_t *typecode,
    bool copy_on_write
){
    if (copy_on_write) {
        shared_container_t *shared_container;
        if (*typecode == SHARED_CONTAINER_TYPE) {
            // Already shared: one more holder, no new allocation.
            shared_container = CAST_shared(c);
            croaring_refcount_inc(&shared_container->counter);
            return shared_container;
        }
        assert(*typecode != SHARED_CONTAINER_TYPE);

        if ((shared_container = (shared_container_t *)roaring_malloc(
                 sizeof(shared_container_t))) == NULL) {
            return NULL;
        }

        shared_container->container = c;
        shared_container->typecode = *typecode;
        // At this point, we are creating new shared container
        // so there should be no other references, and setting
        // the counter to 2 - even non-atomically - is safe as
        // long as the value is set before the return statement.
        shared_container->counter = 2;
        *typecode = SHARED_CONTAINER_TYPE;

        return shared_container;
    }  // copy_on_write
    // otherwise, no copy on write...
    const container_t *actual_container = container_unwrap_shared(c, typecode);
    assert(*typecode != SHARED_CONTAINER_TYPE);
    return container_clone(actual_container, *typecode);
}

/**
 * Copies a container, requires a typecode. This allocates new memory, caller
 * is responsible for deallocation.
 *
 * Returns NULL when typecode is SHARED_CONTAINER_TYPE: shared containers are
 * deliberately not cloned here, so callers must unwrap them first.
 */
container_t *container_clone(const container_t *c, uint8_t typecode) {
    // We do not want to allow cloning of shared containers.
    // c = container_unwrap_shared(c, &typecode);
    switch (typecode) {
        case BITSET_CONTAINER_TYPE:
            return bitset_container_clone(const_CAST_bitset(c));
        case ARRAY_CONTAINER_TYPE:
            return array_container_clone(const_CAST_array(c));
        case RUN_CONTAINER_TYPE:
            return run_container_clone(const_CAST_run(c));
        case SHARED_CONTAINER_TYPE:
            // Shared containers are not cloneable. Are you mixing COW and non-COW bitmaps?
            return NULL;
        default:
            assert(false);
            roaring_unreachable;
            return NULL;
    }
}

/* Gives up one reference on shared container sc and returns a container the
 * caller may use on its own; *typecode receives the wrapped container's type.
 *
 * When croaring_refcount_dec() reports true (presumably: this was the last
 * reference -- confirm against its definition), the wrapped container is
 * handed over directly and the wrapper is freed. Otherwise a clone of the
 * wrapped container is returned and the wrapper stays alive for the other
 * holders.
 */
container_t *shared_container_extract_copy(
    shared_container_t *sc, uint8_t *typecode
){
    assert(sc->typecode != SHARED_CONTAINER_TYPE);
    *typecode = sc->typecode;
    container_t *answer;
    if (croaring_refcount_dec(&sc->counter)) {
        answer = sc->container;
        sc->container = NULL;  // paranoid
        roaring_free(sc);
    } else {
        answer = container_clone(sc->container, *typecode);
    }
    assert(*typecode != SHARED_CONTAINER_TYPE);
    return answer;
}

/* Drops one reference on a shared container. The wrapped container and the
 * wrapper itself are released only when croaring_refcount_dec() reports
 * true; otherwise nothing is freed. */
void shared_container_free(shared_container_t *container) {
    if (croaring_refcount_dec(&container->counter)) {
        assert(container->typecode != SHARED_CONTAINER_TYPE);
        container_free(container->container, container->typecode);
        container->container = NULL;  // paranoid
        roaring_free(container);
    }
}

extern inline container_t *container_not(
        const container_t *c1, uint8_t type1,
        uint8_t *result_type);

extern inline container_t *container_not_range(
        const container_t *c1, uint8_t type1,
        uint32_t range_start, uint32_t range_end,
        uint8_t *result_type);

extern inline container_t *container_inot(
        container_t *c1, uint8_t type1,
        uint8_t *result_type);

extern inline container_t *container_inot_range(
        container_t *c1, uint8_t type1,
        uint32_t range_start, uint32_t range_end,
        uint8_t *result_type);

extern inline container_t *container_range_of_ones(
        uint32_t range_start, uint32_t range_end,
        uint8_t *result_type);

// where are the corresponding things for union and intersection??
extern inline container_t *container_lazy_xor(
        const container_t *c1, uint8_t type1,
        const container_t *c2, uint8_t type2,
        uint8_t *result_type);

extern inline container_t *container_lazy_ixor(
        container_t *c1, uint8_t type1,
        const container_t *c2, uint8_t type2,
        uint8_t *result_type);

extern inline container_t *container_andnot(
        const container_t *c1, uint8_t type1,
        const container_t *c2, uint8_t type2,
        uint8_t *result_type);

#ifdef __cplusplus
} } }  // extern "C" { namespace roaring { namespace internal {
#endif
/* end file src/containers/containers.c */
/* begin file src/containers/convert.c */
/* NOTE(review): the directive below has no header name in this copy of the
 * file; it looks like a <...> argument was lost -- restore it from the
 * original source before building. */
#include


#if CROARING_IS_X64
#ifndef CROARING_COMPILER_SUPPORTS_AVX512
#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
#endif // CROARING_COMPILER_SUPPORTS_AVX512
#endif

#ifdef __cplusplus
extern "C" { namespace roaring { namespace internal {
#endif

// file contains grubby stuff that must know impl. details of all container
// types.
+bitset_container_t *bitset_container_from_array(const array_container_t *ac) { + bitset_container_t *ans = bitset_container_create(); + int limit = array_container_cardinality(ac); + for (int i = 0; i < limit; ++i) bitset_container_set(ans, ac->array[i]); + return ans; +} + +bitset_container_t *bitset_container_from_run(const run_container_t *arr) { + int card = run_container_cardinality(arr); + bitset_container_t *answer = bitset_container_create(); + for (int rlepos = 0; rlepos < arr->n_runs; ++rlepos) { + rle16_t vl = arr->runs[rlepos]; + bitset_set_lenrange(answer->words, vl.value, vl.length); + } + answer->cardinality = card; + return answer; +} + +array_container_t *array_container_from_run(const run_container_t *arr) { + array_container_t *answer = + array_container_create_given_capacity(run_container_cardinality(arr)); + answer->cardinality = 0; + for (int rlepos = 0; rlepos < arr->n_runs; ++rlepos) { + int run_start = arr->runs[rlepos].value; + int run_end = run_start + arr->runs[rlepos].length; + + for (int run_value = run_start; run_value <= run_end; ++run_value) { + answer->array[answer->cardinality++] = (uint16_t)run_value; + } + } + return answer; +} + +array_container_t *array_container_from_bitset(const bitset_container_t *bits) { + array_container_t *result = + array_container_create_given_capacity(bits->cardinality); + result->cardinality = bits->cardinality; +#if CROARING_IS_X64 +#if CROARING_COMPILER_SUPPORTS_AVX512 + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX512 ) { + bitset_extract_setbits_avx512_uint16(bits->words, BITSET_CONTAINER_SIZE_IN_WORDS, + result->array, bits->cardinality , 0); + } else +#endif + { + // sse version ends up being slower here + // (bitset_extract_setbits_sse_uint16) + // because of the sparsity of the data + bitset_extract_setbits_uint16(bits->words, BITSET_CONTAINER_SIZE_IN_WORDS, + result->array, 0); + } +#else + // If the system is not x64, then we have no accelerated function. 
+ bitset_extract_setbits_uint16(bits->words, BITSET_CONTAINER_SIZE_IN_WORDS, + result->array, 0); +#endif + + + return result; +} + +/* assumes that container has adequate space. Run from [s,e] (inclusive) */ +static void add_run(run_container_t *rc, int s, int e) { + rc->runs[rc->n_runs].value = s; + rc->runs[rc->n_runs].length = e - s; + rc->n_runs++; +} + +run_container_t *run_container_from_array(const array_container_t *c) { + int32_t n_runs = array_container_number_of_runs(c); + run_container_t *answer = run_container_create_given_capacity(n_runs); + int prev = -2; + int run_start = -1; + int32_t card = c->cardinality; + if (card == 0) return answer; + for (int i = 0; i < card; ++i) { + const uint16_t cur_val = c->array[i]; + if (cur_val != prev + 1) { + // new run starts; flush old one, if any + if (run_start != -1) add_run(answer, run_start, prev); + run_start = cur_val; + } + prev = c->array[i]; + } + // now prev is the last seen value + add_run(answer, run_start, prev); + // assert(run_container_cardinality(answer) == c->cardinality); + return answer; +} + +/** + * Convert the runcontainer to either a Bitmap or an Array Container, depending + * on the cardinality. Frees the container. + * Allocates and returns new container, which caller is responsible for freeing. + * It does not free the run container. 
+ */ +container_t *convert_to_bitset_or_array_container( + run_container_t *rc, int32_t card, + uint8_t *resulttype +){ + if (card <= DEFAULT_MAX_SIZE) { + array_container_t *answer = array_container_create_given_capacity(card); + answer->cardinality = 0; + for (int rlepos = 0; rlepos < rc->n_runs; ++rlepos) { + uint16_t run_start = rc->runs[rlepos].value; + uint16_t run_end = run_start + rc->runs[rlepos].length; + for (uint16_t run_value = run_start; run_value < run_end; + ++run_value) { + answer->array[answer->cardinality++] = run_value; + } + answer->array[answer->cardinality++] = run_end; + } + assert(card == answer->cardinality); + *resulttype = ARRAY_CONTAINER_TYPE; + //run_container_free(r); + return answer; + } + bitset_container_t *answer = bitset_container_create(); + for (int rlepos = 0; rlepos < rc->n_runs; ++rlepos) { + uint16_t run_start = rc->runs[rlepos].value; + bitset_set_lenrange(answer->words, run_start, rc->runs[rlepos].length); + } + answer->cardinality = card; + *resulttype = BITSET_CONTAINER_TYPE; + //run_container_free(r); + return answer; +} + +/* Converts a run container to either an array or a bitset, IF it saves space. + */ +/* If a conversion occurs, the caller is responsible to free the original + * container and + * he becomes responsible to free the new one. */ +container_t *convert_run_to_efficient_container( + run_container_t *c, + uint8_t *typecode_after +){ + int32_t size_as_run_container = + run_container_serialized_size_in_bytes(c->n_runs); + + int32_t size_as_bitset_container = + bitset_container_serialized_size_in_bytes(); + int32_t card = run_container_cardinality(c); + int32_t size_as_array_container = + array_container_serialized_size_in_bytes(card); + + int32_t min_size_non_run = + size_as_bitset_container < size_as_array_container + ? 
size_as_bitset_container + : size_as_array_container; + if (size_as_run_container <= min_size_non_run) { // no conversion + *typecode_after = RUN_CONTAINER_TYPE; + return c; + } + if (card <= DEFAULT_MAX_SIZE) { + // to array + array_container_t *answer = array_container_create_given_capacity(card); + answer->cardinality = 0; + for (int rlepos = 0; rlepos < c->n_runs; ++rlepos) { + int run_start = c->runs[rlepos].value; + int run_end = run_start + c->runs[rlepos].length; + + for (int run_value = run_start; run_value <= run_end; ++run_value) { + answer->array[answer->cardinality++] = (uint16_t)run_value; + } + } + *typecode_after = ARRAY_CONTAINER_TYPE; + return answer; + } + + // else to bitset + bitset_container_t *answer = bitset_container_create(); + + for (int rlepos = 0; rlepos < c->n_runs; ++rlepos) { + int start = c->runs[rlepos].value; + int end = start + c->runs[rlepos].length; + bitset_set_range(answer->words, start, end + 1); + } + answer->cardinality = card; + *typecode_after = BITSET_CONTAINER_TYPE; + return answer; +} + +// like convert_run_to_efficient_container but frees the old result if needed +container_t *convert_run_to_efficient_container_and_free( + run_container_t *c, + uint8_t *typecode_after +){ + container_t *answer = convert_run_to_efficient_container(c, typecode_after); + if (answer != c) run_container_free(c); + return answer; +} + +/* once converted, the original container is disposed here, rather than + in roaring_array +*/ + +// TODO: split into run- array- and bitset- subfunctions for sanity; +// a few function calls won't really matter. 
/* Re-encodes container c as whichever of its current type and the run
 * representation has the smaller serialized size.
 *
 *  - RUN input: delegates to convert_run_to_efficient_container() and frees
 *    c when a different container comes back.
 *  - ARRAY input: converted to a run container only when the run encoding is
 *    strictly smaller; the array is then freed.
 *  - BITSET input: converted to a run container only when the run encoding
 *    is strictly smaller; the bitset is then freed.
 *
 * *typecode_after receives the type of the returned container. When no
 * conversion takes place, c itself is returned.
 */
container_t *convert_run_optimize(
    container_t *c, uint8_t typecode_original,
    uint8_t *typecode_after
){
    if (typecode_original == RUN_CONTAINER_TYPE) {
        container_t *newc = convert_run_to_efficient_container(
                                    CAST_run(c), typecode_after);
        if (newc != c) {
            container_free(c, typecode_original);
        }
        return newc;
    } else if (typecode_original == ARRAY_CONTAINER_TYPE) {
        // it might need to be converted to a run container.
        array_container_t *c_qua_array = CAST_array(c);
        int32_t n_runs = array_container_number_of_runs(c_qua_array);
        int32_t size_as_run_container =
            run_container_serialized_size_in_bytes(n_runs);
        int32_t card = array_container_cardinality(c_qua_array);
        int32_t size_as_array_container =
            array_container_serialized_size_in_bytes(card);

        if (size_as_run_container >= size_as_array_container) {
            *typecode_after = ARRAY_CONTAINER_TYPE;
            return c;
        }
        // else convert array to run container
        run_container_t *answer = run_container_create_given_capacity(n_runs);
        // Sentinels: prev = -2 guarantees the first value opens a run,
        // run_start = -1 means "no run open yet".
        int prev = -2;
        int run_start = -1;

        assert(card > 0);
        for (int i = 0; i < card; ++i) {
            uint16_t cur_val = c_qua_array->array[i];
            if (cur_val != prev + 1) {
                // new run starts; flush old one, if any
                if (run_start != -1) add_run(answer, run_start, prev);
                run_start = cur_val;
            }
            prev = c_qua_array->array[i];
        }
        assert(run_start >= 0);
        // now prev is the last seen value
        add_run(answer, run_start, prev);
        *typecode_after = RUN_CONTAINER_TYPE;
        array_container_free(c_qua_array);
        return answer;
    } else if (typecode_original ==
               BITSET_CONTAINER_TYPE) {  // run conversions on bitset
        // does bitset need conversion to run?
        bitset_container_t *c_qua_bitset = CAST_bitset(c);
        int32_t n_runs = bitset_container_number_of_runs(c_qua_bitset);
        int32_t size_as_run_container =
            run_container_serialized_size_in_bytes(n_runs);
        int32_t size_as_bitset_container =
            bitset_container_serialized_size_in_bytes();

        if (size_as_bitset_container <= size_as_run_container) {
            // no conversion needed.
            *typecode_after = BITSET_CONTAINER_TYPE;
            return c;
        }
        // bitset to runcontainer (ported from Java  RunContainer(
        // BitmapContainer bc, int nbrRuns))
        assert(n_runs > 0);  // no empty bitmaps
        run_container_t *answer = run_container_create_given_capacity(n_runs);

        // long_ctr indexes the word currently being scanned; cur_word holds
        // the bits of that word that have not been consumed by a run yet.
        int long_ctr = 0;
        uint64_t cur_word = c_qua_bitset->words[0];
        while (true) {
            // Skip forward to the next word that still has a set bit.
            while (cur_word == UINT64_C(0) &&
                   long_ctr < BITSET_CONTAINER_SIZE_IN_WORDS - 1)
                cur_word = c_qua_bitset->words[++long_ctr];

            if (cur_word == UINT64_C(0)) {
                // Ran off the end without finding another run: done.
                bitset_container_free(c_qua_bitset);
                *typecode_after = RUN_CONTAINER_TYPE;
                return answer;
            }

            int local_run_start = roaring_trailing_zeroes(cur_word);
            int run_start = local_run_start + 64 * long_ctr;
            // Fill every bit below the run start with 1s, so that the first
            // 0 bit found afterwards marks the end of the run.
            uint64_t cur_word_with_1s = cur_word | (cur_word - 1);

            int run_end = 0;
            // An all-ones word means the run continues into the next word.
            while (cur_word_with_1s == UINT64_C(0xFFFFFFFFFFFFFFFF) &&
                   long_ctr < BITSET_CONTAINER_SIZE_IN_WORDS - 1)
                cur_word_with_1s = c_qua_bitset->words[++long_ctr];

            if (cur_word_with_1s == UINT64_C(0xFFFFFFFFFFFFFFFF)) {
                // The run extends to the very last bit of the container.
                run_end = 64 + long_ctr * 64;  // exclusive, I guess
                add_run(answer, run_start, run_end - 1);
                bitset_container_free(c_qua_bitset);
                *typecode_after = RUN_CONTAINER_TYPE;
                return answer;
            }
            int local_run_end = roaring_trailing_zeroes(~cur_word_with_1s);
            run_end = local_run_end + long_ctr * 64;
            add_run(answer, run_start, run_end - 1);
            // Clear the low block of 1s (the run just emitted plus the fill
            // bits) and continue with what is left of this word.
            cur_word = cur_word_with_1s & (cur_word_with_1s + 1);
        }
        return answer;
    } else {
        assert(false);
        roaring_unreachable;
        return NULL;
    }
}

/* Builds a new container holding the union of run container `run` and the
 * value range starting at `min` whose length is passed to the bitset helpers
 * as `max - min`.
 *
 * The result is built as a bitset; if its cardinality is at most
 * DEFAULT_MAX_SIZE it is converted to an array container instead.
 * *typecode_after receives the type of the returned container.
 */
container_t *container_from_run_range(
    const run_container_t *run,
    uint32_t min, uint32_t max, uint8_t *typecode_after
){
    // We expect most of the time to end up with a bitset container
    bitset_container_t *bitset = bitset_container_create();
    *typecode_after = BITSET_CONTAINER_TYPE;
    int32_t union_cardinality = 0;
    for (int32_t i = 0; i < run->n_runs; ++i) {
        uint32_t rle_min = run->runs[i].value;
        uint32_t rle_max = rle_min + run->runs[i].length;
        bitset_set_lenrange(bitset->words, rle_min, rle_max - rle_min);
        union_cardinality += run->runs[i].length + 1;
    }
    // Add the size of the range, then subtract the part of it that the runs
    // already cover, so that overlapping values are counted once.
    union_cardinality += max - min + 1;
    union_cardinality -= bitset_lenrange_cardinality(bitset->words, min, max-min);
    bitset_set_lenrange(bitset->words, min, max - min);
    bitset->cardinality = union_cardinality;
    if(bitset->cardinality <= DEFAULT_MAX_SIZE) {
        // we need to convert to an array container
        array_container_t * array = array_container_from_bitset(bitset);
        *typecode_after = ARRAY_CONTAINER_TYPE;
        bitset_container_free(bitset);
        return array;
    }
    return bitset;
}

#ifdef __cplusplus
} } }  // extern "C" { namespace roaring { namespace internal {
#endif
/* end file src/containers/convert.c */
/* begin file src/containers/mixed_andnot.c */
/*
 * mixed_andnot.c.  More methods since operation is not symmetric,
 * except no "wide" andnot , so no lazy options motivated.
 */

/* NOTE(review): the two directives below have no header name in this copy
 * of the file; it looks like their <...> arguments were lost -- restore
 * them from the original source before building. */
#include
#include


#ifdef __cplusplus
extern "C" { namespace roaring { namespace internal {
#endif

/* Compute the andnot of src_1 and src_2 and write the result to
 * dst, a valid array container that could be the same as src_1.*/
void array_bitset_container_andnot(const array_container_t *src_1,
                                   const bitset_container_t *src_2,
                                   array_container_t *dst) {
    // follows Java implementation as of June 2016
    if (dst->capacity < src_1->cardinality) {
        array_container_grow(dst, src_1->cardinality, false);
    }
    int32_t newcard = 0;
    const int32_t origcard = src_1->cardinality;
    for (int i = 0; i < origcard; ++i) {
        uint16_t key = src_1->array[i];
        // Branchless filter: the key is always stored, but the write
        // position only advances when src_2 does not contain it, so a
        // rejected key is overwritten by the next candidate. newcard never
        // exceeds i, which is what makes dst == src_1 safe.
        dst->array[newcard] = key;
        newcard += 1 - bitset_container_contains(src_2, key);
    }
    dst->cardinality = newcard;
}

/* Compute the andnot of src_1 and src_2 and write the result to
 * src_1 */

void array_bitset_container_iandnot(array_container_t *src_1,
                                    const bitset_container_t *src_2) {
    array_bitset_container_andnot(src_1, src_2, src_1);
}

/* Compute the andnot of src_1 and src_2 and write the result to
 * dst, which does not initially have a valid container.
 * Return true for a bitset result; false for array
 */

bool bitset_array_container_andnot(
    const bitset_container_t *src_1, const array_container_t *src_2,
    container_t **dst
){
    // Java did this directly, but we have option of asm or avx
    bitset_container_t *result = bitset_container_create();
    bitset_container_copy(src_1, result);
    // bitset_clear_list is handed the current cardinality and its return
    // value becomes the new cardinality.
    result->cardinality =
        (int32_t)bitset_clear_list(result->words, (uint64_t)result->cardinality,
                                   src_2->array, (uint64_t)src_2->cardinality);

    // do required type conversions.
    if (result->cardinality <= DEFAULT_MAX_SIZE) {
        *dst = array_container_from_bitset(result);
        bitset_container_free(result);
        return false;
    }
    *dst = result;
    return true;
}

/* Compute the andnot of src_1 and src_2 and write the result to
 * dst (which has no container initially).
It will modify src_1 + * to be dst if the result is a bitset. Otherwise, it will + * free src_1 and dst will be a new array container. In both + * cases, the caller is responsible for deallocating dst. + * Returns true iff dst is a bitset */ + +bool bitset_array_container_iandnot( + bitset_container_t *src_1, const array_container_t *src_2, + container_t **dst +){ + *dst = src_1; + src_1->cardinality = + (int32_t)bitset_clear_list(src_1->words, (uint64_t)src_1->cardinality, + src_2->array, (uint64_t)src_2->cardinality); + + if (src_1->cardinality <= DEFAULT_MAX_SIZE) { + *dst = array_container_from_bitset(src_1); + bitset_container_free(src_1); + return false; // not bitset + } else + return true; +} + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst. Result may be either a bitset or an array container + * (returns "result is bitset"). dst does not initially have + * any container, but becomes either a bitset container (return + * result true) or an array container. 
 */

bool run_bitset_container_andnot(
    const run_container_t *src_1, const bitset_container_t *src_2,
    container_t **dst
){
    // follows the Java implementation as of June 2016
    int card = run_container_cardinality(src_1);
    if (card <= DEFAULT_MAX_SIZE) {
        // must be an array
        // The result is a subset of src_1, so `card` entries always suffice.
        array_container_t *answer = array_container_create_given_capacity(card);
        answer->cardinality = 0;
        for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
            rle16_t rle = src_1->runs[rlepos];
            // int counter: the inclusive upper bound can be 65535, which a
            // 16-bit counter could never step past.
            for (int run_value = rle.value; run_value <= rle.value + rle.length;
                 ++run_value) {
                if (!bitset_container_get(src_2, (uint16_t)run_value)) {
                    answer->array[answer->cardinality++] = (uint16_t)run_value;
                }
            }
        }
        *dst = answer;
        return false;
    } else {  // we guess it will be a bitset, though have to check guess when
              // done
        // Start from a copy of src_2, then: clear everything outside the
        // runs of src_1 and flip everything inside them. What remains set is
        // exactly the part of src_1 that is absent from src_2.
        bitset_container_t *answer = bitset_container_clone(src_2);

        uint32_t last_pos = 0;
        for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
            rle16_t rle = src_1->runs[rlepos];

            uint32_t start = rle.value;
            uint32_t end = start + rle.length + 1;
            bitset_reset_range(answer->words, last_pos, start);
            bitset_flip_range(answer->words, start, end);
            last_pos = end;
        }
        // Clear the tail after the last run.
        bitset_reset_range(answer->words, last_pos, (uint32_t)(1 << 16));

        answer->cardinality = bitset_container_compute_cardinality(answer);

        if (answer->cardinality <= DEFAULT_MAX_SIZE) {
            *dst = array_container_from_bitset(answer);
            bitset_container_free(answer);
            return false;  // not bitset
        }
        *dst = answer;
        return true;  // bitset
    }
}

/* Compute the andnot of src_1 and src_2 and write the result to
 * dst. Result may be either a bitset or an array container
 * (returns "result is bitset"). dst does not initially have
 * any container, but becomes either a bitset container (return
 * result true) or an array container.
+ */ + +bool run_bitset_container_iandnot( + run_container_t *src_1, const bitset_container_t *src_2, + container_t **dst +){ + // dummy implementation + bool ans = run_bitset_container_andnot(src_1, src_2, dst); + run_container_free(src_1); + return ans; +} + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst. Result may be either a bitset or an array container + * (returns "result is bitset"). dst does not initially have + * any container, but becomes either a bitset container (return + * result true) or an array container. + */ + +bool bitset_run_container_andnot( + const bitset_container_t *src_1, const run_container_t *src_2, + container_t **dst +){ + // follows Java implementation + bitset_container_t *result = bitset_container_create(); + + bitset_container_copy(src_1, result); + for (int32_t rlepos = 0; rlepos < src_2->n_runs; ++rlepos) { + rle16_t rle = src_2->runs[rlepos]; + bitset_reset_range(result->words, rle.value, + rle.value + rle.length + UINT32_C(1)); + } + result->cardinality = bitset_container_compute_cardinality(result); + + if (result->cardinality <= DEFAULT_MAX_SIZE) { + *dst = array_container_from_bitset(result); + bitset_container_free(result); + return false; // not bitset + } + *dst = result; + return true; // bitset +} + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst (which has no container initially). It will modify src_1 + * to be dst if the result is a bitset. Otherwise, it will + * free src_1 and dst will be a new array container. In both + * cases, the caller is responsible for deallocating dst. 
 * Returns true iff dst is a bitset  */

bool bitset_run_container_iandnot(
    bitset_container_t *src_1, const run_container_t *src_2,
    container_t **dst
){
    *dst = src_1;

    // Clear every (inclusive) run of src_2 directly in src_1.
    for (int32_t rlepos = 0; rlepos < src_2->n_runs; ++rlepos) {
        rle16_t rle = src_2->runs[rlepos];
        bitset_reset_range(src_1->words, rle.value,
                           rle.value + rle.length + UINT32_C(1));
    }
    src_1->cardinality = bitset_container_compute_cardinality(src_1);

    if (src_1->cardinality <= DEFAULT_MAX_SIZE) {
        *dst = array_container_from_bitset(src_1);
        bitset_container_free(src_1);
        return false;  // not bitset
    } else
        return true;
}

/* helper. a_out must be a valid array container with adequate capacity.
 * Returns the cardinality of the output container. Partly Based on Java
 * implementation Util.unsignedDifference.
 *
 * Writes into a_out->array every value of rc that is not matched against an
 * entry of a_in; a_out->cardinality is NOT updated here, the caller stores
 * the returned count.
 *
 * TODO: Util.unsignedDifference does not use advanceUntil.  Is it cheaper
 * to avoid advanceUntil?
 */

static int run_array_array_subtract(const run_container_t *rc,
                                    const array_container_t *a_in,
                                    array_container_t *a_out) {
    int out_card = 0;
    int32_t in_array_pos =
        -1;  // since advanceUntil always assumes we start the search AFTER this

    for (int rlepos = 0; rlepos < rc->n_runs; rlepos++) {
        int32_t start = rc->runs[rlepos].value;
        int32_t end = start + rc->runs[rlepos].length + 1;  // exclusive

        in_array_pos = advanceUntil(a_in->array, in_array_pos,
                                    a_in->cardinality, (uint16_t)start);

        if (in_array_pos >= a_in->cardinality) {  // run has no items subtracted
            for (int32_t i = start; i < end; ++i)
                a_out->array[out_card++] = (uint16_t)i;
        } else {
            uint16_t next_nonincluded = a_in->array[in_array_pos];
            if (next_nonincluded >= end) {
                // another case when run goes unaltered
                for (int32_t i = start; i < end; ++i)
                    a_out->array[out_card++] = (uint16_t)i;
                in_array_pos--;  // ensure we see this item again if necessary
            } else {
                // Copy the run while skipping each value matched against
                // a_in; after a match, move on to the next a_in entry.
                for (int32_t i = start; i < end; ++i)
                    if (i != next_nonincluded)
                        a_out->array[out_card++] = (uint16_t)i;
                    else  // 0 should ensure  we don't match
                        next_nonincluded =
                            (in_array_pos + 1 >= a_in->cardinality)
                                ? 0
                                : a_in->array[++in_array_pos];
                in_array_pos--;  // see again
            }
        }
    }
    return out_card;
}

/* dst does not indicate a valid container initially.  Eventually it
 * can become any type of container.
 *
 * Returns the type code of the container stored in *dst. Three strategies
 * are used depending on the cardinality of src_1:
 *   - at most 32 values: run-by-run subtraction into a run container, which
 *     is then passed through convert_run_to_efficient_container();
 *   - at most DEFAULT_MAX_SIZE values: direct subtraction into an array;
 *   - otherwise: src_1 is expanded to a bitset and the array is cleared
 *     from it.
 */

int run_array_container_andnot(
    const run_container_t *src_1, const array_container_t *src_2,
    container_t **dst
){
    // follows the Java impl as of June 2016

    int card = run_container_cardinality(src_1);
    const int arbitrary_threshold = 32;

    if (card <= arbitrary_threshold) {
        if (src_2->cardinality == 0) {
            *dst = run_container_clone(src_1);
            return RUN_CONTAINER_TYPE;
        }
        // Java's "lazyandNot.toEfficientContainer" thing
        run_container_t *answer = run_container_create_given_capacity(
            card + array_container_cardinality(src_2));

        int rlepos = 0;
        int xrlepos = 0;  // "x" is src_2
        // NOTE(review): runs[0] is read without checking n_runs; presumably
        // callers never pass a run container with zero runs -- confirm.
        rle16_t rle = src_1->runs[rlepos];
        int32_t start = rle.value;
        int32_t end = start + rle.length + 1;  // exclusive
        int32_t xstart = src_2->array[xrlepos];

        // [start, end) is the not-yet-emitted remainder of the current run;
        // xstart is the current value to subtract.
        while ((rlepos < src_1->n_runs) && (xrlepos < src_2->cardinality)) {
            if (end <= xstart) {
                // output the first run
                answer->runs[answer->n_runs++] =
                    MAKE_RLE16(start, end - start - 1);
                rlepos++;
                if (rlepos < src_1->n_runs) {
                    start = src_1->runs[rlepos].value;
                    end = start + src_1->runs[rlepos].length + 1;
                }
            } else if (xstart + 1 <= start) {
                // exit the second run
                xrlepos++;
                if (xrlepos < src_2->cardinality) {
                    xstart = src_2->array[xrlepos];
                }
            } else {
                // xstart falls inside [start, end): emit the part before
                // it, then continue with the part after it.
                if (start < xstart) {
                    answer->runs[answer->n_runs++] =
                        MAKE_RLE16(start, xstart - start - 1);
                }
                if (xstart + 1 < end) {
                    start = xstart + 1;
                } else {
                    rlepos++;
                    if (rlepos < src_1->n_runs) {
                        start = src_1->runs[rlepos].value;
                        end = start + src_1->runs[rlepos].length + 1;
                    }
                }
            }
        }
        if (rlepos < src_1->n_runs) {
            // src_2 is exhausted: emit the remainder of the current run and
            // copy all later runs unchanged.
            answer->runs[answer->n_runs++] = MAKE_RLE16(start, end - start - 1);
            rlepos++;
            if (rlepos < src_1->n_runs) {
                memcpy(answer->runs + answer->n_runs, src_1->runs + rlepos,
                       (src_1->n_runs - rlepos) * sizeof(rle16_t));
                answer->n_runs += (src_1->n_runs - rlepos);
            }
        }
        uint8_t return_type;
        *dst = convert_run_to_efficient_container(answer, &return_type);
        if (answer != *dst) run_container_free(answer);
        return return_type;
    }
    // else it's a bitmap or array

    if (card <= DEFAULT_MAX_SIZE) {
        array_container_t *ac = array_container_create_given_capacity(card);
        // nb Java code used a generic iterator-based merge to compute
        // difference
        ac->cardinality = run_array_array_subtract(src_1, src_2, ac);
        *dst = ac;
        return ARRAY_CONTAINER_TYPE;
    }
    bitset_container_t *ans = bitset_container_from_run(src_1);
    bool result_is_bitset = bitset_array_container_iandnot(ans, src_2, dst);
    return (result_is_bitset ? BITSET_CONTAINER_TYPE
                             : ARRAY_CONTAINER_TYPE);
}

/* Compute the andnot of src_1 and src_2 and write the result to
 * dst (which has no container initially).  src_1 is always freed;
 * dst receives a newly allocated container of any type, which the
 * caller is responsible for deallocating.
+ * Returns true iff dst is a bitset */ + +int run_array_container_iandnot( + run_container_t *src_1, const array_container_t *src_2, + container_t **dst +){ + // dummy implementation same as June 2016 Java + int ans = run_array_container_andnot(src_1, src_2, dst); + run_container_free(src_1); + return ans; +} + +/* dst must be a valid array container, allowed to be src_1 */ + +void array_run_container_andnot(const array_container_t *src_1, + const run_container_t *src_2, + array_container_t *dst) { + // basically following Java impl as of June 2016 + if (src_1->cardinality > dst->capacity) { + array_container_grow(dst, src_1->cardinality, false); + } + + if (src_2->n_runs == 0) { + memmove(dst->array, src_1->array, + sizeof(uint16_t) * src_1->cardinality); + dst->cardinality = src_1->cardinality; + return; + } + int32_t run_start = src_2->runs[0].value; + int32_t run_end = run_start + src_2->runs[0].length; + int which_run = 0; + + uint16_t val = 0; + int dest_card = 0; + for (int i = 0; i < src_1->cardinality; ++i) { + val = src_1->array[i]; + if (val < run_start) + dst->array[dest_card++] = val; + else if (val <= run_end) { + ; // omitted item + } else { + do { + if (which_run + 1 < src_2->n_runs) { + ++which_run; + run_start = src_2->runs[which_run].value; + run_end = run_start + src_2->runs[which_run].length; + + } else + run_start = run_end = (1 << 16) + 1; + } while (val > run_end); + --i; + } + } + dst->cardinality = dest_card; +} + +/* dst does not indicate a valid container initially. Eventually it + * can become any kind of container. + */ + +void array_run_container_iandnot(array_container_t *src_1, + const run_container_t *src_2) { + array_run_container_andnot(src_1, src_2, src_1); +} + +/* dst does not indicate a valid container initially. Eventually it + * can become any kind of container. 
+ */ + +int run_run_container_andnot( + const run_container_t *src_1, const run_container_t *src_2, + container_t **dst +){ + run_container_t *ans = run_container_create(); + run_container_andnot(src_1, src_2, ans); + uint8_t typecode_after; + *dst = convert_run_to_efficient_container_and_free(ans, &typecode_after); + return typecode_after; +} + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst (which has no container initially). It will modify src_1 + * to be dst if the result is a bitset. Otherwise, it will + * free src_1 and dst will be a new array container. In both + * cases, the caller is responsible for deallocating dst. + * Returns true iff dst is a bitset */ + +int run_run_container_iandnot( + run_container_t *src_1, const run_container_t *src_2, + container_t **dst +){ + // following Java impl as of June 2016 (dummy) + int ans = run_run_container_andnot(src_1, src_2, dst); + run_container_free(src_1); + return ans; +} + +/* + * dst is a valid array container and may be the same as src_1 + */ + +void array_array_container_andnot(const array_container_t *src_1, + const array_container_t *src_2, + array_container_t *dst) { + array_container_andnot(src_1, src_2, dst); +} + +/* inplace array-array andnot will always be able to reuse the space of + * src_1 */ +void array_array_container_iandnot(array_container_t *src_1, + const array_container_t *src_2) { + array_container_andnot(src_1, src_2, src_1); +} + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst (which has no container initially). 
Return value is + * "dst is a bitset" + */ + +bool bitset_bitset_container_andnot( + const bitset_container_t *src_1, const bitset_container_t *src_2, + container_t **dst +){ + bitset_container_t *ans = bitset_container_create(); + int card = bitset_container_andnot(src_1, src_2, ans); + if (card <= DEFAULT_MAX_SIZE) { + *dst = array_container_from_bitset(ans); + bitset_container_free(ans); + return false; // not bitset + } else { + *dst = ans; + return true; + } +} + +/* Compute the andnot of src_1 and src_2 and write the result to + * dst (which has no container initially). It will modify src_1 + * to be dst if the result is a bitset. Otherwise, it will + * free src_1 and dst will be a new array container. In both + * cases, the caller is responsible for deallocating dst. + * Returns true iff dst is a bitset */ + +bool bitset_bitset_container_iandnot( + bitset_container_t *src_1, const bitset_container_t *src_2, + container_t **dst +){ + int card = bitset_container_andnot(src_1, src_2, src_1); + if (card <= DEFAULT_MAX_SIZE) { + *dst = array_container_from_bitset(src_1); + bitset_container_free(src_1); + return false; // not bitset + } else { + *dst = src_1; + return true; + } +} + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif +/* end file src/containers/mixed_andnot.c */ +/* begin file src/containers/mixed_equal.c */ + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +bool array_container_equal_bitset(const array_container_t* container1, + const bitset_container_t* container2) { + if (container2->cardinality != BITSET_UNKNOWN_CARDINALITY) { + if (container2->cardinality != container1->cardinality) { + return false; + } + } + int32_t pos = 0; + for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i) { + uint64_t w = container2->words[i]; + while (w != 0) { + uint64_t t = w & (~w + 1); + uint16_t r = i * 64 + roaring_trailing_zeroes(w); + if (pos >= container1->cardinality) { + 
return false; + } + if (container1->array[pos] != r) { + return false; + } + ++pos; + w ^= t; + } + } + return (pos == container1->cardinality); +} + +bool run_container_equals_array(const run_container_t* container1, + const array_container_t* container2) { + if (run_container_cardinality(container1) != container2->cardinality) + return false; + int32_t pos = 0; + for (int i = 0; i < container1->n_runs; ++i) { + const uint32_t run_start = container1->runs[i].value; + const uint32_t le = container1->runs[i].length; + + if (container2->array[pos] != run_start) { + return false; + } + + if (container2->array[pos + le] != run_start + le) { + return false; + } + + pos += le + 1; + } + return true; +} + +bool run_container_equals_bitset(const run_container_t* container1, + const bitset_container_t* container2) { + + int run_card = run_container_cardinality(container1); + int bitset_card = (container2->cardinality != BITSET_UNKNOWN_CARDINALITY) ? + container2->cardinality : + bitset_container_compute_cardinality(container2); + if (bitset_card != run_card) { + return false; + } + + for (int32_t i = 0; i < container1->n_runs; i++) { + uint32_t begin = container1->runs[i].value; + if (container1->runs[i].length) { + uint32_t end = begin + container1->runs[i].length + 1; + if (!bitset_container_contains_range(container2, begin, end)) { + return false; + } + } else { + if (!bitset_container_contains(container2, begin)) { + return false; + } + } + } + + return true; +} + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif +/* end file src/containers/mixed_equal.c */ +/* begin file src/containers/mixed_intersection.c */ +/* + * mixed_intersection.c + * + */ + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +/* Compute the intersection of src_1 and src_2 and write the result to + * dst. 
*/ +void array_bitset_container_intersection(const array_container_t *src_1, + const bitset_container_t *src_2, + array_container_t *dst) { + if (dst->capacity < src_1->cardinality) { + array_container_grow(dst, src_1->cardinality, false); + } + int32_t newcard = 0; // dst could be src_1 + const int32_t origcard = src_1->cardinality; + for (int i = 0; i < origcard; ++i) { + uint16_t key = src_1->array[i]; + // this branchless approach is much faster... + dst->array[newcard] = key; + newcard += bitset_container_contains(src_2, key); + /** + * we could do it this way instead... + * if (bitset_container_contains(src_2, key)) { + * dst->array[newcard++] = key; + * } + * but if the result is unpredictible, the processor generates + * many mispredicted branches. + * Difference can be huge (from 3 cycles when predictible all the way + * to 16 cycles when unpredictible. + * See + * https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/extra/bitset/c/arraybitsetintersection.c + */ + } + dst->cardinality = newcard; +} + +/* Compute the size of the intersection of src_1 and src_2. */ +int array_bitset_container_intersection_cardinality( + const array_container_t *src_1, const bitset_container_t *src_2) { + int32_t newcard = 0; + const int32_t origcard = src_1->cardinality; + for (int i = 0; i < origcard; ++i) { + uint16_t key = src_1->array[i]; + newcard += bitset_container_contains(src_2, key); + } + return newcard; +} + + +bool array_bitset_container_intersect(const array_container_t *src_1, + const bitset_container_t *src_2) { + const int32_t origcard = src_1->cardinality; + for (int i = 0; i < origcard; ++i) { + uint16_t key = src_1->array[i]; + if(bitset_container_contains(src_2, key)) return true; + } + return false; +} + +/* Compute the intersection of src_1 and src_2 and write the result to + * dst. It is allowed for dst to be equal to src_1. We assume that dst is a + * valid container. 
*/ +void array_run_container_intersection(const array_container_t *src_1, + const run_container_t *src_2, + array_container_t *dst) { + if (run_container_is_full(src_2)) { + if (dst != src_1) array_container_copy(src_1, dst); + return; + } + if (dst->capacity < src_1->cardinality) { + array_container_grow(dst, src_1->cardinality, false); + } + if (src_2->n_runs == 0) { + return; + } + int32_t rlepos = 0; + int32_t arraypos = 0; + rle16_t rle = src_2->runs[rlepos]; + int32_t newcard = 0; + while (arraypos < src_1->cardinality) { + const uint16_t arrayval = src_1->array[arraypos]; + while (rle.value + rle.length < + arrayval) { // this will frequently be false + ++rlepos; + if (rlepos == src_2->n_runs) { + dst->cardinality = newcard; + return; // we are done + } + rle = src_2->runs[rlepos]; + } + if (rle.value > arrayval) { + arraypos = advanceUntil(src_1->array, arraypos, src_1->cardinality, + rle.value); + } else { + dst->array[newcard] = arrayval; + newcard++; + arraypos++; + } + } + dst->cardinality = newcard; +} + +/* Compute the intersection of src_1 and src_2 and write the result to + * *dst. If the result is true then the result is a bitset_container_t + * otherwise is a array_container_t. 
If *dst == src_2, an in-place processing + * is attempted.*/ +bool run_bitset_container_intersection( + const run_container_t *src_1, const bitset_container_t *src_2, + container_t **dst +){ + if (run_container_is_full(src_1)) { + if (*dst != src_2) *dst = bitset_container_clone(src_2); + return true; + } + int32_t card = run_container_cardinality(src_1); + if (card <= DEFAULT_MAX_SIZE) { + // result can only be an array (assuming that we never make a + // RunContainer) + if (card > src_2->cardinality) { + card = src_2->cardinality; + } + array_container_t *answer = array_container_create_given_capacity(card); + *dst = answer; + if (*dst == NULL) { + return false; + } + for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) { + rle16_t rle = src_1->runs[rlepos]; + uint32_t endofrun = (uint32_t)rle.value + rle.length; + for (uint32_t runValue = rle.value; runValue <= endofrun; + ++runValue) { + answer->array[answer->cardinality] = (uint16_t)runValue; + answer->cardinality += + bitset_container_contains(src_2, runValue); + } + } + return false; + } + if (*dst == src_2) { // we attempt in-place + bitset_container_t *answer = CAST_bitset(*dst); + uint32_t start = 0; + for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) { + const rle16_t rle = src_1->runs[rlepos]; + uint32_t end = rle.value; + bitset_reset_range(src_2->words, start, end); + + start = end + rle.length + 1; + } + bitset_reset_range(src_2->words, start, UINT32_C(1) << 16); + answer->cardinality = bitset_container_compute_cardinality(answer); + if (src_2->cardinality > DEFAULT_MAX_SIZE) { + return true; + } else { + array_container_t *newanswer = array_container_from_bitset(src_2); + if (newanswer == NULL) { + *dst = NULL; + return false; + } + *dst = newanswer; + return false; + } + } else { // no inplace + // we expect the answer to be a bitmap (if we are lucky) + bitset_container_t *answer = bitset_container_clone(src_2); + + *dst = answer; + if (answer == NULL) { + return true; + } + uint32_t 
start = 0; + for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) { + const rle16_t rle = src_1->runs[rlepos]; + uint32_t end = rle.value; + bitset_reset_range(answer->words, start, end); + start = end + rle.length + 1; + } + bitset_reset_range(answer->words, start, UINT32_C(1) << 16); + answer->cardinality = bitset_container_compute_cardinality(answer); + + if (answer->cardinality > DEFAULT_MAX_SIZE) { + return true; + } else { + array_container_t *newanswer = array_container_from_bitset(answer); + bitset_container_free(CAST_bitset(*dst)); + if (newanswer == NULL) { + *dst = NULL; + return false; + } + *dst = newanswer; + return false; + } + } +} + +/* Compute the size of the intersection between src_1 and src_2 . */ +int array_run_container_intersection_cardinality(const array_container_t *src_1, + const run_container_t *src_2) { + if (run_container_is_full(src_2)) { + return src_1->cardinality; + } + if (src_2->n_runs == 0) { + return 0; + } + int32_t rlepos = 0; + int32_t arraypos = 0; + rle16_t rle = src_2->runs[rlepos]; + int32_t newcard = 0; + while (arraypos < src_1->cardinality) { + const uint16_t arrayval = src_1->array[arraypos]; + while (rle.value + rle.length < + arrayval) { // this will frequently be false + ++rlepos; + if (rlepos == src_2->n_runs) { + return newcard; // we are done + } + rle = src_2->runs[rlepos]; + } + if (rle.value > arrayval) { + arraypos = advanceUntil(src_1->array, arraypos, src_1->cardinality, + rle.value); + } else { + newcard++; + arraypos++; + } + } + return newcard; +} + +/* Compute the intersection between src_1 and src_2 + **/ +int run_bitset_container_intersection_cardinality( + const run_container_t *src_1, const bitset_container_t *src_2) { + if (run_container_is_full(src_1)) { + return bitset_container_cardinality(src_2); + } + int answer = 0; + for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) { + rle16_t rle = src_1->runs[rlepos]; + answer += + bitset_lenrange_cardinality(src_2->words, rle.value, 
rle.length); + } + return answer; +} + + +bool array_run_container_intersect(const array_container_t *src_1, + const run_container_t *src_2) { + if( run_container_is_full(src_2) ) { + return !array_container_empty(src_1); + } + if (src_2->n_runs == 0) { + return false; + } + int32_t rlepos = 0; + int32_t arraypos = 0; + rle16_t rle = src_2->runs[rlepos]; + while (arraypos < src_1->cardinality) { + const uint16_t arrayval = src_1->array[arraypos]; + while (rle.value + rle.length < + arrayval) { // this will frequently be false + ++rlepos; + if (rlepos == src_2->n_runs) { + return false; // we are done + } + rle = src_2->runs[rlepos]; + } + if (rle.value > arrayval) { + arraypos = advanceUntil(src_1->array, arraypos, src_1->cardinality, + rle.value); + } else { + return true; + } + } + return false; +} + +/* Compute the intersection between src_1 and src_2 + **/ +bool run_bitset_container_intersect(const run_container_t *src_1, + const bitset_container_t *src_2) { + if( run_container_is_full(src_1) ) { + return !bitset_container_empty(src_2); + } + for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) { + rle16_t rle = src_1->runs[rlepos]; + if(!bitset_lenrange_empty(src_2->words, rle.value,rle.length)) return true; + } + return false; +} + +/* + * Compute the intersection between src_1 and src_2 and write the result + * to *dst. If the return function is true, the result is a bitset_container_t + * otherwise is a array_container_t. 
+ */ +bool bitset_bitset_container_intersection( + const bitset_container_t *src_1, const bitset_container_t *src_2, + container_t **dst +){ + const int newCardinality = bitset_container_and_justcard(src_1, src_2); + if (newCardinality > DEFAULT_MAX_SIZE) { + *dst = bitset_container_create(); + if (*dst != NULL) { + bitset_container_and_nocard(src_1, src_2, CAST_bitset(*dst)); + CAST_bitset(*dst)->cardinality = newCardinality; + } + return true; // it is a bitset + } + *dst = array_container_create_given_capacity(newCardinality); + if (*dst != NULL) { + CAST_array(*dst)->cardinality = newCardinality; + bitset_extract_intersection_setbits_uint16( + src_1->words, src_2->words, BITSET_CONTAINER_SIZE_IN_WORDS, + CAST_array(*dst)->array, 0); + } + return false; // not a bitset +} + +bool bitset_bitset_container_intersection_inplace( + bitset_container_t *src_1, const bitset_container_t *src_2, + container_t **dst +){ + const int newCardinality = bitset_container_and_justcard(src_1, src_2); + if (newCardinality > DEFAULT_MAX_SIZE) { + *dst = src_1; + bitset_container_and_nocard(src_1, src_2, src_1); + CAST_bitset(*dst)->cardinality = newCardinality; + return true; // it is a bitset + } + *dst = array_container_create_given_capacity(newCardinality); + if (*dst != NULL) { + CAST_array(*dst)->cardinality = newCardinality; + bitset_extract_intersection_setbits_uint16( + src_1->words, src_2->words, BITSET_CONTAINER_SIZE_IN_WORDS, + CAST_array(*dst)->array, 0); + } + return false; // not a bitset +} + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif +/* end file src/containers/mixed_intersection.c */ +/* begin file src/containers/mixed_negation.c */ +/* + * mixed_negation.c + * + */ + +#include +#include + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +// TODO: make simplified and optimized negation code across +// the full range. + +/* Negation across the entire range of the container. 
+ * Compute the negation of src and write the result + * to *dst. The complement of a + * sufficiently sparse set will always be dense and a hence a bitmap +' * We assume that dst is pre-allocated and a valid bitset container + * There can be no in-place version. + */ +void array_container_negation(const array_container_t *src, + bitset_container_t *dst) { + uint64_t card = UINT64_C(1 << 16); + bitset_container_set_all(dst); + + if (src->cardinality == 0) { + return; + } + + dst->cardinality = (int32_t)bitset_clear_list(dst->words, card, src->array, + (uint64_t)src->cardinality); +} + +/* Negation across the entire range of the container + * Compute the negation of src and write the result + * to *dst. A true return value indicates a bitset result, + * otherwise the result is an array container. + * We assume that dst is not pre-allocated. In + * case of failure, *dst will be NULL. + */ +bool bitset_container_negation( + const bitset_container_t *src, container_t **dst +){ + return bitset_container_negation_range(src, 0, (1 << 16), dst); +} + +/* inplace version */ +/* + * Same as bitset_container_negation except that if the output is to + * be a + * bitset_container_t, then src is modified and no allocation is made. + * If the output is to be an array_container_t, then caller is responsible + * to free the container. + * In all cases, the result is in *dst. + */ +bool bitset_container_negation_inplace( + bitset_container_t *src, container_t **dst +){ + return bitset_container_negation_range_inplace(src, 0, (1 << 16), dst); +} + +/* Negation across the entire range of container + * Compute the negation of src and write the result + * to *dst. Return values are the *_TYPECODES as defined * in containers.h + * We assume that dst is not pre-allocated. In + * case of failure, *dst will be NULL. 
+ */ +int run_container_negation(const run_container_t *src, container_t **dst) { + return run_container_negation_range(src, 0, (1 << 16), dst); +} + +/* + * Same as run_container_negation except that if the output is to + * be a + * run_container_t, and has the capacity to hold the result, + * then src is modified and no allocation is made. + * In all cases, the result is in *dst. + */ +int run_container_negation_inplace(run_container_t *src, container_t **dst) { + return run_container_negation_range_inplace(src, 0, (1 << 16), dst); +} + +/* Negation across a range of the container. + * Compute the negation of src and write the result + * to *dst. Returns true if the result is a bitset container + * and false for an array container. *dst is not preallocated. + */ +bool array_container_negation_range( + const array_container_t *src, + const int range_start, const int range_end, + container_t **dst +){ + /* close port of the Java implementation */ + if (range_start >= range_end) { + *dst = array_container_clone(src); + return false; + } + + int32_t start_index = + binarySearch(src->array, src->cardinality, (uint16_t)range_start); + if (start_index < 0) start_index = -start_index - 1; + + int32_t last_index = + binarySearch(src->array, src->cardinality, (uint16_t)(range_end - 1)); + if (last_index < 0) last_index = -last_index - 2; + + const int32_t current_values_in_range = last_index - start_index + 1; + const int32_t span_to_be_flipped = range_end - range_start; + const int32_t new_values_in_range = + span_to_be_flipped - current_values_in_range; + const int32_t cardinality_change = + new_values_in_range - current_values_in_range; + const int32_t new_cardinality = src->cardinality + cardinality_change; + + if (new_cardinality > DEFAULT_MAX_SIZE) { + bitset_container_t *temp = bitset_container_from_array(src); + bitset_flip_range(temp->words, (uint32_t)range_start, + (uint32_t)range_end); + temp->cardinality = new_cardinality; + *dst = temp; + return true; + } + + 
array_container_t *arr = + array_container_create_given_capacity(new_cardinality); + *dst = (container_t *)arr; + if(new_cardinality == 0) { + arr->cardinality = new_cardinality; + return false; // we are done. + } + // copy stuff before the active area + memcpy(arr->array, src->array, start_index * sizeof(uint16_t)); + + // work on the range + int32_t out_pos = start_index, in_pos = start_index; + int32_t val_in_range = range_start; + for (; val_in_range < range_end && in_pos <= last_index; ++val_in_range) { + if ((uint16_t)val_in_range != src->array[in_pos]) { + arr->array[out_pos++] = (uint16_t)val_in_range; + } else { + ++in_pos; + } + } + for (; val_in_range < range_end; ++val_in_range) + arr->array[out_pos++] = (uint16_t)val_in_range; + + // content after the active range + memcpy(arr->array + out_pos, src->array + (last_index + 1), + (src->cardinality - (last_index + 1)) * sizeof(uint16_t)); + arr->cardinality = new_cardinality; + return false; +} + +/* Even when the result would fit, it is unclear how to make an + * inplace version without inefficient copying. + */ + +bool array_container_negation_range_inplace( + array_container_t *src, + const int range_start, const int range_end, + container_t **dst +){ + bool ans = array_container_negation_range(src, range_start, range_end, dst); + // TODO : try a real inplace version + array_container_free(src); + return ans; +} + +/* Negation across a range of the container + * Compute the negation of src and write the result + * to *dst. A true return value indicates a bitset result, + * otherwise the result is an array container. + * We assume that dst is not pre-allocated. In + * case of failure, *dst will be NULL. + */ +bool bitset_container_negation_range( + const bitset_container_t *src, + const int range_start, const int range_end, + container_t **dst +){ + // TODO maybe consider density-based estimate + // and sometimes build result directly as array, with + // conversion back to bitset if wrong. 
Or determine + // actual result cardinality, then go directly for the known final cont. + + // keep computation using bitsets as long as possible. + bitset_container_t *t = bitset_container_clone(src); + bitset_flip_range(t->words, (uint32_t)range_start, (uint32_t)range_end); + t->cardinality = bitset_container_compute_cardinality(t); + + if (t->cardinality > DEFAULT_MAX_SIZE) { + *dst = t; + return true; + } else { + *dst = array_container_from_bitset(t); + bitset_container_free(t); + return false; + } +} + +/* inplace version */ +/* + * Same as bitset_container_negation except that if the output is to + * be a + * bitset_container_t, then src is modified and no allocation is made. + * If the output is to be an array_container_t, then caller is responsible + * to free the container. + * In all cases, the result is in *dst. + */ +bool bitset_container_negation_range_inplace( + bitset_container_t *src, + const int range_start, const int range_end, + container_t **dst +){ + bitset_flip_range(src->words, (uint32_t)range_start, (uint32_t)range_end); + src->cardinality = bitset_container_compute_cardinality(src); + if (src->cardinality > DEFAULT_MAX_SIZE) { + *dst = src; + return true; + } + *dst = array_container_from_bitset(src); + bitset_container_free(src); + return false; +} + +/* Negation across a range of container + * Compute the negation of src and write the result + * to *dst. Return values are the *_TYPECODES as defined * in containers.h + * We assume that dst is not pre-allocated. In + * case of failure, *dst will be NULL. 
 */
int run_container_negation_range(
    const run_container_t *src,
    const int range_start, const int range_end,
    container_t **dst
){
    uint8_t return_typecode;

    // follows the Java implementation
    // An empty range flips nothing: the result is a copy of src.
    if (range_end <= range_start) {
        *dst = run_container_clone(src);
        return RUN_CONTAINER_TYPE;
    }

    // The working container gets room for one run more than src has.
    run_container_t *ans = run_container_create_given_capacity(
        src->n_runs + 1);  // src->n_runs + 1);
    int k = 0;
    // Runs that start before the range are copied as they are.
    for (; k < src->n_runs && src->runs[k].value < range_start; ++k) {
        ans->runs[k] = src->runs[k];
        ans->n_runs++;
    }

    // Append the range itself as one run (its length field is width - 1).
    // NOTE(review): the helper presumably merges the appended run with the
    // tail of ans using exclusive-or semantics - confirm in run.h.
    run_container_smart_append_exclusive(
        ans, (uint16_t)range_start, (uint16_t)(range_end - range_start - 1));

    // Then append the remaining runs of src through the same helper.
    for (; k < src->n_runs; ++k) {
        run_container_smart_append_exclusive(ans, src->runs[k].value,
                                             src->runs[k].length);
    }

    *dst = convert_run_to_efficient_container(ans, &return_typecode);
    // When the conversion produced a non-run container, the working run
    // container is no longer needed.
    if (return_typecode != RUN_CONTAINER_TYPE) run_container_free(ans);

    return return_typecode;
}

/*
 * Same as run_container_negation except that if the output is to
 * be a
 * run_container_t, and has the capacity to hold the result,
 * then src is modified and no allocation is made.
 * In all cases, the result is in *dst.
 */
int run_container_negation_range_inplace(
    run_container_t *src,
    const int range_start, const int range_end,
    container_t **dst
){
    uint8_t return_typecode;

    // An empty range flips nothing: src itself is the result.
    if (range_end <= range_start) {
        *dst = src;
        return RUN_CONTAINER_TYPE;
    }

    // TODO: efficient special case when range is 0 to 65535 inclusive

    if (src->capacity == src->n_runs) {
        // no excess room.  More checking to see if result can fit
        bool last_val_before_range = false;
        bool first_val_in_range = false;
        bool last_val_in_range = false;
        bool first_val_past_range = false;

        // Membership just outside and just inside each end of the range.
        if (range_start > 0)
            last_val_before_range =
                run_container_contains(src, (uint16_t)(range_start - 1));
        first_val_in_range = run_container_contains(src, (uint16_t)range_start);

        if (last_val_before_range == first_val_in_range) {
            last_val_in_range =
                run_container_contains(src, (uint16_t)(range_end - 1));
            if (range_end != 0x10000)
                first_val_past_range =
                    run_container_contains(src, (uint16_t)range_end);

            if (last_val_in_range ==
                first_val_past_range) {  // no space for inplace
                // Fall back to the allocating version and release src.
                int ans = run_container_negation_range(src, range_start,
                                                       range_end, dst);
                run_container_free(src);
                return ans;
            }
        }
    }
    // all other cases: result will fit

    // ans aliases src: the run array is rewritten in place, so the original
    // run count is saved before n_runs is reset.
    run_container_t *ans = src;
    int my_nbr_runs = src->n_runs;

    ans->n_runs = 0;
    int k = 0;
    // Runs that start before the range stay where they are; only the count
    // is restored.
    for (; (k < my_nbr_runs) && (src->runs[k].value < range_start); ++k) {
        // ans->runs[k] = src->runs[k]; (would be self-copy)
        ans->n_runs++;
    }

    // as with Java implementation, use locals to give self a buffer of depth 1
    rle16_t buffered = MAKE_RLE16(0, 0);
    rle16_t next = buffered;
    // Save run k before the append below can overwrite its slot.
    if (k < my_nbr_runs) buffered = src->runs[k];

    run_container_smart_append_exclusive(
        ans, (uint16_t)range_start, (uint16_t)(range_end - range_start - 1));

    for (; k < my_nbr_runs; ++k) {
        // Read the following run first: appending the buffered one may
        // write into the slot it currently occupies.
        if (k + 1 < my_nbr_runs) next = src->runs[k + 1];

        run_container_smart_append_exclusive(ans, buffered.value,
                                             buffered.length);
        buffered = next;
    }

    *dst = convert_run_to_efficient_container(ans, &return_typecode);
    // When the conversion produced a non-run container, the run container
    // (src) is released.
    if (return_typecode != RUN_CONTAINER_TYPE) run_container_free(ans);

    return return_typecode;
}

#ifdef __cplusplus
} } }  // extern "C" { namespace roaring { namespace internal {
#endif
/* end file src/containers/mixed_negation.c */
/* begin file src/containers/mixed_subset.c */

#ifdef __cplusplus
extern "C"
{ namespace roaring { namespace internal { +#endif + +bool array_container_is_subset_bitset(const array_container_t* container1, + const bitset_container_t* container2) { + if (container2->cardinality != BITSET_UNKNOWN_CARDINALITY) { + if (container2->cardinality < container1->cardinality) { + return false; + } + } + for (int i = 0; i < container1->cardinality; ++i) { + if (!bitset_container_contains(container2, container1->array[i])) { + return false; + } + } + return true; +} + +bool run_container_is_subset_array(const run_container_t* container1, + const array_container_t* container2) { + if (run_container_cardinality(container1) > container2->cardinality) + return false; + int32_t start_pos = -1, stop_pos = -1; + for (int i = 0; i < container1->n_runs; ++i) { + int32_t start = container1->runs[i].value; + int32_t stop = start + container1->runs[i].length; + start_pos = advanceUntil(container2->array, stop_pos, + container2->cardinality, start); + stop_pos = advanceUntil(container2->array, stop_pos, + container2->cardinality, stop); + if (stop_pos == container2->cardinality) { + return false; + } else if (stop_pos - start_pos != stop - start || + container2->array[start_pos] != start || + container2->array[stop_pos] != stop) { + return false; + } + } + return true; +} + +bool array_container_is_subset_run(const array_container_t* container1, + const run_container_t* container2) { + if (container1->cardinality > run_container_cardinality(container2)) + return false; + int i_array = 0, i_run = 0; + while (i_array < container1->cardinality && i_run < container2->n_runs) { + uint32_t start = container2->runs[i_run].value; + uint32_t stop = start + container2->runs[i_run].length; + if (container1->array[i_array] < start) { + return false; + } else if (container1->array[i_array] > stop) { + i_run++; + } else { // the value of the array is in the run + i_array++; + } + } + if (i_array == container1->cardinality) { + return true; + } else { + return false; + } +} + 
+bool run_container_is_subset_bitset(const run_container_t* container1, + const bitset_container_t* container2) { + // todo: this code could be much faster + if (container2->cardinality != BITSET_UNKNOWN_CARDINALITY) { + if (container2->cardinality < run_container_cardinality(container1)) { + return false; + } + } else { + int32_t card = bitset_container_compute_cardinality( + container2); // modify container2? + if (card < run_container_cardinality(container1)) { + return false; + } + } + for (int i = 0; i < container1->n_runs; ++i) { + uint32_t run_start = container1->runs[i].value; + uint32_t le = container1->runs[i].length; + for (uint32_t j = run_start; j <= run_start + le; ++j) { + if (!bitset_container_contains(container2, j)) { + return false; + } + } + } + return true; +} + +bool bitset_container_is_subset_run(const bitset_container_t* container1, + const run_container_t* container2) { + // todo: this code could be much faster + if (container1->cardinality != BITSET_UNKNOWN_CARDINALITY) { + if (container1->cardinality > run_container_cardinality(container2)) { + return false; + } + } + int32_t i_bitset = 0, i_run = 0; + while (i_bitset < BITSET_CONTAINER_SIZE_IN_WORDS && + i_run < container2->n_runs) { + uint64_t w = container1->words[i_bitset]; + while (w != 0 && i_run < container2->n_runs) { + uint32_t start = container2->runs[i_run].value; + uint32_t stop = start + container2->runs[i_run].length; + uint64_t t = w & (~w + 1); + uint16_t r = i_bitset * 64 + roaring_trailing_zeroes(w); + if (r < start) { + return false; + } else if (r > stop) { + i_run++; + continue; + } else { + w ^= t; + } + } + if (w == 0) { + i_bitset++; + } else { + return false; + } + } + if (i_bitset < BITSET_CONTAINER_SIZE_IN_WORDS) { + // terminated iterating on the run containers, check that rest of bitset + // is empty + for (; i_bitset < BITSET_CONTAINER_SIZE_IN_WORDS; i_bitset++) { + if (container1->words[i_bitset] != 0) { + return false; + } + } + } + return true; +} + 
+#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif +/* end file src/containers/mixed_subset.c */ +/* begin file src/containers/mixed_union.c */ +/* + * mixed_union.c + * + */ + +#include +#include + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +/* Compute the union of src_1 and src_2 and write the result to + * dst. */ +void array_bitset_container_union(const array_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst) { + if (src_2 != dst) bitset_container_copy(src_2, dst); + dst->cardinality = (int32_t)bitset_set_list_withcard( + dst->words, dst->cardinality, src_1->array, src_1->cardinality); +} + +/* Compute the union of src_1 and src_2 and write the result to + * dst. It is allowed for src_2 to be dst. This version does not + * update the cardinality of dst (it is set to BITSET_UNKNOWN_CARDINALITY). */ +void array_bitset_container_lazy_union(const array_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst) { + if (src_2 != dst) bitset_container_copy(src_2, dst); + bitset_set_list(dst->words, src_1->array, src_1->cardinality); + dst->cardinality = BITSET_UNKNOWN_CARDINALITY; +} + +void run_bitset_container_union(const run_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst) { + assert(!run_container_is_full(src_1)); // catch this case upstream + if (src_2 != dst) bitset_container_copy(src_2, dst); + for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) { + rle16_t rle = src_1->runs[rlepos]; + bitset_set_lenrange(dst->words, rle.value, rle.length); + } + dst->cardinality = bitset_container_compute_cardinality(dst); +} + +void run_bitset_container_lazy_union(const run_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst) { + assert(!run_container_is_full(src_1)); // catch this case upstream + if (src_2 != dst) bitset_container_copy(src_2, dst); + for (int32_t rlepos 
= 0; rlepos < src_1->n_runs; ++rlepos) { + rle16_t rle = src_1->runs[rlepos]; + bitset_set_lenrange(dst->words, rle.value, rle.length); + } + dst->cardinality = BITSET_UNKNOWN_CARDINALITY; +} + +// why do we leave the result as a run container?? +void array_run_container_union(const array_container_t *src_1, + const run_container_t *src_2, + run_container_t *dst) { + if (run_container_is_full(src_2)) { + run_container_copy(src_2, dst); + return; + } + // TODO: see whether the "2*" is spurious + run_container_grow(dst, 2 * (src_1->cardinality + src_2->n_runs), false); + int32_t rlepos = 0; + int32_t arraypos = 0; + rle16_t previousrle; + if (src_2->runs[rlepos].value <= src_1->array[arraypos]) { + previousrle = run_container_append_first(dst, src_2->runs[rlepos]); + rlepos++; + } else { + previousrle = + run_container_append_value_first(dst, src_1->array[arraypos]); + arraypos++; + } + while ((rlepos < src_2->n_runs) && (arraypos < src_1->cardinality)) { + if (src_2->runs[rlepos].value <= src_1->array[arraypos]) { + run_container_append(dst, src_2->runs[rlepos], &previousrle); + rlepos++; + } else { + run_container_append_value(dst, src_1->array[arraypos], + &previousrle); + arraypos++; + } + } + if (arraypos < src_1->cardinality) { + while (arraypos < src_1->cardinality) { + run_container_append_value(dst, src_1->array[arraypos], + &previousrle); + arraypos++; + } + } else { + while (rlepos < src_2->n_runs) { + run_container_append(dst, src_2->runs[rlepos], &previousrle); + rlepos++; + } + } +} + +void array_run_container_inplace_union(const array_container_t *src_1, + run_container_t *src_2) { + if (run_container_is_full(src_2)) { + return; + } + const int32_t maxoutput = src_1->cardinality + src_2->n_runs; + const int32_t neededcapacity = maxoutput + src_2->n_runs; + if (src_2->capacity < neededcapacity) + run_container_grow(src_2, neededcapacity, true); + memmove(src_2->runs + maxoutput, src_2->runs, + src_2->n_runs * sizeof(rle16_t)); + rle16_t *inputsrc2 = 
src_2->runs + maxoutput; + int32_t rlepos = 0; + int32_t arraypos = 0; + int src2nruns = src_2->n_runs; + src_2->n_runs = 0; + + rle16_t previousrle; + + if (inputsrc2[rlepos].value <= src_1->array[arraypos]) { + previousrle = run_container_append_first(src_2, inputsrc2[rlepos]); + rlepos++; + } else { + previousrle = + run_container_append_value_first(src_2, src_1->array[arraypos]); + arraypos++; + } + + while ((rlepos < src2nruns) && (arraypos < src_1->cardinality)) { + if (inputsrc2[rlepos].value <= src_1->array[arraypos]) { + run_container_append(src_2, inputsrc2[rlepos], &previousrle); + rlepos++; + } else { + run_container_append_value(src_2, src_1->array[arraypos], + &previousrle); + arraypos++; + } + } + if (arraypos < src_1->cardinality) { + while (arraypos < src_1->cardinality) { + run_container_append_value(src_2, src_1->array[arraypos], + &previousrle); + arraypos++; + } + } else { + while (rlepos < src2nruns) { + run_container_append(src_2, inputsrc2[rlepos], &previousrle); + rlepos++; + } + } +} + +bool array_array_container_union( + const array_container_t *src_1, const array_container_t *src_2, + container_t **dst +){ + int totalCardinality = src_1->cardinality + src_2->cardinality; + if (totalCardinality <= DEFAULT_MAX_SIZE) { + *dst = array_container_create_given_capacity(totalCardinality); + if (*dst != NULL) { + array_container_union(src_1, src_2, CAST_array(*dst)); + } else { + return true; // otherwise failure won't be caught + } + return false; // not a bitset + } + *dst = bitset_container_create(); + bool returnval = true; // expect a bitset + if (*dst != NULL) { + bitset_container_t *ourbitset = CAST_bitset(*dst); + bitset_set_list(ourbitset->words, src_1->array, src_1->cardinality); + ourbitset->cardinality = (int32_t)bitset_set_list_withcard( + ourbitset->words, src_1->cardinality, src_2->array, + src_2->cardinality); + if (ourbitset->cardinality <= DEFAULT_MAX_SIZE) { + // need to convert! 
+ *dst = array_container_from_bitset(ourbitset); + bitset_container_free(ourbitset); + returnval = false; // not going to be a bitset + } + } + return returnval; +} + +bool array_array_container_inplace_union( + array_container_t *src_1, const array_container_t *src_2, + container_t **dst +){ + int totalCardinality = src_1->cardinality + src_2->cardinality; + *dst = NULL; + if (totalCardinality <= DEFAULT_MAX_SIZE) { + if(src_1->capacity < totalCardinality) { + *dst = array_container_create_given_capacity(2 * totalCardinality); // be purposefully generous + if (*dst != NULL) { + array_container_union(src_1, src_2, CAST_array(*dst)); + } else { + return true; // otherwise failure won't be caught + } + return false; // not a bitset + } else { + memmove(src_1->array + src_2->cardinality, src_1->array, src_1->cardinality * sizeof(uint16_t)); + /* + Next line is safe: + + We just need to focus on the reading and writing performed on array1. In `union_vector16`, both vectorized and scalar code still obey the basic rule: read from two inputs, do the union, and then write the output. + + Let's say the length(cardinality) of input2 is L2: + ``` + |<- L2 ->| + array1: [output--- |input 1---|---] + array2: [input 2---] + ``` + Let's define 3 __m128i pointers, `pos1` starts from `input1`, `pos2` starts from `input2`, these 2 point at the next byte to read, `out` starts from `output`, pointing at the next byte to overwrite. + ``` + array1: [output--- |input 1---|---] + ^ ^ + out pos1 + array2: [input 2---] + ^ + pos2 + ``` + The union output always contains less or equal number of elements than all inputs added, so we have: + ``` + out <= pos1 + pos2 + ``` + therefore: + ``` + out <= pos1 + L2 + ``` + which means you will not overwrite data beyond pos1, so the data haven't read is safe, and we don't care the data already read. 
+ */ + src_1->cardinality = (int32_t)fast_union_uint16(src_1->array + src_2->cardinality, src_1->cardinality, + src_2->array, src_2->cardinality, src_1->array); + return false; // not a bitset + } + } + *dst = bitset_container_create(); + bool returnval = true; // expect a bitset + if (*dst != NULL) { + bitset_container_t *ourbitset = CAST_bitset(*dst); + bitset_set_list(ourbitset->words, src_1->array, src_1->cardinality); + ourbitset->cardinality = (int32_t)bitset_set_list_withcard( + ourbitset->words, src_1->cardinality, src_2->array, + src_2->cardinality); + if (ourbitset->cardinality <= DEFAULT_MAX_SIZE) { + // need to convert! + if(src_1->capacity < ourbitset->cardinality) { + array_container_grow(src_1, ourbitset->cardinality, false); + } + + bitset_extract_setbits_uint16(ourbitset->words, BITSET_CONTAINER_SIZE_IN_WORDS, + src_1->array, 0); + src_1->cardinality = ourbitset->cardinality; + *dst = src_1; + bitset_container_free(ourbitset); + returnval = false; // not going to be a bitset + } + } + return returnval; +} + + +bool array_array_container_lazy_union( + const array_container_t *src_1, const array_container_t *src_2, + container_t **dst +){ + int totalCardinality = src_1->cardinality + src_2->cardinality; + // + // We assume that operations involving bitset containers will be faster than + // operations involving solely array containers, except maybe when array containers + // are small. Indeed, for example, it is cheap to compute the union between an array and + // a bitset container, generally more so than between a large array and another array. + // So it is advantageous to favour bitset containers during the computation. + // Of course, if we convert array containers eagerly to bitset containers, we may later + // need to revert the bitset containers to array containerr to satisfy the Roaring format requirements, + // but such one-time conversions at the end may not be overly expensive. 
We arrived to this design + // based on extensive benchmarking. + // + if (totalCardinality <= ARRAY_LAZY_LOWERBOUND) { + *dst = array_container_create_given_capacity(totalCardinality); + if (*dst != NULL) { + array_container_union(src_1, src_2, CAST_array(*dst)); + } else { + return true; // otherwise failure won't be caught + } + return false; // not a bitset + } + *dst = bitset_container_create(); + bool returnval = true; // expect a bitset + if (*dst != NULL) { + bitset_container_t *ourbitset = CAST_bitset(*dst); + bitset_set_list(ourbitset->words, src_1->array, src_1->cardinality); + bitset_set_list(ourbitset->words, src_2->array, src_2->cardinality); + ourbitset->cardinality = BITSET_UNKNOWN_CARDINALITY; + } + return returnval; +} + + +bool array_array_container_lazy_inplace_union( + array_container_t *src_1, const array_container_t *src_2, + container_t **dst +){ + int totalCardinality = src_1->cardinality + src_2->cardinality; + *dst = NULL; + // + // We assume that operations involving bitset containers will be faster than + // operations involving solely array containers, except maybe when array containers + // are small. Indeed, for example, it is cheap to compute the union between an array and + // a bitset container, generally more so than between a large array and another array. + // So it is advantageous to favour bitset containers during the computation. + // Of course, if we convert array containers eagerly to bitset containers, we may later + // need to revert the bitset containers to array containerr to satisfy the Roaring format requirements, + // but such one-time conversions at the end may not be overly expensive. We arrived to this design + // based on extensive benchmarking. 
+ // + if (totalCardinality <= ARRAY_LAZY_LOWERBOUND) { + if(src_1->capacity < totalCardinality) { + *dst = array_container_create_given_capacity(2 * totalCardinality); // be purposefully generous + if (*dst != NULL) { + array_container_union(src_1, src_2, CAST_array(*dst)); + } else { + return true; // otherwise failure won't be caught + } + return false; // not a bitset + } else { + memmove(src_1->array + src_2->cardinality, src_1->array, src_1->cardinality * sizeof(uint16_t)); + /* + Next line is safe: + + We just need to focus on the reading and writing performed on array1. In `union_vector16`, both vectorized and scalar code still obey the basic rule: read from two inputs, do the union, and then write the output. + + Let's say the length(cardinality) of input2 is L2: + ``` + |<- L2 ->| + array1: [output--- |input 1---|---] + array2: [input 2---] + ``` + Let's define 3 __m128i pointers, `pos1` starts from `input1`, `pos2` starts from `input2`, these 2 point at the next byte to read, `out` starts from `output`, pointing at the next byte to overwrite. + ``` + array1: [output--- |input 1---|---] + ^ ^ + out pos1 + array2: [input 2---] + ^ + pos2 + ``` + The union output always contains less or equal number of elements than all inputs added, so we have: + ``` + out <= pos1 + pos2 + ``` + therefore: + ``` + out <= pos1 + L2 + ``` + which means you will not overwrite data beyond pos1, so the data haven't read is safe, and we don't care the data already read. 
+ */ + src_1->cardinality = (int32_t)fast_union_uint16(src_1->array + src_2->cardinality, src_1->cardinality, + src_2->array, src_2->cardinality, src_1->array); + return false; // not a bitset + } + } + *dst = bitset_container_create(); + bool returnval = true; // expect a bitset + if (*dst != NULL) { + bitset_container_t *ourbitset = CAST_bitset(*dst); + bitset_set_list(ourbitset->words, src_1->array, src_1->cardinality); + bitset_set_list(ourbitset->words, src_2->array, src_2->cardinality); + ourbitset->cardinality = BITSET_UNKNOWN_CARDINALITY; + } + return returnval; +} + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif +/* end file src/containers/mixed_union.c */ +/* begin file src/containers/mixed_xor.c */ +/* + * mixed_xor.c + */ + +#include +#include + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +/* Compute the xor of src_1 and src_2 and write the result to + * dst (which has no container initially). + * Result is true iff dst is a bitset */ +bool array_bitset_container_xor( + const array_container_t *src_1, const bitset_container_t *src_2, + container_t **dst +){ + bitset_container_t *result = bitset_container_create(); + bitset_container_copy(src_2, result); + result->cardinality = (int32_t)bitset_flip_list_withcard( + result->words, result->cardinality, src_1->array, src_1->cardinality); + + // do required type conversions. + if (result->cardinality <= DEFAULT_MAX_SIZE) { + *dst = array_container_from_bitset(result); + bitset_container_free(result); + return false; // not bitset + } + *dst = result; + return true; // bitset +} + +/* Compute the xor of src_1 and src_2 and write the result to + * dst. It is allowed for src_2 to be dst. This version does not + * update the cardinality of dst (it is set to BITSET_UNKNOWN_CARDINALITY). 
+ */ + +void array_bitset_container_lazy_xor(const array_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst) { + if (src_2 != dst) bitset_container_copy(src_2, dst); + bitset_flip_list(dst->words, src_1->array, src_1->cardinality); + dst->cardinality = BITSET_UNKNOWN_CARDINALITY; +} + +/* Compute the xor of src_1 and src_2 and write the result to + * dst. Result may be either a bitset or an array container + * (returns "result is bitset"). dst does not initially have + * any container, but becomes either a bitset container (return + * result true) or an array container. + */ + +bool run_bitset_container_xor( + const run_container_t *src_1, const bitset_container_t *src_2, + container_t **dst +){ + bitset_container_t *result = bitset_container_create(); + + bitset_container_copy(src_2, result); + for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) { + rle16_t rle = src_1->runs[rlepos]; + bitset_flip_range(result->words, rle.value, + rle.value + rle.length + UINT32_C(1)); + } + result->cardinality = bitset_container_compute_cardinality(result); + + if (result->cardinality <= DEFAULT_MAX_SIZE) { + *dst = array_container_from_bitset(result); + bitset_container_free(result); + return false; // not bitset + } + *dst = result; + return true; // bitset +} + +/* lazy xor. Dst is initialized and may be equal to src_2. + * Result is left as a bitset container, even if actual + * cardinality would dictate an array container. + */ + +void run_bitset_container_lazy_xor(const run_container_t *src_1, + const bitset_container_t *src_2, + bitset_container_t *dst) { + if (src_2 != dst) bitset_container_copy(src_2, dst); + for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) { + rle16_t rle = src_1->runs[rlepos]; + bitset_flip_range(dst->words, rle.value, + rle.value + rle.length + UINT32_C(1)); + } + dst->cardinality = BITSET_UNKNOWN_CARDINALITY; +} + +/* dst does not indicate a valid container initially. 
Eventually it + * can become any kind of container. + */ + +int array_run_container_xor( + const array_container_t *src_1, const run_container_t *src_2, + container_t **dst +){ + // semi following Java XOR implementation as of May 2016 + // the C OR implementation works quite differently and can return a run + // container + // TODO could optimize for full run containers. + + // use of lazy following Java impl. + const int arbitrary_threshold = 32; + if (src_1->cardinality < arbitrary_threshold) { + run_container_t *ans = run_container_create(); + array_run_container_lazy_xor(src_1, src_2, ans); // keeps runs. + uint8_t typecode_after; + *dst = + convert_run_to_efficient_container_and_free(ans, &typecode_after); + return typecode_after; + } + + int card = run_container_cardinality(src_2); + if (card <= DEFAULT_MAX_SIZE) { + // Java implementation works with the array, xoring the run elements via + // iterator + array_container_t *temp = array_container_from_run(src_2); + bool ret_is_bitset = array_array_container_xor(temp, src_1, dst); + array_container_free(temp); + return ret_is_bitset ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE; + + } else { // guess that it will end up as a bitset + bitset_container_t *result = bitset_container_from_run(src_2); + bool is_bitset = bitset_array_container_ixor(result, src_1, dst); + // any necessary type conversion has been done by the ixor + int retval = (is_bitset ? BITSET_CONTAINER_TYPE + : ARRAY_CONTAINER_TYPE); + return retval; + } +} + +/* Dst is a valid run container. (Can it be src_2? Let's say not.) + * Leaves result as run container, even if other options are + * smaller. 
+ */ + +void array_run_container_lazy_xor(const array_container_t *src_1, + const run_container_t *src_2, + run_container_t *dst) { + run_container_grow(dst, src_1->cardinality + src_2->n_runs, false); + int32_t rlepos = 0; + int32_t arraypos = 0; + dst->n_runs = 0; + + while ((rlepos < src_2->n_runs) && (arraypos < src_1->cardinality)) { + if (src_2->runs[rlepos].value <= src_1->array[arraypos]) { + run_container_smart_append_exclusive(dst, src_2->runs[rlepos].value, + src_2->runs[rlepos].length); + rlepos++; + } else { + run_container_smart_append_exclusive(dst, src_1->array[arraypos], + 0); + arraypos++; + } + } + while (arraypos < src_1->cardinality) { + run_container_smart_append_exclusive(dst, src_1->array[arraypos], 0); + arraypos++; + } + while (rlepos < src_2->n_runs) { + run_container_smart_append_exclusive(dst, src_2->runs[rlepos].value, + src_2->runs[rlepos].length); + rlepos++; + } +} + +/* dst does not indicate a valid container initially. Eventually it + * can become any kind of container. + */ + +int run_run_container_xor( + const run_container_t *src_1, const run_container_t *src_2, + container_t **dst +){ + run_container_t *ans = run_container_create(); + run_container_xor(src_1, src_2, ans); + uint8_t typecode_after; + *dst = convert_run_to_efficient_container_and_free(ans, &typecode_after); + return typecode_after; +} + +/* + * Java implementation (as of May 2016) for array_run, run_run + * and bitset_run don't do anything different for inplace. 
+ * Could adopt the mixed_union.c approach instead (ie, using + * smart_append_exclusive) + * + */ + +bool array_array_container_xor( + const array_container_t *src_1, const array_container_t *src_2, + container_t **dst +){ + int totalCardinality = + src_1->cardinality + src_2->cardinality; // upper bound + if (totalCardinality <= DEFAULT_MAX_SIZE) { + *dst = array_container_create_given_capacity(totalCardinality); + array_container_xor(src_1, src_2, CAST_array(*dst)); + return false; // not a bitset + } + *dst = bitset_container_from_array(src_1); + bool returnval = true; // expect a bitset + bitset_container_t *ourbitset = CAST_bitset(*dst); + ourbitset->cardinality = (uint32_t)bitset_flip_list_withcard( + ourbitset->words, src_1->cardinality, src_2->array, src_2->cardinality); + if (ourbitset->cardinality <= DEFAULT_MAX_SIZE) { + // need to convert! + *dst = array_container_from_bitset(ourbitset); + bitset_container_free(ourbitset); + returnval = false; // not going to be a bitset + } + + return returnval; +} + +bool array_array_container_lazy_xor( + const array_container_t *src_1, const array_container_t *src_2, + container_t **dst +){ + int totalCardinality = src_1->cardinality + src_2->cardinality; + // + // We assume that operations involving bitset containers will be faster than + // operations involving solely array containers, except maybe when array containers + // are small. Indeed, for example, it is cheap to compute the exclusive union between an array and + // a bitset container, generally more so than between a large array and another array. + // So it is advantageous to favour bitset containers during the computation. + // Of course, if we convert array containers eagerly to bitset containers, we may later + // need to revert the bitset containers to array containerr to satisfy the Roaring format requirements, + // but such one-time conversions at the end may not be overly expensive. 
We arrived to this design + // based on extensive benchmarking on unions. + // For XOR/exclusive union, we simply followed the heuristic used by the unions (see mixed_union.c). + // Further tuning is possible. + // + if (totalCardinality <= ARRAY_LAZY_LOWERBOUND) { + *dst = array_container_create_given_capacity(totalCardinality); + if (*dst != NULL) + array_container_xor(src_1, src_2, CAST_array(*dst)); + return false; // not a bitset + } + *dst = bitset_container_from_array(src_1); + bool returnval = true; // expect a bitset (maybe, for XOR??) + if (*dst != NULL) { + bitset_container_t *ourbitset = CAST_bitset(*dst); + bitset_flip_list(ourbitset->words, src_2->array, src_2->cardinality); + ourbitset->cardinality = BITSET_UNKNOWN_CARDINALITY; + } + return returnval; +} + +/* Compute the xor of src_1 and src_2 and write the result to + * dst (which has no container initially). Return value is + * "dst is a bitset" + */ + +bool bitset_bitset_container_xor( + const bitset_container_t *src_1, const bitset_container_t *src_2, + container_t **dst +){ + bitset_container_t *ans = bitset_container_create(); + int card = bitset_container_xor(src_1, src_2, ans); + if (card <= DEFAULT_MAX_SIZE) { + *dst = array_container_from_bitset(ans); + bitset_container_free(ans); + return false; // not bitset + } else { + *dst = ans; + return true; + } +} + +/* Compute the xor of src_1 and src_2 and write the result to + * dst (which has no container initially). It will modify src_1 + * to be dst if the result is a bitset. Otherwise, it will + * free src_1 and dst will be a new array container. In both + * cases, the caller is responsible for deallocating dst. 
+ * Returns true iff dst is a bitset */ + +bool bitset_array_container_ixor( + bitset_container_t *src_1, const array_container_t *src_2, + container_t **dst +){ + *dst = src_1; + src_1->cardinality = (uint32_t)bitset_flip_list_withcard( + src_1->words, src_1->cardinality, src_2->array, src_2->cardinality); + + if (src_1->cardinality <= DEFAULT_MAX_SIZE) { + *dst = array_container_from_bitset(src_1); + bitset_container_free(src_1); + return false; // not bitset + } else + return true; +} + +/* a bunch of in-place, some of which may not *really* be inplace. + * TODO: write actual inplace routine if efficiency warrants it + * Anything inplace with a bitset is a good candidate + */ + +bool bitset_bitset_container_ixor( + bitset_container_t *src_1, const bitset_container_t *src_2, + container_t **dst +){ + int card = bitset_container_xor(src_1, src_2, src_1); + if (card <= DEFAULT_MAX_SIZE) { + *dst = array_container_from_bitset(src_1); + bitset_container_free(src_1); + return false; // not bitset + } else { + *dst = src_1; + return true; + } +} + +bool array_bitset_container_ixor( + array_container_t *src_1, const bitset_container_t *src_2, + container_t **dst +){ + bool ans = array_bitset_container_xor(src_1, src_2, dst); + array_container_free(src_1); + return ans; +} + +/* Compute the xor of src_1 and src_2 and write the result to + * dst. Result may be either a bitset or an array container + * (returns "result is bitset"). dst does not initially have + * any container, but becomes either a bitset container (return + * result true) or an array container. 
+ */ + +bool run_bitset_container_ixor( + run_container_t *src_1, const bitset_container_t *src_2, + container_t **dst +){ + bool ans = run_bitset_container_xor(src_1, src_2, dst); + run_container_free(src_1); + return ans; +} + +bool bitset_run_container_ixor( + bitset_container_t *src_1, const run_container_t *src_2, + container_t **dst +){ + bool ans = run_bitset_container_xor(src_2, src_1, dst); + bitset_container_free(src_1); + return ans; +} + +/* dst does not indicate a valid container initially. Eventually it + * can become any kind of container. + */ + +int array_run_container_ixor( + array_container_t *src_1, const run_container_t *src_2, + container_t **dst +){ + int ans = array_run_container_xor(src_1, src_2, dst); + array_container_free(src_1); + return ans; +} + +int run_array_container_ixor( + run_container_t *src_1, const array_container_t *src_2, + container_t **dst +){ + int ans = array_run_container_xor(src_2, src_1, dst); + run_container_free(src_1); + return ans; +} + +bool array_array_container_ixor( + array_container_t *src_1, const array_container_t *src_2, + container_t **dst +){ + bool ans = array_array_container_xor(src_1, src_2, dst); + array_container_free(src_1); + return ans; +} + +int run_run_container_ixor( + run_container_t *src_1, const run_container_t *src_2, + container_t **dst +){ + int ans = run_run_container_xor(src_1, src_2, dst); + run_container_free(src_1); + return ans; +} + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif +/* end file src/containers/mixed_xor.c */ +/* begin file src/containers/run.c */ +#include +#include + + +#if CROARING_IS_X64 +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." 
+#endif // CROARING_COMPILER_SUPPORTS_AVX512 +#endif + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +extern inline uint16_t run_container_minimum(const run_container_t *run); +extern inline uint16_t run_container_maximum(const run_container_t *run); +extern inline int32_t interleavedBinarySearch(const rle16_t *array, + int32_t lenarray, uint16_t ikey); +extern inline bool run_container_contains(const run_container_t *run, + uint16_t pos); +extern inline int run_container_index_equalorlarger(const run_container_t *arr, uint16_t x); +extern inline bool run_container_is_full(const run_container_t *run); +extern inline bool run_container_nonzero_cardinality(const run_container_t *rc); +extern inline int32_t run_container_serialized_size_in_bytes(int32_t num_runs); +extern inline run_container_t *run_container_create_range(uint32_t start, + uint32_t stop); +extern inline int run_container_cardinality(const run_container_t *run); + + +bool run_container_add(run_container_t *run, uint16_t pos) { + int32_t index = interleavedBinarySearch(run->runs, run->n_runs, pos); + if (index >= 0) return false; // already there + index = -index - 2; // points to preceding value, possibly -1 + if (index >= 0) { // possible match + int32_t offset = pos - run->runs[index].value; + int32_t le = run->runs[index].length; + if (offset <= le) return false; // already there + if (offset == le + 1) { + // we may need to fuse + if (index + 1 < run->n_runs) { + if (run->runs[index + 1].value == pos + 1) { + // indeed fusion is needed + run->runs[index].length = run->runs[index + 1].value + + run->runs[index + 1].length - + run->runs[index].value; + recoverRoomAtIndex(run, (uint16_t)(index + 1)); + return true; + } + } + run->runs[index].length++; + return true; + } + if (index + 1 < run->n_runs) { + // we may need to fuse + if (run->runs[index + 1].value == pos + 1) { + // indeed fusion is needed + run->runs[index + 1].value = pos; + run->runs[index + 
1].length = run->runs[index + 1].length + 1; + return true; + } + } + } + if (index == -1) { + // we may need to extend the first run + if (0 < run->n_runs) { + if (run->runs[0].value == pos + 1) { + run->runs[0].length++; + run->runs[0].value--; + return true; + } + } + } + makeRoomAtIndex(run, (uint16_t)(index + 1)); + run->runs[index + 1].value = pos; + run->runs[index + 1].length = 0; + return true; +} + +/* Create a new run container. Return NULL in case of failure. */ +run_container_t *run_container_create_given_capacity(int32_t size) { + run_container_t *run; + /* Allocate the run container itself. */ + if ((run = (run_container_t *)roaring_malloc(sizeof(run_container_t))) == NULL) { + return NULL; + } + if (size <= 0 ) { // we don't want to rely on malloc(0) + run->runs = NULL; + } else if ((run->runs = (rle16_t *)roaring_malloc(sizeof(rle16_t) * size)) == NULL) { + roaring_free(run); + return NULL; + } + run->capacity = size; + run->n_runs = 0; + return run; +} + +int run_container_shrink_to_fit(run_container_t *src) { + if (src->n_runs == src->capacity) return 0; // nothing to do + int savings = src->capacity - src->n_runs; + src->capacity = src->n_runs; + rle16_t *oldruns = src->runs; + src->runs = (rle16_t *)roaring_realloc(oldruns, src->capacity * sizeof(rle16_t)); + if (src->runs == NULL) roaring_free(oldruns); // should never happen? + return savings; +} +/* Create a new run container. Return NULL in case of failure. 
*/ +run_container_t *run_container_create(void) { + return run_container_create_given_capacity(RUN_DEFAULT_INIT_SIZE); +} + +run_container_t *run_container_clone(const run_container_t *src) { + run_container_t *run = run_container_create_given_capacity(src->capacity); + if (run == NULL) return NULL; + run->capacity = src->capacity; + run->n_runs = src->n_runs; + memcpy(run->runs, src->runs, src->n_runs * sizeof(rle16_t)); + return run; +} + +void run_container_offset(const run_container_t *c, + container_t **loc, container_t **hic, + uint16_t offset) { + run_container_t *lo = NULL, *hi = NULL; + + bool split; + int lo_cap, hi_cap; + int top, pivot; + + top = (1 << 16) - offset; + pivot = run_container_index_equalorlarger(c, top); + + if (pivot == -1) { + split = false; + lo_cap = c->n_runs; + hi_cap = 0; + } else { + split = c->runs[pivot].value < top; + lo_cap = pivot + (split ? 1 : 0); + hi_cap = c->n_runs - pivot; + } + + if (loc && lo_cap) { + lo = run_container_create_given_capacity(lo_cap); + memcpy(lo->runs, c->runs, lo_cap*sizeof(rle16_t)); + lo->n_runs = lo_cap; + for (int i = 0; i < lo_cap; ++i) { + lo->runs[i].value += offset; + } + *loc = (container_t*)lo; + } + + if (hic && hi_cap) { + hi = run_container_create_given_capacity(hi_cap); + memcpy(hi->runs, c->runs+pivot, hi_cap*sizeof(rle16_t)); + hi->n_runs = hi_cap; + for (int i = 0; i < hi_cap; ++i) { + hi->runs[i].value += offset; + } + *hic = (container_t*)hi; + } + + // Fix the split. + if (split) { + if (lo != NULL) { + // Add the missing run to 'lo', exhausting length. + lo->runs[lo->n_runs-1].length = (1 << 16) - lo->runs[lo->n_runs-1].value - 1; + } + + if (hi != NULL) { + // Fix the first run in 'hi'. + hi->runs[0].length -= UINT16_MAX - hi->runs[0].value + 1; + hi->runs[0].value = 0; + } + } +} + +/* Free memory. 
*/ +void run_container_free(run_container_t *run) { + if(run->runs != NULL) {// Jon Strabala reports that some tools complain otherwise + roaring_free(run->runs); + run->runs = NULL; // pedantic + } + roaring_free(run); +} + +void run_container_grow(run_container_t *run, int32_t min, bool copy) { + int32_t newCapacity = + (run->capacity == 0) + ? RUN_DEFAULT_INIT_SIZE + : run->capacity < 64 ? run->capacity * 2 + : run->capacity < 1024 ? run->capacity * 3 / 2 + : run->capacity * 5 / 4; + if (newCapacity < min) newCapacity = min; + run->capacity = newCapacity; + assert(run->capacity >= min); + if (copy) { + rle16_t *oldruns = run->runs; + run->runs = + (rle16_t *)roaring_realloc(oldruns, run->capacity * sizeof(rle16_t)); + if (run->runs == NULL) roaring_free(oldruns); + } else { + // Jon Strabala reports that some tools complain otherwise + if (run->runs != NULL) { + roaring_free(run->runs); + } + run->runs = (rle16_t *)roaring_malloc(run->capacity * sizeof(rle16_t)); + } + // handle the case where realloc fails + if (run->runs == NULL) { + fprintf(stderr, "could not allocate memory\n"); + } + assert(run->runs != NULL); +} + +/* copy one container into another */ +void run_container_copy(const run_container_t *src, run_container_t *dst) { + const int32_t n_runs = src->n_runs; + if (src->n_runs > dst->capacity) { + run_container_grow(dst, n_runs, false); + } + dst->n_runs = n_runs; + memcpy(dst->runs, src->runs, sizeof(rle16_t) * n_runs); +} + +/* Compute the union of `src_1' and `src_2' and write the result to `dst' + * It is assumed that `dst' is distinct from both `src_1' and `src_2'. 
*/ +void run_container_union(const run_container_t *src_1, + const run_container_t *src_2, run_container_t *dst) { + // TODO: this could be a lot more efficient + + // we start out with inexpensive checks + const bool if1 = run_container_is_full(src_1); + const bool if2 = run_container_is_full(src_2); + if (if1 || if2) { + if (if1) { + run_container_copy(src_1, dst); + return; + } + if (if2) { + run_container_copy(src_2, dst); + return; + } + } + const int32_t neededcapacity = src_1->n_runs + src_2->n_runs; + if (dst->capacity < neededcapacity) + run_container_grow(dst, neededcapacity, false); + dst->n_runs = 0; + int32_t rlepos = 0; + int32_t xrlepos = 0; + + rle16_t previousrle; + if (src_1->runs[rlepos].value <= src_2->runs[xrlepos].value) { + previousrle = run_container_append_first(dst, src_1->runs[rlepos]); + rlepos++; + } else { + previousrle = run_container_append_first(dst, src_2->runs[xrlepos]); + xrlepos++; + } + + while ((xrlepos < src_2->n_runs) && (rlepos < src_1->n_runs)) { + rle16_t newrl; + if (src_1->runs[rlepos].value <= src_2->runs[xrlepos].value) { + newrl = src_1->runs[rlepos]; + rlepos++; + } else { + newrl = src_2->runs[xrlepos]; + xrlepos++; + } + run_container_append(dst, newrl, &previousrle); + } + while (xrlepos < src_2->n_runs) { + run_container_append(dst, src_2->runs[xrlepos], &previousrle); + xrlepos++; + } + while (rlepos < src_1->n_runs) { + run_container_append(dst, src_1->runs[rlepos], &previousrle); + rlepos++; + } +} + +/* Compute the union of `src_1' and `src_2' and write the result to `src_1' + */ +void run_container_union_inplace(run_container_t *src_1, + const run_container_t *src_2) { + // TODO: this could be a lot more efficient + + // we start out with inexpensive checks + const bool if1 = run_container_is_full(src_1); + const bool if2 = run_container_is_full(src_2); + if (if1 || if2) { + if (if1) { + return; + } + if (if2) { + run_container_copy(src_2, src_1); + return; + } + } + // we move the data to the end of the 
current array + const int32_t maxoutput = src_1->n_runs + src_2->n_runs; + const int32_t neededcapacity = maxoutput + src_1->n_runs; + if (src_1->capacity < neededcapacity) + run_container_grow(src_1, neededcapacity, true); + memmove(src_1->runs + maxoutput, src_1->runs, + src_1->n_runs * sizeof(rle16_t)); + rle16_t *inputsrc1 = src_1->runs + maxoutput; + const int32_t input1nruns = src_1->n_runs; + src_1->n_runs = 0; + int32_t rlepos = 0; + int32_t xrlepos = 0; + + rle16_t previousrle; + if (inputsrc1[rlepos].value <= src_2->runs[xrlepos].value) { + previousrle = run_container_append_first(src_1, inputsrc1[rlepos]); + rlepos++; + } else { + previousrle = run_container_append_first(src_1, src_2->runs[xrlepos]); + xrlepos++; + } + while ((xrlepos < src_2->n_runs) && (rlepos < input1nruns)) { + rle16_t newrl; + if (inputsrc1[rlepos].value <= src_2->runs[xrlepos].value) { + newrl = inputsrc1[rlepos]; + rlepos++; + } else { + newrl = src_2->runs[xrlepos]; + xrlepos++; + } + run_container_append(src_1, newrl, &previousrle); + } + while (xrlepos < src_2->n_runs) { + run_container_append(src_1, src_2->runs[xrlepos], &previousrle); + xrlepos++; + } + while (rlepos < input1nruns) { + run_container_append(src_1, inputsrc1[rlepos], &previousrle); + rlepos++; + } +} + +/* Compute the symmetric difference of `src_1' and `src_2' and write the result + * to `dst' + * It is assumed that `dst' is distinct from both `src_1' and `src_2'. 
*/ +void run_container_xor(const run_container_t *src_1, + const run_container_t *src_2, run_container_t *dst) { + // don't bother to convert xor with full range into negation + // since negation is implemented similarly + + const int32_t neededcapacity = src_1->n_runs + src_2->n_runs; + if (dst->capacity < neededcapacity) + run_container_grow(dst, neededcapacity, false); + + int32_t pos1 = 0; + int32_t pos2 = 0; + dst->n_runs = 0; + + while ((pos1 < src_1->n_runs) && (pos2 < src_2->n_runs)) { + if (src_1->runs[pos1].value <= src_2->runs[pos2].value) { + run_container_smart_append_exclusive(dst, src_1->runs[pos1].value, + src_1->runs[pos1].length); + pos1++; + } else { + run_container_smart_append_exclusive(dst, src_2->runs[pos2].value, + src_2->runs[pos2].length); + pos2++; + } + } + while (pos1 < src_1->n_runs) { + run_container_smart_append_exclusive(dst, src_1->runs[pos1].value, + src_1->runs[pos1].length); + pos1++; + } + + while (pos2 < src_2->n_runs) { + run_container_smart_append_exclusive(dst, src_2->runs[pos2].value, + src_2->runs[pos2].length); + pos2++; + } +} + +/* Compute the intersection of src_1 and src_2 and write the result to + * dst. It is assumed that dst is distinct from both src_1 and src_2. 
*/ +void run_container_intersection(const run_container_t *src_1, + const run_container_t *src_2, + run_container_t *dst) { + const bool if1 = run_container_is_full(src_1); + const bool if2 = run_container_is_full(src_2); + if (if1 || if2) { + if (if1) { + run_container_copy(src_2, dst); + return; + } + if (if2) { + run_container_copy(src_1, dst); + return; + } + } + // TODO: this could be a lot more efficient, could use SIMD optimizations + const int32_t neededcapacity = src_1->n_runs + src_2->n_runs; + if (dst->capacity < neededcapacity) + run_container_grow(dst, neededcapacity, false); + dst->n_runs = 0; + int32_t rlepos = 0; + int32_t xrlepos = 0; + int32_t start = src_1->runs[rlepos].value; + int32_t end = start + src_1->runs[rlepos].length + 1; + int32_t xstart = src_2->runs[xrlepos].value; + int32_t xend = xstart + src_2->runs[xrlepos].length + 1; + while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) { + if (end <= xstart) { + ++rlepos; + if (rlepos < src_1->n_runs) { + start = src_1->runs[rlepos].value; + end = start + src_1->runs[rlepos].length + 1; + } + } else if (xend <= start) { + ++xrlepos; + if (xrlepos < src_2->n_runs) { + xstart = src_2->runs[xrlepos].value; + xend = xstart + src_2->runs[xrlepos].length + 1; + } + } else { // they overlap + const int32_t lateststart = start > xstart ? 
start : xstart; + int32_t earliestend; + if (end == xend) { // improbable + earliestend = end; + rlepos++; + xrlepos++; + if (rlepos < src_1->n_runs) { + start = src_1->runs[rlepos].value; + end = start + src_1->runs[rlepos].length + 1; + } + if (xrlepos < src_2->n_runs) { + xstart = src_2->runs[xrlepos].value; + xend = xstart + src_2->runs[xrlepos].length + 1; + } + } else if (end < xend) { + earliestend = end; + rlepos++; + if (rlepos < src_1->n_runs) { + start = src_1->runs[rlepos].value; + end = start + src_1->runs[rlepos].length + 1; + } + + } else { // end > xend + earliestend = xend; + xrlepos++; + if (xrlepos < src_2->n_runs) { + xstart = src_2->runs[xrlepos].value; + xend = xstart + src_2->runs[xrlepos].length + 1; + } + } + dst->runs[dst->n_runs].value = (uint16_t)lateststart; + dst->runs[dst->n_runs].length = + (uint16_t)(earliestend - lateststart - 1); + dst->n_runs++; + } + } +} + +/* Compute the size of the intersection of src_1 and src_2 . */ +int run_container_intersection_cardinality(const run_container_t *src_1, + const run_container_t *src_2) { + const bool if1 = run_container_is_full(src_1); + const bool if2 = run_container_is_full(src_2); + if (if1 || if2) { + if (if1) { + return run_container_cardinality(src_2); + } + if (if2) { + return run_container_cardinality(src_1); + } + } + int answer = 0; + int32_t rlepos = 0; + int32_t xrlepos = 0; + int32_t start = src_1->runs[rlepos].value; + int32_t end = start + src_1->runs[rlepos].length + 1; + int32_t xstart = src_2->runs[xrlepos].value; + int32_t xend = xstart + src_2->runs[xrlepos].length + 1; + while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) { + if (end <= xstart) { + ++rlepos; + if (rlepos < src_1->n_runs) { + start = src_1->runs[rlepos].value; + end = start + src_1->runs[rlepos].length + 1; + } + } else if (xend <= start) { + ++xrlepos; + if (xrlepos < src_2->n_runs) { + xstart = src_2->runs[xrlepos].value; + xend = xstart + src_2->runs[xrlepos].length + 1; + } + } else { // 
they overlap + const int32_t lateststart = start > xstart ? start : xstart; + int32_t earliestend; + if (end == xend) { // improbable + earliestend = end; + rlepos++; + xrlepos++; + if (rlepos < src_1->n_runs) { + start = src_1->runs[rlepos].value; + end = start + src_1->runs[rlepos].length + 1; + } + if (xrlepos < src_2->n_runs) { + xstart = src_2->runs[xrlepos].value; + xend = xstart + src_2->runs[xrlepos].length + 1; + } + } else if (end < xend) { + earliestend = end; + rlepos++; + if (rlepos < src_1->n_runs) { + start = src_1->runs[rlepos].value; + end = start + src_1->runs[rlepos].length + 1; + } + + } else { // end > xend + earliestend = xend; + xrlepos++; + if (xrlepos < src_2->n_runs) { + xstart = src_2->runs[xrlepos].value; + xend = xstart + src_2->runs[xrlepos].length + 1; + } + } + answer += earliestend - lateststart; + } + } + return answer; +} + +bool run_container_intersect(const run_container_t *src_1, + const run_container_t *src_2) { + const bool if1 = run_container_is_full(src_1); + const bool if2 = run_container_is_full(src_2); + if (if1 || if2) { + if (if1) { + return !run_container_empty(src_2); + } + if (if2) { + return !run_container_empty(src_1); + } + } + int32_t rlepos = 0; + int32_t xrlepos = 0; + int32_t start = src_1->runs[rlepos].value; + int32_t end = start + src_1->runs[rlepos].length + 1; + int32_t xstart = src_2->runs[xrlepos].value; + int32_t xend = xstart + src_2->runs[xrlepos].length + 1; + while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) { + if (end <= xstart) { + ++rlepos; + if (rlepos < src_1->n_runs) { + start = src_1->runs[rlepos].value; + end = start + src_1->runs[rlepos].length + 1; + } + } else if (xend <= start) { + ++xrlepos; + if (xrlepos < src_2->n_runs) { + xstart = src_2->runs[xrlepos].value; + xend = xstart + src_2->runs[xrlepos].length + 1; + } + } else { // they overlap + return true; + } + } + return false; +} + + +/* Compute the difference of src_1 and src_2 and write the result to + * dst. 
It is assumed that dst is distinct from both src_1 and src_2. */ +void run_container_andnot(const run_container_t *src_1, + const run_container_t *src_2, run_container_t *dst) { + // following Java implementation as of June 2016 + + if (dst->capacity < src_1->n_runs + src_2->n_runs) + run_container_grow(dst, src_1->n_runs + src_2->n_runs, false); + + dst->n_runs = 0; + + int rlepos1 = 0; + int rlepos2 = 0; + int32_t start = src_1->runs[rlepos1].value; + int32_t end = start + src_1->runs[rlepos1].length + 1; + int32_t start2 = src_2->runs[rlepos2].value; + int32_t end2 = start2 + src_2->runs[rlepos2].length + 1; + + while ((rlepos1 < src_1->n_runs) && (rlepos2 < src_2->n_runs)) { + if (end <= start2) { + // output the first run + dst->runs[dst->n_runs++] = MAKE_RLE16(start, end - start - 1); + rlepos1++; + if (rlepos1 < src_1->n_runs) { + start = src_1->runs[rlepos1].value; + end = start + src_1->runs[rlepos1].length + 1; + } + } else if (end2 <= start) { + // exit the second run + rlepos2++; + if (rlepos2 < src_2->n_runs) { + start2 = src_2->runs[rlepos2].value; + end2 = start2 + src_2->runs[rlepos2].length + 1; + } + } else { + if (start < start2) { + dst->runs[dst->n_runs++] = + MAKE_RLE16(start, start2 - start - 1); + } + if (end2 < end) { + start = end2; + } else { + rlepos1++; + if (rlepos1 < src_1->n_runs) { + start = src_1->runs[rlepos1].value; + end = start + src_1->runs[rlepos1].length + 1; + } + } + } + } + if (rlepos1 < src_1->n_runs) { + dst->runs[dst->n_runs++] = MAKE_RLE16(start, end - start - 1); + rlepos1++; + if (rlepos1 < src_1->n_runs) { + memcpy(dst->runs + dst->n_runs, src_1->runs + rlepos1, + sizeof(rle16_t) * (src_1->n_runs - rlepos1)); + dst->n_runs += src_1->n_runs - rlepos1; + } + } +} + +ALLOW_UNALIGNED +int run_container_to_uint32_array(void *vout, const run_container_t *cont, + uint32_t base) { + int outpos = 0; + uint32_t *out = (uint32_t *)vout; + for (int i = 0; i < cont->n_runs; ++i) { + uint32_t run_start = base + 
cont->runs[i].value; + uint16_t le = cont->runs[i].length; + for (int j = 0; j <= le; ++j) { + uint32_t val = run_start + j; + memcpy(out + outpos, &val, + sizeof(uint32_t)); // should be compiled as a MOV on x64 + outpos++; + } + } + return outpos; +} + +/* + * Print this container using printf (useful for debugging). + */ +void run_container_printf(const run_container_t *cont) { + for (int i = 0; i < cont->n_runs; ++i) { + uint16_t run_start = cont->runs[i].value; + uint16_t le = cont->runs[i].length; + printf("[%d,%d]", run_start, run_start + le); + } +} + +/* + * Print this container using printf as a comma-separated list of 32-bit + * integers starting at base. + */ +void run_container_printf_as_uint32_array(const run_container_t *cont, + uint32_t base) { + if (cont->n_runs == 0) return; + { + uint32_t run_start = base + cont->runs[0].value; + uint16_t le = cont->runs[0].length; + printf("%u", run_start); + for (uint32_t j = 1; j <= le; ++j) printf(",%u", run_start + j); + } + for (int32_t i = 1; i < cont->n_runs; ++i) { + uint32_t run_start = base + cont->runs[i].value; + uint16_t le = cont->runs[i].length; + for (uint32_t j = 0; j <= le; ++j) printf(",%u", run_start + j); + } +} + +int32_t run_container_write(const run_container_t *container, char *buf) { + uint16_t cast_16 = container->n_runs; + memcpy(buf, &cast_16, sizeof(uint16_t)); + memcpy(buf + sizeof(uint16_t), container->runs, + container->n_runs * sizeof(rle16_t)); + return run_container_size_in_bytes(container); +} + +int32_t run_container_read(int32_t cardinality, run_container_t *container, + const char *buf) { + (void)cardinality; + uint16_t cast_16; + memcpy(&cast_16, buf, sizeof(uint16_t)); + container->n_runs = cast_16; + if (container->n_runs > container->capacity) + run_container_grow(container, container->n_runs, false); + if(container->n_runs > 0) { + memcpy(container->runs, buf + sizeof(uint16_t), + container->n_runs * sizeof(rle16_t)); + } + return 
run_container_size_in_bytes(container); +} + +bool run_container_iterate(const run_container_t *cont, uint32_t base, + roaring_iterator iterator, void *ptr) { + for (int i = 0; i < cont->n_runs; ++i) { + uint32_t run_start = base + cont->runs[i].value; + uint16_t le = cont->runs[i].length; + + for (int j = 0; j <= le; ++j) + if (!iterator(run_start + j, ptr)) return false; + } + return true; +} + +bool run_container_iterate64(const run_container_t *cont, uint32_t base, + roaring_iterator64 iterator, uint64_t high_bits, + void *ptr) { + for (int i = 0; i < cont->n_runs; ++i) { + uint32_t run_start = base + cont->runs[i].value; + uint16_t le = cont->runs[i].length; + + for (int j = 0; j <= le; ++j) + if (!iterator(high_bits | (uint64_t)(run_start + j), ptr)) + return false; + } + return true; +} + +bool run_container_is_subset(const run_container_t *container1, + const run_container_t *container2) { + int i1 = 0, i2 = 0; + while (i1 < container1->n_runs && i2 < container2->n_runs) { + int start1 = container1->runs[i1].value; + int stop1 = start1 + container1->runs[i1].length; + int start2 = container2->runs[i2].value; + int stop2 = start2 + container2->runs[i2].length; + if (start1 < start2) { + return false; + } else { // start1 >= start2 + if (stop1 < stop2) { + i1++; + } else if (stop1 == stop2) { + i1++; + i2++; + } else { // stop1 > stop2 + i2++; + } + } + } + if (i1 == container1->n_runs) { + return true; + } else { + return false; + } +} + +// TODO: write smart_append_exclusive version to match the overloaded 1 param +// Java version (or is it even used?) + +// follows the Java implementation closely +// length is the rle-value. Ie, run [10,12) uses a length value 1. +void run_container_smart_append_exclusive(run_container_t *src, + const uint16_t start, + const uint16_t length) { + int old_end; + rle16_t *last_run = src->n_runs ? 
src->runs + (src->n_runs - 1) : NULL; + rle16_t *appended_last_run = src->runs + src->n_runs; + + if (!src->n_runs || + (start > (old_end = last_run->value + last_run->length + 1))) { + *appended_last_run = MAKE_RLE16(start, length); + src->n_runs++; + return; + } + if (old_end == start) { + // we merge + last_run->length += (length + 1); + return; + } + int new_end = start + length + 1; + + if (start == last_run->value) { + // wipe out previous + if (new_end < old_end) { + *last_run = MAKE_RLE16(new_end, old_end - new_end - 1); + return; + } else if (new_end > old_end) { + *last_run = MAKE_RLE16(old_end, new_end - old_end - 1); + return; + } else { + src->n_runs--; + return; + } + } + last_run->length = start - last_run->value - 1; + if (new_end < old_end) { + *appended_last_run = MAKE_RLE16(new_end, old_end - new_end - 1); + src->n_runs++; + } else if (new_end > old_end) { + *appended_last_run = MAKE_RLE16(old_end, new_end - old_end - 1); + src->n_runs++; + } +} + +bool run_container_select(const run_container_t *container, + uint32_t *start_rank, uint32_t rank, + uint32_t *element) { + for (int i = 0; i < container->n_runs; i++) { + uint16_t length = container->runs[i].length; + if (rank <= *start_rank + length) { + uint16_t value = container->runs[i].value; + *element = value + rank - (*start_rank); + return true; + } else + *start_rank += length + 1; + } + return false; +} + +int run_container_rank(const run_container_t *container, uint16_t x) { + int sum = 0; + uint32_t x32 = x; + for (int i = 0; i < container->n_runs; i++) { + uint32_t startpoint = container->runs[i].value; + uint32_t length = container->runs[i].length; + uint32_t endpoint = length + startpoint; + if (x <= endpoint) { + if (x < startpoint) break; + return sum + (x32 - startpoint) + 1; + } else { + sum += length + 1; + } + } + return sum; +} + +int run_container_get_index(const run_container_t *container, uint16_t x) { + if (run_container_contains(container, x)) { + int sum = 0; + uint32_t 
x32 = x; + for (int i = 0; i < container->n_runs; i++) { + uint32_t startpoint = container->runs[i].value; + uint32_t length = container->runs[i].length; + uint32_t endpoint = length + startpoint; + if (x <= endpoint) { + if (x < startpoint) break; + return sum + (x32 - startpoint); + } else { + sum += length + 1; + } + } + return sum - 1; + } else { + return -1; + } +} + +#if defined(CROARING_IS_X64) && CROARING_COMPILER_SUPPORTS_AVX512 + +CROARING_TARGET_AVX512 +ALLOW_UNALIGNED +/* Get the cardinality of `run'. Requires an actual computation. */ +static inline int _avx512_run_container_cardinality(const run_container_t *run) { + const int32_t n_runs = run->n_runs; + const rle16_t *runs = run->runs; + + /* by initializing with n_runs, we omit counting the +1 for each pair. */ + int sum = n_runs; + int32_t k = 0; + const int32_t step = sizeof(__m512i) / sizeof(rle16_t); + if (n_runs > step) { + __m512i total = _mm512_setzero_si512(); + for (; k + step <= n_runs; k += step) { + __m512i ymm1 = _mm512_loadu_si512((const __m512i *)(runs + k)); + __m512i justlengths = _mm512_srli_epi32(ymm1, 16); + total = _mm512_add_epi32(total, justlengths); + } + + __m256i lo = _mm512_extracti32x8_epi32(total, 0); + __m256i hi = _mm512_extracti32x8_epi32(total, 1); + + // a store might be faster than extract? + uint32_t buffer[sizeof(__m256i) / sizeof(rle16_t)]; + _mm256_storeu_si256((__m256i *)buffer, lo); + sum += (buffer[0] + buffer[1]) + (buffer[2] + buffer[3]) + + (buffer[4] + buffer[5]) + (buffer[6] + buffer[7]); + + _mm256_storeu_si256((__m256i *)buffer, hi); + sum += (buffer[0] + buffer[1]) + (buffer[2] + buffer[3]) + + (buffer[4] + buffer[5]) + (buffer[6] + buffer[7]); + + } + for (; k < n_runs; ++k) { + sum += runs[k].length; + } + + return sum; +} + +CROARING_UNTARGET_AVX512 + +CROARING_TARGET_AVX2 +ALLOW_UNALIGNED +/* Get the cardinality of `run'. Requires an actual computation. 
*/ +static inline int _avx2_run_container_cardinality(const run_container_t *run) { + const int32_t n_runs = run->n_runs; + const rle16_t *runs = run->runs; + + /* by initializing with n_runs, we omit counting the +1 for each pair. */ + int sum = n_runs; + int32_t k = 0; + const int32_t step = sizeof(__m256i) / sizeof(rle16_t); + if (n_runs > step) { + __m256i total = _mm256_setzero_si256(); + for (; k + step <= n_runs; k += step) { + __m256i ymm1 = _mm256_lddqu_si256((const __m256i *)(runs + k)); + __m256i justlengths = _mm256_srli_epi32(ymm1, 16); + total = _mm256_add_epi32(total, justlengths); + } + // a store might be faster than extract? + uint32_t buffer[sizeof(__m256i) / sizeof(rle16_t)]; + _mm256_storeu_si256((__m256i *)buffer, total); + sum += (buffer[0] + buffer[1]) + (buffer[2] + buffer[3]) + + (buffer[4] + buffer[5]) + (buffer[6] + buffer[7]); + } + for (; k < n_runs; ++k) { + sum += runs[k].length; + } + + return sum; +} + +CROARING_UNTARGET_AVX2 + +/* Get the cardinality of `run'. Requires an actual computation. */ +static inline int _scalar_run_container_cardinality(const run_container_t *run) { + const int32_t n_runs = run->n_runs; + const rle16_t *runs = run->runs; + + /* by initializing with n_runs, we omit counting the +1 for each pair. */ + int sum = n_runs; + for (int k = 0; k < n_runs; ++k) { + sum += runs[k].length; + } + + return sum; +} + +int run_container_cardinality(const run_container_t *run) { +#if CROARING_COMPILER_SUPPORTS_AVX512 + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX512 ) { + return _avx512_run_container_cardinality(run); + } + else +#endif + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { + return _avx2_run_container_cardinality(run); + } else { + return _scalar_run_container_cardinality(run); + } +} +#else + +/* Get the cardinality of `run'. Requires an actual computation. 
*/ +int run_container_cardinality(const run_container_t *run) { + const int32_t n_runs = run->n_runs; + const rle16_t *runs = run->runs; + + /* by initializing with n_runs, we omit counting the +1 for each pair. */ + int sum = n_runs; + for (int k = 0; k < n_runs; ++k) { + sum += runs[k].length; + } + + return sum; +} +#endif + + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif +/* end file src/containers/run.c */ +/* begin file src/isadetection.c */ + +/* From +https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h +Highly modified. + +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, +Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute +(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, +Samy Bengio, Johnny Mariethoz) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. 
Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories +America and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include + +// We need portability.h to be included first, see +// https://github.com/RoaringBitmap/CRoaring/issues/394 +#if CROARING_REGULAR_VISUAL_STUDIO +#include +#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) +#include +#endif // CROARING_REGULAR_VISUAL_STUDIO + +#if CROARING_IS_X64 +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." 
+#endif // CROARING_COMPILER_SUPPORTS_AVX512 +#endif + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif +enum croaring_instruction_set { + CROARING_DEFAULT = 0x0, + CROARING_NEON = 0x1, + CROARING_AVX2 = 0x4, + CROARING_SSE42 = 0x8, + CROARING_PCLMULQDQ = 0x10, + CROARING_BMI1 = 0x20, + CROARING_BMI2 = 0x40, + CROARING_ALTIVEC = 0x80, + CROARING_AVX512F = 0x100, + CROARING_AVX512DQ = 0x200, + CROARING_AVX512BW = 0x400, + CROARING_AVX512VBMI2 = 0x800, + CROARING_AVX512BITALG = 0x1000, + CROARING_AVX512VPOPCNTDQ = 0x2000, + CROARING_UNINITIALIZED = 0x8000 +}; + +#if CROARING_COMPILER_SUPPORTS_AVX512 +unsigned int CROARING_AVX512_REQUIRED = (CROARING_AVX512F | CROARING_AVX512DQ | CROARING_AVX512BW | CROARING_AVX512VBMI2 | CROARING_AVX512BITALG | CROARING_AVX512VPOPCNTDQ); +#endif + +#if defined(__x86_64__) || defined(_M_AMD64) // x64 + + +static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, + uint32_t *edx) { +#if CROARING_REGULAR_VISUAL_STUDIO + int cpu_info[4]; + __cpuidex(cpu_info, *eax, *ecx); + *eax = cpu_info[0]; + *ebx = cpu_info[1]; + *ecx = cpu_info[2]; + *edx = cpu_info[3]; +#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) + uint32_t level = *eax; + __get_cpuid(level, eax, ebx, ecx, edx); +#else + uint32_t a = *eax, b, c = *ecx, d; + __asm__("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d)); + *eax = a; + *ebx = b; + *ecx = c; + *edx = d; +#endif +} + + +static inline uint64_t xgetbv(void) { +#if defined(_MSC_VER) + return _xgetbv(0); +#else + uint32_t xcr0_lo, xcr0_hi; + __asm__("xgetbv\n\t" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0)); + return xcr0_lo | ((uint64_t)xcr0_hi << 32); +#endif +} + +/** + * This is a relatively expensive function but it will get called at most + * *once* per compilation units. Normally, the CRoaring library is built + * as one compilation unit. 
+ */ +static inline uint32_t dynamic_croaring_detect_supported_architectures(void) { + uint32_t eax, ebx, ecx, edx; + uint32_t host_isa = 0x0; + // Can be found on Intel ISA Reference for CPUID + static uint32_t cpuid_avx2_bit = 1 << 5; ///< @private Bit 5 of EBX for EAX=0x7 + static uint32_t cpuid_bmi1_bit = 1 << 3; ///< @private bit 3 of EBX for EAX=0x7 + static uint32_t cpuid_bmi2_bit = 1 << 8; ///< @private bit 8 of EBX for EAX=0x7 + static uint32_t cpuid_avx512f_bit = 1 << 16; ///< @private bit 16 of EBX for EAX=0x7 + static uint32_t cpuid_avx512dq_bit = 1 << 17; ///< @private bit 17 of EBX for EAX=0x7 + static uint32_t cpuid_avx512bw_bit = 1 << 30; ///< @private bit 30 of EBX for EAX=0x7 + static uint32_t cpuid_avx512vbmi2_bit = 1 << 6; ///< @private bit 6 of ECX for EAX=0x7 + static uint32_t cpuid_avx512bitalg_bit = 1 << 12; ///< @private bit 12 of ECX for EAX=0x7 + static uint32_t cpuid_avx512vpopcntdq_bit = 1 << 14; ///< @private bit 14 of ECX for EAX=0x7 + static uint64_t cpuid_avx256_saved = 1 << 2; ///< @private bit 2 = AVX + static uint64_t cpuid_avx512_saved = 7 << 5; ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM + static uint32_t cpuid_sse42_bit = 1 << 20; ///< @private bit 20 of ECX for EAX=0x1 + static uint32_t cpuid_osxsave = (1 << 26) | (1 << 27); ///< @private bits 26+27 of ECX for EAX=0x1 + static uint32_t cpuid_pclmulqdq_bit = 1 << 1; ///< @private bit 1 of ECX for EAX=0x1 + + + // EBX for EAX=0x1 + eax = 0x1; + ecx = 0x0; + cpuid(&eax, &ebx, &ecx, &edx); + + if (ecx & cpuid_sse42_bit) { + host_isa |= CROARING_SSE42; + } else { + return host_isa; // everything after is redundant + } + + if (ecx & cpuid_pclmulqdq_bit) { + host_isa |= CROARING_PCLMULQDQ; + } + + if ((ecx & cpuid_osxsave) != cpuid_osxsave) { + return host_isa; + } + + // xgetbv for checking if the OS saves registers + uint64_t xcr0 = xgetbv(); + + if ((xcr0 & cpuid_avx256_saved) == 0) { + return host_isa; + } + + // ECX for EAX=0x7 + eax = 0x7; + ecx = 0x0; + cpuid(&eax, 
&ebx, &ecx, &edx); + if (ebx & cpuid_avx2_bit) { + host_isa |= CROARING_AVX2; + } + if (ebx & cpuid_bmi1_bit) { + host_isa |= CROARING_BMI1; + } + + if (ebx & cpuid_bmi2_bit) { + host_isa |= CROARING_BMI2; + } + + if (!((xcr0 & cpuid_avx512_saved) == cpuid_avx512_saved)) { + return host_isa; + } + + if (ebx & cpuid_avx512f_bit) { + host_isa |= CROARING_AVX512F; + } + + if (ebx & cpuid_avx512bw_bit) { + host_isa |= CROARING_AVX512BW; + } + + if (ebx & cpuid_avx512dq_bit) { + host_isa |= CROARING_AVX512DQ; + } + + if (ecx & cpuid_avx512vbmi2_bit) { + host_isa |= CROARING_AVX512VBMI2; + } + + if (ecx & cpuid_avx512bitalg_bit) { + host_isa |= CROARING_AVX512BITALG; + } + + if (ecx & cpuid_avx512vpopcntdq_bit) { + host_isa |= CROARING_AVX512VPOPCNTDQ; + } + + return host_isa; +} + +#endif // end SIMD extension detection code + + +#if defined(__x86_64__) || defined(_M_AMD64) // x64 + +#if CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_CPP +static inline uint32_t croaring_detect_supported_architectures(void) { + // thread-safe as per the C++11 standard. + static uint32_t buffer = dynamic_croaring_detect_supported_architectures(); + return buffer; +} +#elif CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_C +static uint32_t croaring_detect_supported_architectures(void) { + // we use an atomic for thread safety + static _Atomic uint32_t buffer = CROARING_UNINITIALIZED; + if (buffer == CROARING_UNINITIALIZED) { + // atomicity is sufficient + buffer = dynamic_croaring_detect_supported_architectures(); + } + return buffer; +} +#else +// If we do not have atomics, we do the best we can. 
+static inline uint32_t croaring_detect_supported_architectures(void) { + static uint32_t buffer = CROARING_UNINITIALIZED; + if (buffer == CROARING_UNINITIALIZED) { + buffer = dynamic_croaring_detect_supported_architectures(); + } + return buffer; +} +#endif // CROARING_C_ATOMIC + +#ifdef ROARING_DISABLE_AVX + +int croaring_hardware_support(void) { + return 0; +} + +#elif defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VBMI2__) && defined(__AVX512BITALG__) && defined(__AVX512VPOPCNTDQ__) +int croaring_hardware_support(void) { + return ROARING_SUPPORTS_AVX2 | ROARING_SUPPORTS_AVX512; +} +#elif defined(__AVX2__) + +int croaring_hardware_support(void) { + static int support = 0xFFFFFFF; + if(support == 0xFFFFFFF) { + bool avx512_support = false; +#if CROARING_COMPILER_SUPPORTS_AVX512 + avx512_support = ( (croaring_detect_supported_architectures() & CROARING_AVX512_REQUIRED) + == CROARING_AVX512_REQUIRED); +#endif + support = ROARING_SUPPORTS_AVX2 | (avx512_support ? ROARING_SUPPORTS_AVX512 : 0); + } + return support; +} +#else + +int croaring_hardware_support(void) { + static int support = 0xFFFFFFF; + if(support == 0xFFFFFFF) { + bool has_avx2 = (croaring_detect_supported_architectures() & CROARING_AVX2) == CROARING_AVX2; + bool has_avx512 = false; +#if CROARING_COMPILER_SUPPORTS_AVX512 + has_avx512 = (croaring_detect_supported_architectures() & CROARING_AVX512_REQUIRED) == CROARING_AVX512_REQUIRED; +#endif // CROARING_COMPILER_SUPPORTS_AVX512 + support = (has_avx2 ? ROARING_SUPPORTS_AVX2 : 0) | (has_avx512 ? 
ROARING_SUPPORTS_AVX512 : 0); + } + return support; +} +#endif + +#endif // defined(__x86_64__) || defined(_M_AMD64) // x64 +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif +/* end file src/isadetection.c */ +/* begin file src/memory.c */ +#include + +// without the following, we get lots of warnings about posix_memalign +#ifndef __cplusplus +extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size); +#endif //__cplusplus // C++ does not have a well defined signature + +// portable version of posix_memalign +static void *roaring_bitmap_aligned_malloc(size_t alignment, size_t size) { + void *p; +#ifdef _MSC_VER + p = _aligned_malloc(size, alignment); +#elif defined(__MINGW32__) || defined(__MINGW64__) + p = __mingw_aligned_malloc(size, alignment); +#else + // somehow, if this is used before including "x86intrin.h", it creates an + // implicit defined warning. + if (posix_memalign(&p, alignment, size) != 0) return NULL; +#endif + return p; +} + +static void roaring_bitmap_aligned_free(void *memblock) { +#ifdef _MSC_VER + _aligned_free(memblock); +#elif defined(__MINGW32__) || defined(__MINGW64__) + __mingw_aligned_free(memblock); +#else + free(memblock); +#endif +} + +static roaring_memory_t global_memory_hook = { + .malloc = malloc, + .realloc = realloc, + .calloc = calloc, + .free = free, + .aligned_malloc = roaring_bitmap_aligned_malloc, + .aligned_free = roaring_bitmap_aligned_free, +}; + +void roaring_init_memory_hook(roaring_memory_t memory_hook) { + global_memory_hook = memory_hook; +} + +void* roaring_malloc(size_t n) { + return global_memory_hook.malloc(n); +} + +void* roaring_realloc(void* p, size_t new_sz) { + return global_memory_hook.realloc(p, new_sz); +} + +void* roaring_calloc(size_t n_elements, size_t element_size) { + return global_memory_hook.calloc(n_elements, element_size); +} + +void roaring_free(void* p) { + global_memory_hook.free(p); +} + +void* roaring_aligned_malloc(size_t 
alignment, size_t size) { + return global_memory_hook.aligned_malloc(alignment, size); +} + +void roaring_aligned_free(void* p) { + global_memory_hook.aligned_free(p); +} +/* end file src/memory.c */ +/* begin file src/roaring.c */ +#include +#include +#include +#include +#include +#include + + + +#ifdef __cplusplus +using namespace ::roaring::internal; + +extern "C" { namespace roaring { namespace api { +#endif + +#define CROARING_SERIALIZATION_ARRAY_UINT32 1 +#define CROARING_SERIALIZATION_CONTAINER 2 + +extern inline bool roaring_bitmap_get_copy_on_write(const roaring_bitmap_t* r); +extern inline void roaring_bitmap_set_copy_on_write(roaring_bitmap_t* r, bool cow); + +static inline bool is_cow(const roaring_bitmap_t *r) { + return r->high_low_container.flags & ROARING_FLAG_COW; +} +static inline bool is_frozen(const roaring_bitmap_t *r) { + return r->high_low_container.flags & ROARING_FLAG_FROZEN; +} + +// this is like roaring_bitmap_add, but it populates pointer arguments in such a +// way +// that we can recover the container touched, which, in turn can be used to +// accelerate some functions (when you repeatedly need to add to the same +// container) +static inline container_t *containerptr_roaring_bitmap_add( + roaring_bitmap_t *r, uint32_t val, + uint8_t *type, int *index +){ + roaring_array_t *ra = &r->high_low_container; + + uint16_t hb = val >> 16; + const int i = ra_get_index(ra, hb); + if (i >= 0) { + ra_unshare_container_at_index(ra, i); + container_t *c = ra_get_container_at_index(ra, i, type); + uint8_t new_type = *type; + container_t *c2 = container_add(c, val & 0xFFFF, *type, &new_type); + *index = i; + if (c2 != c) { + container_free(c, *type); + ra_set_container_at_index(ra, i, c2, new_type); + *type = new_type; + return c2; + } else { + return c; + } + } else { + array_container_t *new_ac = array_container_create(); + container_t *c = container_add(new_ac, val & 0xFFFF, + ARRAY_CONTAINER_TYPE, type); + // we could just assume that it stays an 
array container + ra_insert_new_key_value_at(ra, -i - 1, hb, c, *type); + *index = -i - 1; + return c; + } +} + +roaring_bitmap_t *roaring_bitmap_create_with_capacity(uint32_t cap) { + roaring_bitmap_t *ans = + (roaring_bitmap_t *)roaring_malloc(sizeof(roaring_bitmap_t)); + if (!ans) { + return NULL; + } + bool is_ok = ra_init_with_capacity(&ans->high_low_container, cap); + if (!is_ok) { + roaring_free(ans); + return NULL; + } + return ans; +} + +bool roaring_bitmap_init_with_capacity(roaring_bitmap_t *r, uint32_t cap) { + return ra_init_with_capacity(&r->high_low_container, cap); +} + +static inline void add_bulk_impl(roaring_bitmap_t *r, + roaring_bulk_context_t *context, + uint32_t val) { + uint16_t key = val >> 16; + if (context->container == NULL || context->key != key) { + uint8_t typecode; + int idx; + context->container = containerptr_roaring_bitmap_add( + r, val, &typecode, &idx); + context->typecode = typecode; + context->idx = idx; + context->key = key; + } else { + // no need to seek the container, it is at hand + // because we already have the container at hand, we can do the + // insertion directly, bypassing the roaring_bitmap_add call + uint8_t new_typecode; + container_t *container2 = container_add( + context->container, val & 0xFFFF, context->typecode, &new_typecode); + if (container2 != context->container) { + // rare instance when we need to change the container type + container_free(context->container, context->typecode); + ra_set_container_at_index(&r->high_low_container, context->idx, + container2, new_typecode); + context->typecode = new_typecode; + context->container = container2; + } + } +} + +void roaring_bitmap_add_many(roaring_bitmap_t *r, size_t n_args, + const uint32_t *vals) { + uint32_t val; + const uint32_t *start = vals; + const uint32_t *end = vals + n_args; + const uint32_t *current_val = start; + + if (n_args == 0) { + return; + } + + uint8_t typecode; + int idx; + container_t *container; + val = *current_val; + container = 
containerptr_roaring_bitmap_add(r, val, &typecode, &idx); + roaring_bulk_context_t context = {container, idx, (uint16_t)(val >> 16), typecode}; + + for (; current_val != end; current_val++) { + memcpy(&val, current_val, sizeof(val)); + add_bulk_impl(r, &context, val); + } +} + +void roaring_bitmap_add_bulk(roaring_bitmap_t *r, + roaring_bulk_context_t *context, uint32_t val) { + add_bulk_impl(r, context, val); +} + +bool roaring_bitmap_contains_bulk(const roaring_bitmap_t *r, + roaring_bulk_context_t *context, + uint32_t val) +{ + uint16_t key = val >> 16; + if (context->container == NULL || context->key != key) { + int32_t start_idx = -1; + if (context->container != NULL && context->key < key) { + start_idx = context->idx; + } + int idx = ra_advance_until(&r->high_low_container, key, start_idx); + if (idx == ra_get_size(&r->high_low_container)) { + return false; + } + uint8_t typecode; + context->container = ra_get_container_at_index(&r->high_low_container, idx, &typecode); + context->typecode = typecode; + context->idx = idx; + context->key = ra_get_key_at_index(&r->high_low_container, idx); + // ra_advance_until finds the next key >= the target, we found a later container. + if (context->key != key) { + return false; + } + } + // context is now set up + return container_contains(context->container, val & 0xFFFF, context->typecode); +} + +roaring_bitmap_t *roaring_bitmap_of_ptr(size_t n_args, const uint32_t *vals) { + roaring_bitmap_t *answer = roaring_bitmap_create(); + roaring_bitmap_add_many(answer, n_args, vals); + return answer; +} + +roaring_bitmap_t *roaring_bitmap_of(size_t n_args, ...) 
{ + // todo: could be greatly optimized but we do not expect this call to ever + // include long lists + roaring_bitmap_t *answer = roaring_bitmap_create(); + roaring_bulk_context_t context = {0}; + va_list ap; + va_start(ap, n_args); + for (size_t i = 0; i < n_args; i++) { + uint32_t val = va_arg(ap, uint32_t); + roaring_bitmap_add_bulk(answer, &context, val); + } + va_end(ap); + return answer; +} + +static inline uint32_t minimum_uint32(uint32_t a, uint32_t b) { + return (a < b) ? a : b; +} + +static inline uint64_t minimum_uint64(uint64_t a, uint64_t b) { + return (a < b) ? a : b; +} + +roaring_bitmap_t *roaring_bitmap_from_range(uint64_t min, uint64_t max, + uint32_t step) { + if(max >= UINT64_C(0x100000000)) { + max = UINT64_C(0x100000000); + } + if (step == 0) return NULL; + if (max <= min) return NULL; + roaring_bitmap_t *answer = roaring_bitmap_create(); + if (step >= (1 << 16)) { + for (uint32_t value = (uint32_t)min; value < max; value += step) { + roaring_bitmap_add(answer, value); + } + return answer; + } + uint64_t min_tmp = min; + do { + uint32_t key = (uint32_t)min_tmp >> 16; + uint32_t container_min = min_tmp & 0xFFFF; + uint32_t container_max = (uint32_t)minimum_uint64(max - (key << 16), 1 << 16); + uint8_t type; + container_t *container = container_from_range(&type, container_min, + container_max, (uint16_t)step); + ra_append(&answer->high_low_container, key, container, type); + uint32_t gap = container_max - container_min + step - 1; + min_tmp += gap - (gap % step); + } while (min_tmp < max); + // cardinality of bitmap will be ((uint64_t) max - min + step - 1 ) / step + return answer; +} + +void roaring_bitmap_add_range_closed(roaring_bitmap_t *r, uint32_t min, uint32_t max) { + if (min > max) { + return; + } + + roaring_array_t *ra = &r->high_low_container; + + uint32_t min_key = min >> 16; + uint32_t max_key = max >> 16; + + int32_t num_required_containers = max_key - min_key + 1; + int32_t suffix_length = count_greater(ra->keys, ra->size, 
max_key); + int32_t prefix_length = count_less(ra->keys, ra->size - suffix_length, + min_key); + int32_t common_length = ra->size - prefix_length - suffix_length; + + if (num_required_containers > common_length) { + ra_shift_tail(ra, suffix_length, + num_required_containers - common_length); + } + + int32_t src = prefix_length + common_length - 1; + int32_t dst = ra->size - suffix_length - 1; + for (uint32_t key = max_key; key != min_key-1; key--) { // beware of min_key==0 + uint32_t container_min = (min_key == key) ? (min & 0xffff) : 0; + uint32_t container_max = (max_key == key) ? (max & 0xffff) : 0xffff; + container_t* new_container; + uint8_t new_type; + + if (src >= 0 && ra->keys[src] == key) { + ra_unshare_container_at_index(ra, src); + new_container = container_add_range(ra->containers[src], + ra->typecodes[src], + container_min, container_max, + &new_type); + if (new_container != ra->containers[src]) { + container_free(ra->containers[src], + ra->typecodes[src]); + } + src--; + } else { + new_container = container_from_range(&new_type, container_min, + container_max+1, 1); + } + ra_replace_key_and_container_at_index(ra, dst, key, new_container, + new_type); + dst--; + } +} + +void roaring_bitmap_remove_range_closed(roaring_bitmap_t *r, uint32_t min, uint32_t max) { + if (min > max) { + return; + } + + roaring_array_t *ra = &r->high_low_container; + + uint32_t min_key = min >> 16; + uint32_t max_key = max >> 16; + + int32_t src = count_less(ra->keys, ra->size, min_key); + int32_t dst = src; + while (src < ra->size && ra->keys[src] <= max_key) { + uint32_t container_min = (min_key == ra->keys[src]) ? (min & 0xffff) : 0; + uint32_t container_max = (max_key == ra->keys[src]) ? 
(max & 0xffff) : 0xffff; + ra_unshare_container_at_index(ra, src); + container_t *new_container; + uint8_t new_type; + new_container = container_remove_range(ra->containers[src], + ra->typecodes[src], + container_min, container_max, + &new_type); + if (new_container != ra->containers[src]) { + container_free(ra->containers[src], + ra->typecodes[src]); + } + if (new_container) { + ra_replace_key_and_container_at_index(ra, dst, ra->keys[src], + new_container, new_type); + dst++; + } + src++; + } + if (src > dst) { + ra_shift_tail(ra, ra->size - src, dst - src); + } +} + +extern inline void roaring_bitmap_add_range(roaring_bitmap_t *r, uint64_t min, uint64_t max); +extern inline void roaring_bitmap_remove_range(roaring_bitmap_t *r, uint64_t min, uint64_t max); + +void roaring_bitmap_printf(const roaring_bitmap_t *r) { + const roaring_array_t *ra = &r->high_low_container; + + printf("{"); + for (int i = 0; i < ra->size; ++i) { + container_printf_as_uint32_array(ra->containers[i], ra->typecodes[i], + ((uint32_t)ra->keys[i]) << 16); + + if (i + 1 < ra->size) { + printf(","); + } + } + printf("}"); +} + +void roaring_bitmap_printf_describe(const roaring_bitmap_t *r) { + const roaring_array_t *ra = &r->high_low_container; + + printf("{"); + for (int i = 0; i < ra->size; ++i) { + printf("%d: %s (%d)", ra->keys[i], + get_full_container_name(ra->containers[i], ra->typecodes[i]), + container_get_cardinality(ra->containers[i], ra->typecodes[i])); + if (ra->typecodes[i] == SHARED_CONTAINER_TYPE) { + printf("(shared count = %" PRIu32 " )", + croaring_refcount_get( + &(CAST_shared(ra->containers[i])->counter))); + } + + if (i + 1 < ra->size) { + printf(", "); + } + } + printf("}"); +} + +typedef struct min_max_sum_s { + uint32_t min; + uint32_t max; + uint64_t sum; +} min_max_sum_t; + +static bool min_max_sum_fnc(uint32_t value, void *param) { + min_max_sum_t *mms = (min_max_sum_t *)param; + if (value > mms->max) mms->max = value; + if (value < mms->min) mms->min = value; + 
mms->sum += value; + return true; // we always process all data points +} + +/** +* (For advanced users.) +* Collect statistics about the bitmap +*/ +void roaring_bitmap_statistics(const roaring_bitmap_t *r, + roaring_statistics_t *stat) { + const roaring_array_t *ra = &r->high_low_container; + + memset(stat, 0, sizeof(*stat)); + stat->n_containers = ra->size; + stat->cardinality = roaring_bitmap_get_cardinality(r); + min_max_sum_t mms; + mms.min = UINT32_C(0xFFFFFFFF); + mms.max = UINT32_C(0); + mms.sum = 0; + roaring_iterate(r, &min_max_sum_fnc, &mms); + stat->min_value = mms.min; + stat->max_value = mms.max; + stat->sum_value = mms.sum; + + for (int i = 0; i < ra->size; ++i) { + uint8_t truetype = + get_container_type(ra->containers[i], ra->typecodes[i]); + uint32_t card = + container_get_cardinality(ra->containers[i], ra->typecodes[i]); + uint32_t sbytes = + container_size_in_bytes(ra->containers[i], ra->typecodes[i]); + switch (truetype) { + case BITSET_CONTAINER_TYPE: + stat->n_bitset_containers++; + stat->n_values_bitset_containers += card; + stat->n_bytes_bitset_containers += sbytes; + break; + case ARRAY_CONTAINER_TYPE: + stat->n_array_containers++; + stat->n_values_array_containers += card; + stat->n_bytes_array_containers += sbytes; + break; + case RUN_CONTAINER_TYPE: + stat->n_run_containers++; + stat->n_values_run_containers += card; + stat->n_bytes_run_containers += sbytes; + break; + default: + assert(false); + roaring_unreachable; + } + } +} + +roaring_bitmap_t *roaring_bitmap_copy(const roaring_bitmap_t *r) { + roaring_bitmap_t *ans = + (roaring_bitmap_t *)roaring_malloc(sizeof(roaring_bitmap_t)); + if (!ans) { + return NULL; + } + if (!ra_init_with_capacity( // allocation of list of containers can fail + &ans->high_low_container, r->high_low_container.size) + ){ + roaring_free(ans); + return NULL; + } + if (!ra_overwrite( // memory allocation of individual containers may fail + &r->high_low_container, &ans->high_low_container, is_cow(r)) + ){ + 
roaring_bitmap_free(ans); // overwrite should leave in freeable state + return NULL; + } + roaring_bitmap_set_copy_on_write(ans, is_cow(r)); + return ans; +} + +bool roaring_bitmap_overwrite(roaring_bitmap_t *dest, + const roaring_bitmap_t *src) { + roaring_bitmap_set_copy_on_write(dest, is_cow(src)); + return ra_overwrite(&src->high_low_container, &dest->high_low_container, + is_cow(src)); +} + +void roaring_bitmap_free(const roaring_bitmap_t *r) { + if(r == NULL) { return; } + if (!is_frozen(r)) { + ra_clear((roaring_array_t*)&r->high_low_container); + } + roaring_free((roaring_bitmap_t*)r); +} + +void roaring_bitmap_clear(roaring_bitmap_t *r) { + ra_reset(&r->high_low_container); +} + +void roaring_bitmap_add(roaring_bitmap_t *r, uint32_t val) { + roaring_array_t *ra = &r->high_low_container; + + const uint16_t hb = val >> 16; + const int i = ra_get_index(ra, hb); + uint8_t typecode; + if (i >= 0) { + ra_unshare_container_at_index(ra, i); + container_t *container = + ra_get_container_at_index(ra, i, &typecode); + uint8_t newtypecode = typecode; + container_t *container2 = + container_add(container, val & 0xFFFF, typecode, &newtypecode); + if (container2 != container) { + container_free(container, typecode); + ra_set_container_at_index(&r->high_low_container, i, container2, + newtypecode); + } + } else { + array_container_t *newac = array_container_create(); + container_t *container = container_add(newac, val & 0xFFFF, + ARRAY_CONTAINER_TYPE, &typecode); + // we could just assume that it stays an array container + ra_insert_new_key_value_at(&r->high_low_container, -i - 1, hb, + container, typecode); + } +} + +bool roaring_bitmap_add_checked(roaring_bitmap_t *r, uint32_t val) { + const uint16_t hb = val >> 16; + const int i = ra_get_index(&r->high_low_container, hb); + uint8_t typecode; + bool result = false; + if (i >= 0) { + ra_unshare_container_at_index(&r->high_low_container, i); + container_t *container = + ra_get_container_at_index(&r->high_low_container, i, 
&typecode); + + const int oldCardinality = + container_get_cardinality(container, typecode); + + uint8_t newtypecode = typecode; + container_t *container2 = + container_add(container, val & 0xFFFF, typecode, &newtypecode); + if (container2 != container) { + container_free(container, typecode); + ra_set_container_at_index(&r->high_low_container, i, container2, + newtypecode); + result = true; + } else { + const int newCardinality = + container_get_cardinality(container, newtypecode); + + result = oldCardinality != newCardinality; + } + } else { + array_container_t *newac = array_container_create(); + container_t *container = container_add(newac, val & 0xFFFF, + ARRAY_CONTAINER_TYPE, &typecode); + // we could just assume that it stays an array container + ra_insert_new_key_value_at(&r->high_low_container, -i - 1, hb, + container, typecode); + result = true; + } + + return result; +} + +void roaring_bitmap_remove(roaring_bitmap_t *r, uint32_t val) { + const uint16_t hb = val >> 16; + const int i = ra_get_index(&r->high_low_container, hb); + uint8_t typecode; + if (i >= 0) { + ra_unshare_container_at_index(&r->high_low_container, i); + container_t *container = + ra_get_container_at_index(&r->high_low_container, i, &typecode); + uint8_t newtypecode = typecode; + container_t *container2 = + container_remove(container, val & 0xFFFF, typecode, &newtypecode); + if (container2 != container) { + container_free(container, typecode); + ra_set_container_at_index(&r->high_low_container, i, container2, + newtypecode); + } + if (container_get_cardinality(container2, newtypecode) != 0) { + ra_set_container_at_index(&r->high_low_container, i, container2, + newtypecode); + } else { + ra_remove_at_index_and_free(&r->high_low_container, i); + } + } +} + +bool roaring_bitmap_remove_checked(roaring_bitmap_t *r, uint32_t val) { + const uint16_t hb = val >> 16; + const int i = ra_get_index(&r->high_low_container, hb); + uint8_t typecode; + bool result = false; + if (i >= 0) { + 
ra_unshare_container_at_index(&r->high_low_container, i); + container_t *container = + ra_get_container_at_index(&r->high_low_container, i, &typecode); + + const int oldCardinality = + container_get_cardinality(container, typecode); + + uint8_t newtypecode = typecode; + container_t *container2 = + container_remove(container, val & 0xFFFF, typecode, &newtypecode); + if (container2 != container) { + container_free(container, typecode); + ra_set_container_at_index(&r->high_low_container, i, container2, + newtypecode); + } + + const int newCardinality = + container_get_cardinality(container2, newtypecode); + + if (newCardinality != 0) { + ra_set_container_at_index(&r->high_low_container, i, container2, + newtypecode); + } else { + ra_remove_at_index_and_free(&r->high_low_container, i); + } + + result = oldCardinality != newCardinality; + } + return result; +} + +void roaring_bitmap_remove_many(roaring_bitmap_t *r, size_t n_args, + const uint32_t *vals) { + if (n_args == 0 || r->high_low_container.size == 0) { + return; + } + int32_t pos = -1; // position of the container used in the previous iteration + for (size_t i = 0; i < n_args; i++) { + uint16_t key = (uint16_t)(vals[i] >> 16); + if (pos < 0 || key != r->high_low_container.keys[pos]) { + pos = ra_get_index(&r->high_low_container, key); + } + if (pos >= 0) { + uint8_t new_typecode; + container_t *new_container; + new_container = container_remove(r->high_low_container.containers[pos], + vals[i] & 0xffff, + r->high_low_container.typecodes[pos], + &new_typecode); + if (new_container != r->high_low_container.containers[pos]) { + container_free(r->high_low_container.containers[pos], + r->high_low_container.typecodes[pos]); + ra_replace_key_and_container_at_index(&r->high_low_container, + pos, key, new_container, + new_typecode); + } + if (!container_nonzero_cardinality(new_container, new_typecode)) { + container_free(new_container, new_typecode); + ra_remove_at_index(&r->high_low_container, pos); + pos = -1; + } + } + 
} +} + +// there should be some SIMD optimizations possible here +roaring_bitmap_t *roaring_bitmap_and(const roaring_bitmap_t *x1, + const roaring_bitmap_t *x2) { + uint8_t result_type = 0; + const int length1 = x1->high_low_container.size, + length2 = x2->high_low_container.size; + uint32_t neededcap = length1 > length2 ? length2 : length1; + roaring_bitmap_t *answer = roaring_bitmap_create_with_capacity(neededcap); + roaring_bitmap_set_copy_on_write(answer, is_cow(x1) || is_cow(x2)); + + int pos1 = 0, pos2 = 0; + + while (pos1 < length1 && pos2 < length2) { + const uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1); + const uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2); + + if (s1 == s2) { + uint8_t type1, type2; + container_t *c1 = ra_get_container_at_index( + &x1->high_low_container, pos1, &type1); + container_t *c2 = ra_get_container_at_index( + &x2->high_low_container, pos2, &type2); + container_t *c = container_and(c1, type1, c2, type2, &result_type); + + if (container_nonzero_cardinality(c, result_type)) { + ra_append(&answer->high_low_container, s1, c, result_type); + } else { + container_free(c, result_type); // otherwise: memory leak! + } + ++pos1; + ++pos2; + } else if (s1 < s2) { // s1 < s2 + pos1 = ra_advance_until(&x1->high_low_container, s2, pos1); + } else { // s1 > s2 + pos2 = ra_advance_until(&x2->high_low_container, s1, pos2); + } + } + return answer; +} + +/** + * Compute the union of 'number' bitmaps. 
+ */ +roaring_bitmap_t *roaring_bitmap_or_many(size_t number, + const roaring_bitmap_t **x) { + if (number == 0) { + return roaring_bitmap_create(); + } + if (number == 1) { + return roaring_bitmap_copy(x[0]); + } + roaring_bitmap_t *answer = + roaring_bitmap_lazy_or(x[0], x[1], LAZY_OR_BITSET_CONVERSION); + for (size_t i = 2; i < number; i++) { + roaring_bitmap_lazy_or_inplace(answer, x[i], LAZY_OR_BITSET_CONVERSION); + } + roaring_bitmap_repair_after_lazy(answer); + return answer; +} + +/** + * Compute the xor of 'number' bitmaps. + */ +roaring_bitmap_t *roaring_bitmap_xor_many(size_t number, + const roaring_bitmap_t **x) { + if (number == 0) { + return roaring_bitmap_create(); + } + if (number == 1) { + return roaring_bitmap_copy(x[0]); + } + roaring_bitmap_t *answer = roaring_bitmap_lazy_xor(x[0], x[1]); + for (size_t i = 2; i < number; i++) { + roaring_bitmap_lazy_xor_inplace(answer, x[i]); + } + roaring_bitmap_repair_after_lazy(answer); + return answer; +} + +// inplace and (modifies its first argument). +void roaring_bitmap_and_inplace(roaring_bitmap_t *x1, + const roaring_bitmap_t *x2) { + if (x1 == x2) return; + int pos1 = 0, pos2 = 0, intersection_size = 0; + const int length1 = ra_get_size(&x1->high_low_container); + const int length2 = ra_get_size(&x2->high_low_container); + + // any skipped-over or newly emptied containers in x1 + // have to be freed. + while (pos1 < length1 && pos2 < length2) { + const uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1); + const uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2); + + if (s1 == s2) { + uint8_t type1, type2, result_type; + container_t *c1 = ra_get_container_at_index( + &x1->high_low_container, pos1, &type1); + container_t *c2 = ra_get_container_at_index( + &x2->high_low_container, pos2, &type2); + + // We do the computation "in place" only when c1 is not a shared container. 
+ // Rationale: using a shared container safely with in place computation would + // require making a copy and then doing the computation in place which is likely + // less efficient than avoiding in place entirely and always generating a new + // container. + container_t *c = + (type1 == SHARED_CONTAINER_TYPE) + ? container_and(c1, type1, c2, type2, &result_type) + : container_iand(c1, type1, c2, type2, &result_type); + + if (c != c1) { // in this instance a new container was created, and + // we need to free the old one + container_free(c1, type1); + } + if (container_nonzero_cardinality(c, result_type)) { + ra_replace_key_and_container_at_index(&x1->high_low_container, + intersection_size, s1, c, + result_type); + intersection_size++; + } else { + container_free(c, result_type); + } + ++pos1; + ++pos2; + } else if (s1 < s2) { + pos1 = ra_advance_until_freeing(&x1->high_low_container, s2, pos1); + } else { // s1 > s2 + pos2 = ra_advance_until(&x2->high_low_container, s1, pos2); + } + } + + // if we ended early because x2 ran out, then all remaining in x1 should be + // freed + while (pos1 < length1) { + container_free(x1->high_low_container.containers[pos1], + x1->high_low_container.typecodes[pos1]); + ++pos1; + } + + // all containers after this have either been copied or freed + ra_downsize(&x1->high_low_container, intersection_size); +} + +roaring_bitmap_t *roaring_bitmap_or(const roaring_bitmap_t *x1, + const roaring_bitmap_t *x2) { + uint8_t result_type = 0; + const int length1 = x1->high_low_container.size, + length2 = x2->high_low_container.size; + if (0 == length1) { + return roaring_bitmap_copy(x2); + } + if (0 == length2) { + return roaring_bitmap_copy(x1); + } + roaring_bitmap_t *answer = + roaring_bitmap_create_with_capacity(length1 + length2); + roaring_bitmap_set_copy_on_write(answer, is_cow(x1) || is_cow(x2)); + int pos1 = 0, pos2 = 0; + uint8_t type1, type2; + uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1); + uint16_t s2 = 
ra_get_key_at_index(&x2->high_low_container, pos2); + while (true) { + if (s1 == s2) { + container_t *c1 = ra_get_container_at_index( + &x1->high_low_container, pos1, &type1); + container_t *c2 = ra_get_container_at_index( + &x2->high_low_container, pos2, &type2); + container_t *c = container_or(c1, type1, c2, type2, &result_type); + + // since we assume that the initial containers are non-empty, the + // result here + // can only be non-empty + ra_append(&answer->high_low_container, s1, c, result_type); + ++pos1; + ++pos2; + if (pos1 == length1) break; + if (pos2 == length2) break; + s1 = ra_get_key_at_index(&x1->high_low_container, pos1); + s2 = ra_get_key_at_index(&x2->high_low_container, pos2); + + } else if (s1 < s2) { // s1 < s2 + container_t *c1 = ra_get_container_at_index( + &x1->high_low_container, pos1, &type1); + // c1 = container_clone(c1, type1); + c1 = get_copy_of_container(c1, &type1, is_cow(x1)); + if (is_cow(x1)) { + ra_set_container_at_index(&x1->high_low_container, pos1, c1, + type1); + } + ra_append(&answer->high_low_container, s1, c1, type1); + pos1++; + if (pos1 == length1) break; + s1 = ra_get_key_at_index(&x1->high_low_container, pos1); + + } else { // s1 > s2 + container_t *c2 = ra_get_container_at_index( + &x2->high_low_container, pos2, &type2); + // c2 = container_clone(c2, type2); + c2 = get_copy_of_container(c2, &type2, is_cow(x2)); + if (is_cow(x2)) { + ra_set_container_at_index(&x2->high_low_container, pos2, c2, + type2); + } + ra_append(&answer->high_low_container, s2, c2, type2); + pos2++; + if (pos2 == length2) break; + s2 = ra_get_key_at_index(&x2->high_low_container, pos2); + } + } + if (pos1 == length1) { + ra_append_copy_range(&answer->high_low_container, + &x2->high_low_container, pos2, length2, + is_cow(x2)); + } else if (pos2 == length2) { + ra_append_copy_range(&answer->high_low_container, + &x1->high_low_container, pos1, length1, + is_cow(x1)); + } + return answer; +} + +// inplace or (modifies its first argument). 
+// In-place union: after the call x1 holds the OR of x1 and x2.
+//
+// The two key arrays are walked in parallel with pos1/pos2, comparing the
+// 16-bit keys s1/s2 (the merge relies on keys being stored in increasing
+// order in both arrays):
+//   - equal keys: the two containers are OR-ed and the result stored in x1;
+//   - s1 < s2:    x1's container is already part of the result, skip it;
+//   - s1 > s2:    a copy of x2's container is inserted into x1 at pos1.
+// Whatever is left of x2 once x1 is exhausted is appended at the end.
+// If x2 has no containers nothing is done; if x1 has none, x1 is overwritten
+// with x2 via roaring_bitmap_overwrite().
+void roaring_bitmap_or_inplace(roaring_bitmap_t *x1,
+                               const roaring_bitmap_t *x2) {
+    uint8_t result_type = 0;
+    int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+
+    if (0 == length2) return;
+
+    if (0 == length1) {
+        roaring_bitmap_overwrite(x1, x2);
+        return;
+    }
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+    while (true) {
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(
+                                    &x1->high_low_container, pos1, &type1);
+            // a container that is already full cannot gain anything from an
+            // OR, so it is left as is
+            if (!container_is_full(c1, type1)) {
+                container_t *c2 = ra_get_container_at_index(
+                                        &x2->high_low_container, pos2, &type2);
+                // a shared c1 is not modified in place: container_or() is
+                // used for it, container_ior() for every other type
+                container_t *c =
+                    (type1 == SHARED_CONTAINER_TYPE)
+                        ? container_or(c1, type1, c2, type2, &result_type)
+                        : container_ior(c1, type1, c2, type2, &result_type);
+
+                if (c != c1) {  // in this instance a new container was created,
+                                // and we need to free the old one
+                    container_free(c1, type1);
+                }
+                ra_set_container_at_index(&x1->high_low_container, pos1, c,
+                                          result_type);
+            }
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+
+        } else {  // s1 > s2
+            container_t *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                        pos2, &type2);
+            // get_copy_of_container() may change c2/type2; when x2 is
+            // copy-on-write the returned container is stored back into x2.
+            // NOTE(review): this writes into x2 although it is declared const.
+            c2 = get_copy_of_container(c2, &type2, is_cow(x2));
+            if (is_cow(x2)) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          type2);
+            }
+
+            // container_t *c2_clone = container_clone(c2, type2);
+            ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
+                                       type2);
+            // x1 grew by one entry: step past the inserted container so that
+            // pos1 designates the same s1 entry as before
+            pos1++;
+            length1++;
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+        }
+    }
+    // x1 exhausted first: the rest of x2 (possibly nothing) goes at the end
+    if (pos1 == length1) {
+        ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
+                             pos2, length2, is_cow(x2));
+    }
+}
+
+// Returns a newly created bitmap holding the XOR of x1 and x2; the inputs
+// keep their content (their container slots may be rewritten, see below).
+//
+// If one input has no containers the result is roaring_bitmap_copy() of the
+// other. Otherwise the result is created with room for length1 + length2
+// containers and its copy-on-write flag is set when either input has it.
+// Containers under equal keys are XOR-ed; containers under a key present on
+// one side only are copied into the result.
+roaring_bitmap_t *roaring_bitmap_xor(const roaring_bitmap_t *x1,
+                                     const roaring_bitmap_t *x2) {
+    uint8_t result_type = 0;
+    const int length1 = x1->high_low_container.size,
+              length2 = x2->high_low_container.size;
+    if (0 == length1) {
+        return roaring_bitmap_copy(x2);
+    }
+    if (0 == length2) {
+        return roaring_bitmap_copy(x1);
+    }
+    roaring_bitmap_t *answer =
+        roaring_bitmap_create_with_capacity(length1 + length2);
+    roaring_bitmap_set_copy_on_write(answer, is_cow(x1) || is_cow(x2));
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+    while (true) {
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(
+                                    &x1->high_low_container, pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(
+                                    &x2->high_low_container, pos2, &type2);
+            container_t *c = container_xor(c1, type1, c2, type2, &result_type);
+
+            // only keep the result when it has at least one value; otherwise
+            // the freshly computed container is freed and the key is dropped
+            if (container_nonzero_cardinality(c, result_type)) {
+                ra_append(&answer->high_low_container, s1, c, result_type);
+            } else {
+                container_free(c, result_type);
+            }
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            // key only in x1: a copy of its container goes into the answer.
+            // When x1 is copy-on-write the container returned by
+            // get_copy_of_container() is also stored back into x1.
+            // NOTE(review): this writes into x1 although it is declared const.
+            container_t *c1 = ra_get_container_at_index(
+                                    &x1->high_low_container, pos1, &type1);
+            c1 = get_copy_of_container(c1, &type1, is_cow(x1));
+            if (is_cow(x1)) {
+                ra_set_container_at_index(&x1->high_low_container, pos1, c1,
+                                          type1);
+            }
+            ra_append(&answer->high_low_container, s1, c1, type1);
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+
+        } else {  // s1 > s2
+            // key only in x2: same handling as above with the roles swapped
+            container_t *c2 = ra_get_container_at_index(
+                                    &x2->high_low_container, pos2, &type2);
+            c2 = get_copy_of_container(c2, &type2, is_cow(x2));
+            if (is_cow(x2)) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          type2);
+            }
+            ra_append(&answer->high_low_container, s2, c2, type2);
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+        }
+    }
+    // one side is exhausted: the remaining containers of the other side are
+    // appended to the answer
+    if (pos1 == length1) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x2->high_low_container, pos2, length2,
+                             is_cow(x2));
+    } else if (pos2 == length2) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x1->high_low_container, pos1, length1,
+                             is_cow(x1));
+    }
+    return answer;
+}
+
+// inplace xor (modifies its first argument).
+// x1 and x2 must be distinct objects (asserted). If x2 has no containers
+// nothing is done; if x1 has none, x1 is overwritten with x2.
+
+void roaring_bitmap_xor_inplace(roaring_bitmap_t *x1,
+                                const roaring_bitmap_t *x2) {
+    assert(x1 != x2);
+    uint8_t result_type = 0;
+    int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+
+    if (0 == length2) return;
+
+    if (0 == length1) {
+        roaring_bitmap_overwrite(x1, x2);
+        return;
+    }
+
+    // XOR can have new containers inserted from x2, but can also
+    // lose containers when x1 and x2 are nonempty and identical.
+
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+    while (true) {
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(
+                                    &x1->high_low_container, pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(
+                                    &x2->high_low_container, pos2, &type2);
+
+            // We do the computation "in place" only when c1 is not a shared container.
+            // Rationale: using a shared container safely with in place computation would
+            // require making a copy and then doing the computation in place which is likely
+            // less efficient than avoiding in place entirely and always generating a new
+            // container.
+
+            container_t *c;
+            if (type1 == SHARED_CONTAINER_TYPE) {
+                c = container_xor(c1, type1, c2, type2, &result_type);
+                shared_container_free(CAST_shared(c1));  // so release
+            }
+            else {
+                c = container_ixor(c1, type1, c2, type2, &result_type);
+            }
+
+            if (container_nonzero_cardinality(c, result_type)) {
+                ra_set_container_at_index(&x1->high_low_container, pos1, c,
+                                          result_type);
+                ++pos1;
+            } else {
+                // the result has no value: free it and drop the entry from
+                // x1. pos1 is deliberately not advanced, only length1
+                // shrinks, so pos1 now designates the entry that followed.
+                container_free(c, result_type);
+                ra_remove_at_index(&x1->high_low_container, pos1);
+                --length1;
+            }
+
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            // key only in x1: its container is already the XOR result
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+
+        } else {  // s1 > s2
+            // key only in x2: insert a copy of x2's container into x1.
+            // When x2 is copy-on-write the container returned by
+            // get_copy_of_container() is also stored back into x2.
+            // NOTE(review): this writes into x2 although it is declared const.
+            container_t *c2 = ra_get_container_at_index(
+                                    &x2->high_low_container, pos2, &type2);
+            c2 = get_copy_of_container(c2, &type2, is_cow(x2));
+            if (is_cow(x2)) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          type2);
+            }
+
+            ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
+                                       type2);
+            // x1 grew by one entry: step past the inserted container
+            pos1++;
+            length1++;
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+        }
+    }
+    // x1 exhausted first: the rest of x2 (possibly nothing) goes at the end
+    if (pos1 == length1) {
+        ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
+                             pos2, length2, is_cow(x2));
+    }
+}
+
+// Returns a newly created bitmap holding the difference x1 AND NOT x2.
+//
+// If x1 has no containers the result is a fresh bitmap from
+// roaring_bitmap_create(); if x2 has none it is roaring_bitmap_copy(x1).
+// In every path except the roaring_bitmap_copy() one, the copy-on-write flag
+// of the result is set here when either input has it.
+roaring_bitmap_t *roaring_bitmap_andnot(const roaring_bitmap_t *x1,
+                                        const roaring_bitmap_t *x2) {
+    uint8_t result_type = 0;
+    const int length1 = x1->high_low_container.size,
+              length2 = x2->high_low_container.size;
+    if (0 == length1) {
+        // NOTE(review): the result of roaring_bitmap_create() is used without
+        // a NULL check -- confirm whether it can fail here.
+        roaring_bitmap_t *empty_bitmap = roaring_bitmap_create();
+        roaring_bitmap_set_copy_on_write(empty_bitmap, is_cow(x1) || is_cow(x2));
+        return empty_bitmap;
+    }
+    if (0 == length2) {
+        return roaring_bitmap_copy(x1);
+    }
+    // the result never has more containers than x1
+    roaring_bitmap_t *answer = roaring_bitmap_create_with_capacity(length1);
+    roaring_bitmap_set_copy_on_write(answer, is_cow(x1) || is_cow(x2));
+
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = 0;
+    uint16_t s2 = 0;
+    while (true) {
+        // unlike the other merge loops above, the keys are reloaded at the
+        // top of every iteration because pos1/pos2 may jump by more than one
+        s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+        s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(
+                                    &x1->high_low_container, pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(
+                                    &x2->high_low_container, pos2, &type2);
+            container_t *c = container_andnot(c1, type1, c2, type2,
+                                              &result_type);
+
+            // only keep the result when it has at least one value
+            if (container_nonzero_cardinality(c, result_type)) {
+                ra_append(&answer->high_low_container, s1, c, result_type);
+            } else {
+                container_free(c, result_type);
+            }
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+        } else if (s1 < s2) {  // s1 < s2
+            // copy the x1 containers in [pos1, next_pos1) unchanged into the
+            // answer -- presumably all those whose key is below s2 (depends
+            // on ra_advance_until, not visible here)
+            const int next_pos1 =
+                ra_advance_until(&x1->high_low_container, s2, pos1);
+            ra_append_copy_range(&answer->high_low_container,
+                                 &x1->high_low_container, pos1, next_pos1,
+                                 is_cow(x1));
+            // TODO : perhaps some of the copy_on_write should be based on
+            // answer rather than x1 (more stringent?).  Many similar cases
+            pos1 = next_pos1;
+            if (pos1 == length1) break;
+        } else {  // s1 > s2
+            // x2 keys with no counterpart in x1 do not affect the result
+            pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
+            if (pos2 == length2) break;
+        }
+    }
+    // x2 exhausted: nothing is left to subtract, so the remaining x1
+    // containers (possibly none) are copied into the answer
+    if (pos2 == length2) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x1->high_low_container, pos1, length1,
+                             is_cow(x1));
+    }
+    return answer;
+}
+
+// inplace andnot (modifies its first argument).
+ +void roaring_bitmap_andnot_inplace(roaring_bitmap_t *x1, + const roaring_bitmap_t *x2) { + assert(x1 != x2); + + uint8_t result_type = 0; + int length1 = x1->high_low_container.size; + const int length2 = x2->high_low_container.size; + int intersection_size = 0; + + if (0 == length2) return; + + if (0 == length1) { + roaring_bitmap_clear(x1); + return; + } + + int pos1 = 0, pos2 = 0; + uint8_t type1, type2; + uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1); + uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2); + while (true) { + if (s1 == s2) { + container_t *c1 = ra_get_container_at_index( + &x1->high_low_container, pos1, &type1); + container_t *c2 = ra_get_container_at_index( + &x2->high_low_container, pos2, &type2); + + // We do the computation "in place" only when c1 is not a shared container. + // Rationale: using a shared container safely with in place computation would + // require making a copy and then doing the computation in place which is likely + // less efficient than avoiding in place entirely and always generating a new + // container. 
+ + container_t *c; + if (type1 == SHARED_CONTAINER_TYPE) { + c = container_andnot(c1, type1, c2, type2, &result_type); + shared_container_free(CAST_shared(c1)); // release + } + else { + c = container_iandnot(c1, type1, c2, type2, &result_type); + } + + if (container_nonzero_cardinality(c, result_type)) { + ra_replace_key_and_container_at_index(&x1->high_low_container, + intersection_size++, s1, + c, result_type); + } else { + container_free(c, result_type); + } + + ++pos1; + ++pos2; + if (pos1 == length1) break; + if (pos2 == length2) break; + s1 = ra_get_key_at_index(&x1->high_low_container, pos1); + s2 = ra_get_key_at_index(&x2->high_low_container, pos2); + + } else if (s1 < s2) { // s1 < s2 + if (pos1 != intersection_size) { + container_t *c1 = ra_get_container_at_index( + &x1->high_low_container, pos1, &type1); + + ra_replace_key_and_container_at_index(&x1->high_low_container, + intersection_size, s1, c1, + type1); + } + intersection_size++; + pos1++; + if (pos1 == length1) break; + s1 = ra_get_key_at_index(&x1->high_low_container, pos1); + + } else { // s1 > s2 + pos2 = ra_advance_until(&x2->high_low_container, s1, pos2); + if (pos2 == length2) break; + s2 = ra_get_key_at_index(&x2->high_low_container, pos2); + } + } + + if (pos1 < length1) { + // all containers between intersection_size and + // pos1 are junk. However, they have either been moved + // (thus still referenced) or involved in an iandnot + // that will clean up all containers that could not be reused. + // Thus we should not free the junk containers between + // intersection_size and pos1. 
+ if (pos1 > intersection_size) { + // left slide of remaining items + ra_copy_range(&x1->high_low_container, pos1, length1, + intersection_size); + } + // else current placement is fine + intersection_size += (length1 - pos1); + } + ra_downsize(&x1->high_low_container, intersection_size); +} + +uint64_t roaring_bitmap_get_cardinality(const roaring_bitmap_t *r) { + const roaring_array_t *ra = &r->high_low_container; + + uint64_t card = 0; + for (int i = 0; i < ra->size; ++i) + card += container_get_cardinality(ra->containers[i], ra->typecodes[i]); + return card; +} + +uint64_t roaring_bitmap_range_cardinality(const roaring_bitmap_t *r, + uint64_t range_start, + uint64_t range_end) { + const roaring_array_t *ra = &r->high_low_container; + + if (range_end > UINT32_MAX) { + range_end = UINT32_MAX + UINT64_C(1); + } + if (range_start >= range_end) { + return 0; + } + range_end--; // make range_end inclusive + // now we have: 0 <= range_start <= range_end <= UINT32_MAX + + uint16_t minhb = range_start >> 16; + uint16_t maxhb = range_end >> 16; + + uint64_t card = 0; + + int i = ra_get_index(ra, minhb); + if (i >= 0) { + if (minhb == maxhb) { + card += container_rank(ra->containers[i], ra->typecodes[i], + range_end & 0xffff); + } else { + card += container_get_cardinality(ra->containers[i], + ra->typecodes[i]); + } + if ((range_start & 0xffff) != 0) { + card -= container_rank(ra->containers[i], ra->typecodes[i], + (range_start & 0xffff) - 1); + } + i++; + } else { + i = -i - 1; + } + + for (; i < ra->size; i++) { + uint16_t key = ra->keys[i]; + if (key < maxhb) { + card += container_get_cardinality(ra->containers[i], + ra->typecodes[i]); + } else if (key == maxhb) { + card += container_rank(ra->containers[i], ra->typecodes[i], + range_end & 0xffff); + break; + } else { + break; + } + } + + return card; +} + + +bool roaring_bitmap_is_empty(const roaring_bitmap_t *r) { + return r->high_low_container.size == 0; +} + +void roaring_bitmap_to_uint32_array(const 
roaring_bitmap_t *r, uint32_t *ans) { + ra_to_uint32_array(&r->high_low_container, ans); +} + +bool roaring_bitmap_range_uint32_array(const roaring_bitmap_t *r, + size_t offset, size_t limit, + uint32_t *ans) { + return ra_range_uint32_array(&r->high_low_container, offset, limit, ans); +} + +/** convert array and bitmap containers to run containers when it is more + * efficient; + * also convert from run containers when more space efficient. Returns + * true if the result has at least one run container. +*/ +bool roaring_bitmap_run_optimize(roaring_bitmap_t *r) { + bool answer = false; + for (int i = 0; i < r->high_low_container.size; i++) { + uint8_t type_original, type_after; + ra_unshare_container_at_index( + &r->high_low_container, i); // TODO: this introduces extra cloning! + container_t *c = ra_get_container_at_index(&r->high_low_container, i, + &type_original); + container_t *c1 = convert_run_optimize(c, type_original, &type_after); + if (type_after == RUN_CONTAINER_TYPE) { + answer = true; + } + ra_set_container_at_index(&r->high_low_container, i, c1, type_after); + } + return answer; +} + +size_t roaring_bitmap_shrink_to_fit(roaring_bitmap_t *r) { + size_t answer = 0; + for (int i = 0; i < r->high_low_container.size; i++) { + uint8_t type_original; + container_t *c = ra_get_container_at_index(&r->high_low_container, i, + &type_original); + answer += container_shrink_to_fit(c, type_original); + } + answer += ra_shrink_to_fit(&r->high_low_container); + return answer; +} + +/** + * Remove run-length encoding even when it is more space efficient + * return whether a change was applied + */ +bool roaring_bitmap_remove_run_compression(roaring_bitmap_t *r) { + bool answer = false; + for (int i = 0; i < r->high_low_container.size; i++) { + uint8_t type_original, type_after; + container_t *c = ra_get_container_at_index(&r->high_low_container, i, + &type_original); + if (get_container_type(c, type_original) == RUN_CONTAINER_TYPE) { + answer = true; + if 
(type_original == SHARED_CONTAINER_TYPE) { + run_container_t *truec = CAST_run(CAST_shared(c)->container); + int32_t card = run_container_cardinality(truec); + container_t *c1 = convert_to_bitset_or_array_container( + truec, card, &type_after); + shared_container_free(CAST_shared(c)); // frees run as needed + ra_set_container_at_index(&r->high_low_container, i, c1, + type_after); + + } else { + int32_t card = run_container_cardinality(CAST_run(c)); + container_t *c1 = convert_to_bitset_or_array_container( + CAST_run(c), card, &type_after); + run_container_free(CAST_run(c)); + ra_set_container_at_index(&r->high_low_container, i, c1, + type_after); + } + } + } + return answer; +} + +size_t roaring_bitmap_serialize(const roaring_bitmap_t *r, char *buf) { + size_t portablesize = roaring_bitmap_portable_size_in_bytes(r); + uint64_t cardinality = roaring_bitmap_get_cardinality(r); + uint64_t sizeasarray = cardinality * sizeof(uint32_t) + sizeof(uint32_t); + if (portablesize < sizeasarray) { + buf[0] = CROARING_SERIALIZATION_CONTAINER; + return roaring_bitmap_portable_serialize(r, buf + 1) + 1; + } else { + buf[0] = CROARING_SERIALIZATION_ARRAY_UINT32; + memcpy(buf + 1, &cardinality, sizeof(uint32_t)); + roaring_bitmap_to_uint32_array( + r, (uint32_t *)(buf + 1 + sizeof(uint32_t))); + return 1 + (size_t)sizeasarray; + } +} + +size_t roaring_bitmap_size_in_bytes(const roaring_bitmap_t *r) { + size_t portablesize = roaring_bitmap_portable_size_in_bytes(r); + uint64_t sizeasarray = roaring_bitmap_get_cardinality(r) * sizeof(uint32_t) + + sizeof(uint32_t); + return portablesize < sizeasarray ? 
portablesize + 1 : (size_t)sizeasarray + 1; +} + +size_t roaring_bitmap_portable_size_in_bytes(const roaring_bitmap_t *r) { + return ra_portable_size_in_bytes(&r->high_low_container); +} + + +roaring_bitmap_t *roaring_bitmap_portable_deserialize_safe(const char *buf, size_t maxbytes) { + roaring_bitmap_t *ans = + (roaring_bitmap_t *)roaring_malloc(sizeof(roaring_bitmap_t)); + if (ans == NULL) { + return NULL; + } + size_t bytesread; + bool is_ok = ra_portable_deserialize(&ans->high_low_container, buf, maxbytes, &bytesread); + if(is_ok) assert(bytesread <= maxbytes); + roaring_bitmap_set_copy_on_write(ans, false); + if (!is_ok) { + roaring_free(ans); + return NULL; + } + return ans; +} + +roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf) { + return roaring_bitmap_portable_deserialize_safe(buf, SIZE_MAX); +} + + +size_t roaring_bitmap_portable_deserialize_size(const char *buf, size_t maxbytes) { + return ra_portable_deserialize_size(buf, maxbytes); +} + + +size_t roaring_bitmap_portable_serialize(const roaring_bitmap_t *r, + char *buf) { + return ra_portable_serialize(&r->high_low_container, buf); +} + +roaring_bitmap_t *roaring_bitmap_deserialize(const void *buf) { + const char *bufaschar = (const char *)buf; + if (bufaschar[0] == CROARING_SERIALIZATION_ARRAY_UINT32) { + /* This looks like a compressed set of uint32_t elements */ + uint32_t card; + + memcpy(&card, bufaschar + 1, sizeof(uint32_t)); + + const uint32_t *elems = + (const uint32_t *)(bufaschar + 1 + sizeof(uint32_t)); + + roaring_bitmap_t *bitmap = roaring_bitmap_create(); + if (bitmap == NULL) { + return NULL; + } + roaring_bulk_context_t context = {0}; + for (uint32_t i = 0; i < card; i++) { + // elems may not be aligned, read with memcpy + uint32_t elem; + memcpy(&elem, elems + i, sizeof(elem)); + roaring_bitmap_add_bulk(bitmap, &context, elem); + } + return bitmap; + + } else if (bufaschar[0] == CROARING_SERIALIZATION_CONTAINER) { + return 
roaring_bitmap_portable_deserialize(bufaschar + 1); + } else + return (NULL); +} + +roaring_bitmap_t* roaring_bitmap_deserialize_safe(const void *buf, size_t maxbytes) { + if (maxbytes < 1) { + return NULL; + } + + const char *bufaschar = (const char *)buf; + if (bufaschar[0] == CROARING_SERIALIZATION_ARRAY_UINT32) { + if (maxbytes < 1 + sizeof(uint32_t)) { + return NULL; + } + + /* This looks like a compressed set of uint32_t elements */ + uint32_t card; + memcpy(&card, bufaschar + 1, sizeof(uint32_t)); + + // Check the buffer is big enough to contain card uint32_t elements + if (maxbytes < 1 + sizeof(uint32_t) + card * sizeof(uint32_t)) { + return NULL; + } + + const uint32_t *elems = + (const uint32_t *)(bufaschar + 1 + sizeof(uint32_t)); + + roaring_bitmap_t *bitmap = roaring_bitmap_create(); + if (bitmap == NULL) { + return NULL; + } + roaring_bulk_context_t context = {0}; + for (uint32_t i = 0; i < card; i++) { + // elems may not be aligned, read with memcpy + uint32_t elem; + memcpy(&elem, elems + i, sizeof(elem)); + roaring_bitmap_add_bulk(bitmap, &context, elem); + } + return bitmap; + + } else if (bufaschar[0] == CROARING_SERIALIZATION_CONTAINER) { + return roaring_bitmap_portable_deserialize_safe(bufaschar + 1, maxbytes - 1); + } else + return (NULL); +} + +bool roaring_iterate(const roaring_bitmap_t *r, roaring_iterator iterator, + void *ptr) { + const roaring_array_t *ra = &r->high_low_container; + + for (int i = 0; i < ra->size; ++i) + if (!container_iterate(ra->containers[i], ra->typecodes[i], + ((uint32_t)ra->keys[i]) << 16, + iterator, ptr)) { + return false; + } + return true; +} + +bool roaring_iterate64(const roaring_bitmap_t *r, roaring_iterator64 iterator, + uint64_t high_bits, void *ptr) { + const roaring_array_t *ra = &r->high_low_container; + + for (int i = 0; i < ra->size; ++i) + if (!container_iterate64( + ra->containers[i], ra->typecodes[i], + ((uint32_t)ra->keys[i]) << 16, iterator, + high_bits, ptr)) { + return false; + } + return 
true; +} + +/**** +* begin roaring_uint32_iterator_t +*****/ + +// Partially initializes the roaring iterator when it begins looking at +// a new container. +static bool iter_new_container_partial_init(roaring_uint32_iterator_t *newit) { + newit->in_container_index = 0; + newit->run_index = 0; + newit->current_value = 0; + if (newit->container_index >= newit->parent->high_low_container.size || + newit->container_index < 0) { + newit->current_value = UINT32_MAX; + return (newit->has_value = false); + } + // assume not empty + newit->has_value = true; + // we precompute container, typecode and highbits so that successive + // iterators do not have to grab them from odd memory locations + // and have to worry about the (easily predicted) container_unwrap_shared + // call. + newit->container = + newit->parent->high_low_container.containers[newit->container_index]; + newit->typecode = + newit->parent->high_low_container.typecodes[newit->container_index]; + newit->highbits = + ((uint32_t) + newit->parent->high_low_container.keys[newit->container_index]) + << 16; + newit->container = + container_unwrap_shared(newit->container, &(newit->typecode)); + return newit->has_value; +} + +bool loadfirstvalue(roaring_uint32_iterator_t *newit) { + if (!iter_new_container_partial_init(newit)) + return newit->has_value; + + switch (newit->typecode) { + case BITSET_CONTAINER_TYPE: { + const bitset_container_t *bc = const_CAST_bitset(newit->container); + + uint32_t wordindex = 0; + uint64_t word; + while ((word = bc->words[wordindex]) == 0) { + wordindex++; // advance + } + // here "word" is non-zero + newit->in_container_index = wordindex * 64 + roaring_trailing_zeroes(word); + newit->current_value = newit->highbits | newit->in_container_index; + break; } + + case ARRAY_CONTAINER_TYPE: { + const array_container_t *ac = const_CAST_array(newit->container); + newit->current_value = newit->highbits | ac->array[0]; + break; } + + case RUN_CONTAINER_TYPE: { + const run_container_t *rc = 
const_CAST_run(newit->container); + newit->current_value = newit->highbits | rc->runs[0].value; + break; } + + default: + // if this ever happens, bug! + assert(false); + } // switch (typecode) + return true; +} + +bool loadlastvalue(roaring_uint32_iterator_t* newit) { + if (!iter_new_container_partial_init(newit)) + return newit->has_value; + + switch(newit->typecode) { + case BITSET_CONTAINER_TYPE: { + uint32_t wordindex = BITSET_CONTAINER_SIZE_IN_WORDS - 1; + uint64_t word; + const bitset_container_t* bitset_container = (const bitset_container_t*)newit->container; + while ((word = bitset_container->words[wordindex]) == 0) + --wordindex; + + int num_leading_zeros = roaring_leading_zeroes(word); + newit->in_container_index = (wordindex * 64) + (63 - num_leading_zeros); + newit->current_value = newit->highbits | newit->in_container_index; + break; + } + case ARRAY_CONTAINER_TYPE: { + const array_container_t* array_container = (const array_container_t*)newit->container; + newit->in_container_index = array_container->cardinality - 1; + newit->current_value = newit->highbits | array_container->array[newit->in_container_index]; + break; + } + case RUN_CONTAINER_TYPE: { + const run_container_t* run_container = (const run_container_t*)newit->container; + newit->run_index = run_container->n_runs - 1; + const rle16_t* last_run = &run_container->runs[newit->run_index]; + newit->current_value = newit->highbits | (last_run->value + last_run->length); + break; + } + default: + // if this ever happens, bug! 
+ assert(false); + } + return true; +} + +// prerequesite: the value should be in range of the container +bool loadfirstvalue_largeorequal(roaring_uint32_iterator_t *newit, uint32_t val) { + // Don't have to check return value because of prerequisite + iter_new_container_partial_init(newit); + uint16_t lb = val & 0xFFFF; + + switch (newit->typecode) { + case BITSET_CONTAINER_TYPE: { + const bitset_container_t *bc = const_CAST_bitset(newit->container); + newit->in_container_index = + bitset_container_index_equalorlarger(bc, lb); + newit->current_value = newit->highbits | newit->in_container_index; + break; } + + case ARRAY_CONTAINER_TYPE: { + const array_container_t *ac = const_CAST_array(newit->container); + newit->in_container_index = + array_container_index_equalorlarger(ac, lb); + newit->current_value = + newit->highbits | ac->array[newit->in_container_index]; + break; } + + case RUN_CONTAINER_TYPE: { + const run_container_t *rc = const_CAST_run(newit->container); + newit->run_index = run_container_index_equalorlarger(rc, lb); + if (rc->runs[newit->run_index].value <= lb) { + newit->current_value = val; + } else { + newit->current_value = + newit->highbits | rc->runs[newit->run_index].value; + } + break; } + + default: + roaring_unreachable; + } + + return true; +} + +void roaring_init_iterator(const roaring_bitmap_t *r, + roaring_uint32_iterator_t *newit) { + newit->parent = r; + newit->container_index = 0; + newit->has_value = loadfirstvalue(newit); +} + +void roaring_init_iterator_last(const roaring_bitmap_t *r, + roaring_uint32_iterator_t *newit) { + newit->parent = r; + newit->container_index = newit->parent->high_low_container.size - 1; + newit->has_value = loadlastvalue(newit); +} + +roaring_uint32_iterator_t *roaring_create_iterator(const roaring_bitmap_t *r) { + roaring_uint32_iterator_t *newit = + (roaring_uint32_iterator_t *)roaring_malloc(sizeof(roaring_uint32_iterator_t)); + if (newit == NULL) return NULL; + roaring_init_iterator(r, newit); + return 
newit; +} + +roaring_uint32_iterator_t *roaring_copy_uint32_iterator( + const roaring_uint32_iterator_t *it) { + roaring_uint32_iterator_t *newit = + (roaring_uint32_iterator_t *)roaring_malloc(sizeof(roaring_uint32_iterator_t)); + memcpy(newit, it, sizeof(roaring_uint32_iterator_t)); + return newit; +} + +bool roaring_move_uint32_iterator_equalorlarger(roaring_uint32_iterator_t *it, uint32_t val) { + uint16_t hb = val >> 16; + const int i = ra_get_index(& it->parent->high_low_container, hb); + if (i >= 0) { + uint32_t lowvalue = container_maximum(it->parent->high_low_container.containers[i], it->parent->high_low_container.typecodes[i]); + uint16_t lb = val & 0xFFFF; + if(lowvalue < lb ) { + it->container_index = i+1; // will have to load first value of next container + } else {// the value is necessarily within the range of the container + it->container_index = i; + it->has_value = loadfirstvalue_largeorequal(it, val); + return it->has_value; + } + } else { + // there is no matching, so we are going for the next container + it->container_index = -i-1; + } + it->has_value = loadfirstvalue(it); + return it->has_value; +} + + +bool roaring_advance_uint32_iterator(roaring_uint32_iterator_t *it) { + if (it->container_index >= it->parent->high_low_container.size) { + return (it->has_value = false); + } + if (it->container_index < 0) { + it->container_index = 0; + return (it->has_value = loadfirstvalue(it)); + } + + switch (it->typecode) { + case BITSET_CONTAINER_TYPE: { + const bitset_container_t *bc = const_CAST_bitset(it->container); + it->in_container_index++; + + uint32_t wordindex = it->in_container_index / 64; + if (wordindex >= BITSET_CONTAINER_SIZE_IN_WORDS) break; + + uint64_t word = bc->words[wordindex] & + (UINT64_MAX << (it->in_container_index % 64)); + // next part could be optimized/simplified + while ((word == 0) && + (wordindex + 1 < BITSET_CONTAINER_SIZE_IN_WORDS)) { + wordindex++; + word = bc->words[wordindex]; + } + if (word != 0) { + 
it->in_container_index = wordindex * 64 + roaring_trailing_zeroes(word); + it->current_value = it->highbits | it->in_container_index; + return (it->has_value = true); + } + break; } + + case ARRAY_CONTAINER_TYPE: { + const array_container_t *ac = const_CAST_array(it->container); + it->in_container_index++; + if (it->in_container_index < ac->cardinality) { + it->current_value = + it->highbits | ac->array[it->in_container_index]; + return (it->has_value = true); + } + break; } + + case RUN_CONTAINER_TYPE: { + if(it->current_value == UINT32_MAX) { // avoid overflow to zero + return (it->has_value = false); + } + + const run_container_t* rc = const_CAST_run(it->container); + uint32_t limit = (it->highbits | (rc->runs[it->run_index].value + + rc->runs[it->run_index].length)); + if (++it->current_value <= limit) { + return (it->has_value = true); + } + + if (++it->run_index < rc->n_runs) { // Assume the run has a value + it->current_value = + it->highbits | rc->runs[it->run_index].value; + return (it->has_value = true); + } + break; + } + + default: + roaring_unreachable; + } + + // moving to next container + it->container_index++; + return (it->has_value = loadfirstvalue(it)); +} + +bool roaring_previous_uint32_iterator(roaring_uint32_iterator_t *it) { + if (it->container_index < 0) { + return (it->has_value = false); + } + if (it->container_index >= it->parent->high_low_container.size) { + it->container_index = it->parent->high_low_container.size - 1; + return (it->has_value = loadlastvalue(it)); + } + + switch (it->typecode) { + case BITSET_CONTAINER_TYPE: { + if (--it->in_container_index < 0) + break; + + const bitset_container_t* bitset_container = (const bitset_container_t*)it->container; + int32_t wordindex = it->in_container_index / 64; + uint64_t word = bitset_container->words[wordindex] & (UINT64_MAX >> (63 - (it->in_container_index % 64))); + + while (word == 0 && --wordindex >= 0) { + word = bitset_container->words[wordindex]; + } + if (word == 0) + break; + 
+ int num_leading_zeros = roaring_leading_zeroes(word); + it->in_container_index = (wordindex * 64) + (63 - num_leading_zeros); + it->current_value = it->highbits | it->in_container_index; + return (it->has_value = true); + } + case ARRAY_CONTAINER_TYPE: { + if (--it->in_container_index < 0) + break; + + const array_container_t* array_container = (const array_container_t*)it->container; + it->current_value = it->highbits | array_container->array[it->in_container_index]; + return (it->has_value = true); + } + case RUN_CONTAINER_TYPE: { + if(it->current_value == 0) + return (it->has_value = false); + + const run_container_t* run_container = (const run_container_t*)it->container; + if (--it->current_value >= (it->highbits | run_container->runs[it->run_index].value)) { + return (it->has_value = true); + } + + if (--it->run_index < 0) + break; + + it->current_value = it->highbits | (run_container->runs[it->run_index].value + + run_container->runs[it->run_index].length); + return (it->has_value = true); + } + default: + // if this ever happens, bug! 
+            assert(false);
+    } // switch (typecode)
+
+    // moving to previous container
+    it->container_index--;
+    return (it->has_value = loadlastvalue(it));
+}
+
+/**
+ * Bulk read: copies up to `count` values from the iterator into `buf` and
+ * advances the iterator past every value copied.
+ *
+ * Returns the number of values written. The result is smaller than `count`
+ * only when the iterator ran out of values (it->has_value is then false).
+ */
+uint32_t roaring_read_uint32_iterator(roaring_uint32_iterator_t *it, uint32_t* buf, uint32_t count) {
+    uint32_t ret = 0;
+    uint32_t num_values;
+    uint32_t wordindex;  // used for bitsets
+    uint64_t word;       // used for bitsets
+    const array_container_t* acont; //TODO remove
+    const run_container_t* rcont; //TODO remove
+    const bitset_container_t* bcont; //TODO remove
+
+    while (it->has_value && ret < count) {
+        switch (it->typecode) {
+            case BITSET_CONTAINER_TYPE:
+                bcont = const_CAST_bitset(it->container);
+                wordindex = it->in_container_index / 64;
+                // keep only the bits at or above the current position
+                word = bcont->words[wordindex] & (UINT64_MAX << (it->in_container_index % 64));
+                do {
+                    while (word != 0 && ret < count) {
+                        buf[0] = it->highbits | (wordindex * 64 + roaring_trailing_zeroes(word));
+                        word = word & (word - 1);  // clear the lowest set bit
+                        buf++;
+                        ret++;
+                    }
+                    // skip empty words until one has a bit set or we hit the end
+                    while (word == 0 && wordindex+1 < BITSET_CONTAINER_SIZE_IN_WORDS) {
+                        wordindex++;
+                        word = bcont->words[wordindex];
+                    }
+                } while (word != 0 && ret < count);
+                it->has_value = (word != 0);
+                if (it->has_value) {
+                    it->in_container_index = wordindex * 64 + roaring_trailing_zeroes(word);
+                    it->current_value = it->highbits | it->in_container_index;
+                }
+                break;
+            case ARRAY_CONTAINER_TYPE:
+                acont = const_CAST_array(it->container);
+                num_values = minimum_uint32(acont->cardinality - it->in_container_index, count - ret);
+                for (uint32_t i = 0; i < num_values; i++) {
+                    buf[i] = it->highbits | acont->array[it->in_container_index + i];
+                }
+                buf += num_values;
+                ret += num_values;
+                it->in_container_index += num_values;
+                it->has_value = (it->in_container_index < acont->cardinality);
+                if (it->has_value) {
+                    it->current_value = it->highbits | acont->array[it->in_container_index];
+                }
+                break;
+            case RUN_CONTAINER_TYPE:
+                rcont = const_CAST_run(it->container);
+                //"in_run_index" name is misleading, read it as "max_value_in_current_run"
+                do {
+                    uint32_t largest_run_value = it->highbits | (rcont->runs[it->run_index].value + rcont->runs[it->run_index].length);
+                    num_values = minimum_uint32(largest_run_value - it->current_value + 1, count - ret);
+                    for (uint32_t i = 0; i < num_values; i++) {
+                        buf[i] = it->current_value + i;
+                    }
+                    it->current_value += num_values; // this can overflow to zero: UINT32_MAX+1=0
+                    buf += num_values;
+                    ret += num_values;
+
+                    if (it->current_value > largest_run_value || it->current_value == 0) {
+                        it->run_index++;
+                        if (it->run_index < rcont->n_runs) {
+                            it->current_value = it->highbits | rcont->runs[it->run_index].value;
+                        } else {
+                            it->has_value = false;
+                        }
+                    }
+                } while ((ret < count) && it->has_value);
+                break;
+            default:
+                assert(false);
+        }
+        // the container still has values, so we only stopped because buf is full
+        if (it->has_value) {
+            assert(ret == count);
+            return ret;
+        }
+        // current container exhausted: continue with the next one
+        it->container_index++;
+        it->has_value = loadfirstvalue(it);
+    }
+    return ret;
+}
+
+
+
+/** Releases an iterator by handing it to roaring_free(). */
+void roaring_free_uint32_iterator(roaring_uint32_iterator_t *it) { roaring_free(it); }
+
+/****
+* end of roaring_uint32_iterator_t
+*****/
+
+/**
+ * Returns true when both bitmaps hold the same number of containers, the same
+ * keys in the same order, and pairwise-equal containers (container_equals).
+ */
+bool roaring_bitmap_equals(const roaring_bitmap_t *r1,
+                           const roaring_bitmap_t *r2) {
+    const roaring_array_t *ra1 = &r1->high_low_container;
+    const roaring_array_t *ra2 = &r2->high_low_container;
+
+    if (ra1->size != ra2->size) {
+        return false;
+    }
+    // compare all keys first: cheaper than comparing container contents
+    for (int i = 0; i < ra1->size; ++i) {
+        if (ra1->keys[i] != ra2->keys[i]) {
+            return false;
+        }
+    }
+    for (int i = 0; i < ra1->size; ++i) {
+        bool areequal = container_equals(ra1->containers[i],
+                                         ra1->typecodes[i],
+                                         ra2->containers[i],
+                                         ra2->typecodes[i]);
+        if (!areequal) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/**
+ * Returns true when every container of r1 has a container with the same key
+ * in r2 and container_is_subset() holds for each such pair.
+ */
+bool roaring_bitmap_is_subset(const roaring_bitmap_t *r1,
+                              const roaring_bitmap_t *r2) {
+    const roaring_array_t *ra1 = &r1->high_low_container;
+    const roaring_array_t *ra2 = &r2->high_low_container;
+
+    const int length1 = ra1->size,
+              length2 = ra2->size;
+
+    int pos1 = 0, pos2 = 0;
+
+    while (pos1 < length1 && pos2 < length2) {
+        const uint16_t s1 = ra_get_key_at_index(ra1, pos1);
+        const uint16_t s2 = ra_get_key_at_index(ra2, pos2);
+
+        if (s1 == s2) {
+            uint8_t type1, type2;
+            container_t *c1 = ra_get_container_at_index(ra1, pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(ra2, pos2, &type2);
+            if (!container_is_subset(c1, type1, c2, type2))
+                return false;
+            ++pos1;
+            ++pos2;
+        } else if (s1 < s2) {  // s1 < s2: r1 has a key that r2 lacks
+            return false;
+        } else {  // s1 > s2
+            pos2 = ra_advance_until(ra2, s1, pos2);
+        }
+    }
+    // r1 is a subset only if all of its containers were matched
+    return pos1 == length1;
+}
+
+/**
+ * Builds the container for key `hb` with the bits lb_start..lb_end (both
+ * inclusive) negated relative to x1_arr, and inserts it into ans_arr unless it
+ * is empty. When x1_arr has no container for `hb`, the result is a run of ones
+ * over that range.
+ *
+ * NOTE(review): the insertion slot is taken as -j - 1, which presumes that
+ * ans_arr does not contain `hb` yet (ra_get_index returned a negative value).
+ */
+static void insert_flipped_container(roaring_array_t *ans_arr,
+                                     const roaring_array_t *x1_arr, uint16_t hb,
+                                     uint16_t lb_start, uint16_t lb_end) {
+    const int i = ra_get_index(x1_arr, hb);
+    const int j = ra_get_index(ans_arr, hb);
+    uint8_t ctype_in, ctype_out;
+    container_t *flipped_container = NULL;
+    if (i >= 0) {
+        container_t *container_to_flip =
+            ra_get_container_at_index(x1_arr, i, &ctype_in);
+        flipped_container =
+            container_not_range(container_to_flip, ctype_in, (uint32_t)lb_start,
+                                (uint32_t)(lb_end + 1), &ctype_out);
+
+        if (container_get_cardinality(flipped_container, ctype_out))
+            ra_insert_new_key_value_at(ans_arr, -j - 1, hb, flipped_container,
+                                       ctype_out);
+        else {
+            container_free(flipped_container, ctype_out);
+        }
+    } else {
+        flipped_container = container_range_of_ones(
+            (uint32_t)lb_start, (uint32_t)(lb_end + 1), &ctype_out);
+        ra_insert_new_key_value_at(ans_arr, -j - 1, hb, flipped_container,
+                                   ctype_out);
+    }
+}
+
+/**
+ * In-place counterpart of insert_flipped_container: negates the bits
+ * lb_start..lb_end (inclusive) of the container with key `hb` inside x1_arr.
+ * A container that becomes empty is removed; a missing key gets a new
+ * container of ones over the range.
+ */
+static void inplace_flip_container(roaring_array_t *x1_arr, uint16_t hb,
+                                   uint16_t lb_start, uint16_t lb_end) {
+    const int i = ra_get_index(x1_arr, hb);
+    uint8_t ctype_in, ctype_out;
+    container_t *flipped_container = NULL;
+    if (i >= 0) {
+        container_t *container_to_flip =
+            ra_get_container_at_index(x1_arr, i, &ctype_in);
+        flipped_container = container_inot_range(
+            container_to_flip, ctype_in, (uint32_t)lb_start,
+            (uint32_t)(lb_end + 1), &ctype_out);
+        // if a new container was created, the old one was already freed
+        if (container_get_cardinality(flipped_container, ctype_out)) {
+            ra_set_container_at_index(x1_arr, i, flipped_container, ctype_out);
+        } else {
+            container_free(flipped_container, ctype_out);
+            ra_remove_at_index(x1_arr, i);
+        }
+
+    } else {
+        flipped_container = container_range_of_ones(
+            (uint32_t)lb_start, (uint32_t)(lb_end + 1), &ctype_out);
+        // i is negative here; -i - 1 is used as the insertion position
+        ra_insert_new_key_value_at(x1_arr, -i - 1, hb, flipped_container,
+                                   ctype_out);
+    }
+}
+
+/**
+ * Same as insert_flipped_container, but negates the whole 16-bit range of the
+ * container with key `hb` (a missing key yields a full container).
+ */
+static void insert_fully_flipped_container(roaring_array_t *ans_arr,
+                                           const roaring_array_t *x1_arr,
+                                           uint16_t hb) {
+    const int i = ra_get_index(x1_arr, hb);
+    const int j = ra_get_index(ans_arr, hb);
+    uint8_t ctype_in, ctype_out;
+    container_t *flipped_container = NULL;
+    if (i >= 0) {
+        container_t *container_to_flip =
+            ra_get_container_at_index(x1_arr, i, &ctype_in);
+        flipped_container =
+            container_not(container_to_flip, ctype_in, &ctype_out);
+        if (container_get_cardinality(flipped_container, ctype_out))
+            ra_insert_new_key_value_at(ans_arr, -j - 1, hb, flipped_container,
+                                       ctype_out);
+        else {
+            container_free(flipped_container, ctype_out);
+        }
+    } else {
+        flipped_container = container_range_of_ones(0U, 0x10000U, &ctype_out);
+        ra_insert_new_key_value_at(ans_arr, -j - 1, hb, flipped_container,
+                                   ctype_out);
+    }
+}
+
+/**
+ * Same as inplace_flip_container, but negates the whole 16-bit range of the
+ * container with key `hb` (a missing key yields a full container).
+ */
+static void inplace_fully_flip_container(roaring_array_t *x1_arr, uint16_t hb) {
+    const int i = ra_get_index(x1_arr, hb);
+    uint8_t ctype_in, ctype_out;
+    container_t *flipped_container = NULL;
+    if (i >= 0) {
+        container_t *container_to_flip =
+            ra_get_container_at_index(x1_arr, i, &ctype_in);
+        flipped_container =
+            container_inot(container_to_flip, ctype_in, &ctype_out);
+
+        if (container_get_cardinality(flipped_container, ctype_out)) {
+            ra_set_container_at_index(x1_arr, i, flipped_container, ctype_out);
+        } else {
+            container_free(flipped_container, ctype_out);
+            ra_remove_at_index(x1_arr, i);
+        }
+
+    } else {
+        flipped_container = container_range_of_ones(0U, 0x10000U, &ctype_out);
+        ra_insert_new_key_value_at(x1_arr, -i - 1, hb, flipped_container,
+                                   ctype_out);
+    }
+}
+
+/**
+ * Returns a new bitmap equal to x1 with every value in
+ * [range_start, range_end) negated. The range is clipped to [0, 2^32); an
+ * empty (or entirely out-of-domain) range yields a plain copy of x1.
+ */
+roaring_bitmap_t *roaring_bitmap_flip(const roaring_bitmap_t *x1,
+                                      uint64_t range_start,
+                                      uint64_t range_end) {
+    // Clamp before the emptiness test: a range starting at or beyond 2^32 must
+    // be treated as empty, otherwise the uint16_t casts below would truncate
+    // its start and flip values that are not in the requested range.
+    if(range_end >= UINT64_C(0x100000000)) {
+        range_end = UINT64_C(0x100000000);
+    }
+    if (range_start >= range_end) {
+        return roaring_bitmap_copy(x1);
+    }
+
+    roaring_bitmap_t *ans = roaring_bitmap_create();
+    roaring_bitmap_set_copy_on_write(ans, is_cow(x1));
+
+    uint16_t hb_start = (uint16_t)(range_start >> 16);
+    const uint16_t lb_start = (uint16_t)range_start;  // & 0xFFFF;
+    uint16_t hb_end = (uint16_t)((range_end - 1) >> 16);
+    const uint16_t lb_end = (uint16_t)(range_end - 1);  // & 0xFFFF;
+
+    ra_append_copies_until(&ans->high_low_container, &x1->high_low_container,
+                           hb_start, is_cow(x1));
+    if (hb_start == hb_end) {
+        insert_flipped_container(&ans->high_low_container,
+                                 &x1->high_low_container, hb_start, lb_start,
+                                 lb_end);
+    } else {
+        // start and end containers are distinct
+        if (lb_start > 0) {
+            // handle first (partial) container
+            insert_flipped_container(&ans->high_low_container,
+                                     &x1->high_low_container, hb_start,
+                                     lb_start, 0xFFFF);
+            ++hb_start;  // for the full containers.  Can't wrap.
+        }
+
+        if (lb_end != 0xFFFF) --hb_end;  // later we'll handle the partial block
+
+        for (uint32_t hb = hb_start; hb <= hb_end; ++hb) {
+            insert_fully_flipped_container(&ans->high_low_container,
+                                           &x1->high_low_container, hb);
+        }
+
+        // handle a partial final container
+        if (lb_end != 0xFFFF) {
+            insert_flipped_container(&ans->high_low_container,
+                                     &x1->high_low_container, hb_end + 1, 0,
+                                     lb_end);
+            ++hb_end;
+        }
+    }
+    ra_append_copies_after(&ans->high_low_container, &x1->high_low_container,
+                           hb_end, is_cow(x1));
+    return ans;
+}
+
+/**
+ * In-place version of roaring_bitmap_flip: negates every value of x1 in
+ * [range_start, range_end), with the range clipped to [0, 2^32).
+ */
+void roaring_bitmap_flip_inplace(roaring_bitmap_t *x1, uint64_t range_start,
+                                 uint64_t range_end) {
+    // Clamp before the emptiness test (see roaring_bitmap_contains_range, which
+    // uses the same order) so that a start at or beyond 2^32 is a no-op rather
+    // than being truncated by the uint16_t casts below.
+    if(range_end >= UINT64_C(0x100000000)) {
+        range_end = UINT64_C(0x100000000);
+    }
+    if (range_start >= range_end) {
+        return;  // empty range
+    }
+
+    uint16_t hb_start = (uint16_t)(range_start >> 16);
+    const uint16_t lb_start = (uint16_t)range_start;
+    uint16_t hb_end = (uint16_t)((range_end - 1) >> 16);
+    const uint16_t lb_end = (uint16_t)(range_end - 1);
+
+    if (hb_start == hb_end) {
+        inplace_flip_container(&x1->high_low_container, hb_start, lb_start,
+                               lb_end);
+    } else {
+        // start and end containers are distinct
+        if (lb_start > 0) {
+            // handle first (partial) container
+            inplace_flip_container(&x1->high_low_container, hb_start, lb_start,
+                                   0xFFFF);
+            ++hb_start;  // for the full containers.  Can't wrap.
+        }
+
+        if (lb_end != 0xFFFF) --hb_end;
+
+        for (uint32_t hb = hb_start; hb <= hb_end; ++hb) {
+            inplace_fully_flip_container(&x1->high_low_container, hb);
+        }
+        // handle a partial final container
+        if (lb_end != 0xFFFF) {
+            inplace_flip_container(&x1->high_low_container, hb_end + 1, 0,
+                                   lb_end);
+            ++hb_end;
+        }
+    }
+}
+
+/**
+ * Appends container `c` (type `t`) under key `k` to `ra`. When the last key of
+ * `ra` is already `k`, `c` is OR-ed into that last container instead and `c`
+ * is freed.
+ */
+static void offset_append_with_merge(roaring_array_t *ra, int k, container_t *c, uint8_t t) {
+    int size = ra_get_size(ra);
+    if (size == 0 || ra_get_key_at_index(ra, size-1) != k) {
+        // No merge.
+        ra_append(ra, k, c, t);
+        return;
+    }
+
+    uint8_t last_t, new_t;
+    container_t *last_c, *new_c;
+
+    // NOTE: we don't need to unwrap here, since we added last_c ourselves
+    // we have the certainty it's not a shared container.
+    // The same applies to c, as it's the result of calling container_offset.
+    last_c = ra_get_container_at_index(ra, size-1, &last_t);
+    new_c = container_ior(last_c, last_t, c, t, &new_t);
+
+    ra_set_container_at_index(ra, size-1, new_c, new_t);
+
+    // Comparison of pointers of different origin is UB (or so claim some compiler
+    // makers), so we compare their bit representation only.
+    if ((uintptr_t)last_c != (uintptr_t)new_c) {
+        container_free(last_c, last_t);
+    }
+    container_free(c, t);
+}
+
+// roaring_bitmap_add_offset adds the value 'offset' to each and every value in
+// a bitmap, generating a new bitmap in the process. If offset + element is
+// outside of the range [0,2^32), then the element will be dropped.
+// We need "offset" to be 64 bits because we want to support values
+// between -0xFFFFFFFF up to +0xFFFFFFFF.
+roaring_bitmap_t *roaring_bitmap_add_offset(const roaring_bitmap_t *bm,
+                                            int64_t offset) {
+    roaring_bitmap_t *answer;
+    roaring_array_t *ans_ra;
+    int64_t container_offset;
+    uint16_t in_offset;
+
+    const roaring_array_t *bm_ra = &bm->high_low_container;
+    int length = bm_ra->size;
+
+    if (offset == 0) {
+        return roaring_bitmap_copy(bm);
+    }
+
+    // split the offset into a shift of the container key (high 16 bits) and a
+    // shift inside the container (low 16 bits)
+    container_offset = offset >> 16;
+    in_offset = (uint16_t)(offset - container_offset * (1 << 16));
+
+    answer = roaring_bitmap_create();
+    roaring_bitmap_set_copy_on_write(answer, is_cow(bm));
+
+    ans_ra = &answer->high_low_container;
+
+    if (in_offset == 0) {
+        // whole containers move: copy each one and only rewrite its key
+        // NOTE(review): this re-assignment repeats the one just above
+        ans_ra = &answer->high_low_container;
+
+        for (int i = 0, j = 0; i < length; ++i) {
+            int64_t key = ra_get_key_at_index(bm_ra, i);
+            key += container_offset;
+
+            // the shifted key falls outside the 16-bit key space: drop it
+            if (key < 0 || key >= (1 << 16)) {
+                continue;
+            }
+
+            ra_append_copy(ans_ra, bm_ra, i, false);
+            ans_ra->keys[j++] = key;
+        }
+
+        return answer;
+    }
+
+    uint8_t t;
+    const container_t *c;
+    container_t *lo, *hi, **lo_ptr, **hi_ptr;
+    int64_t k;
+
+    for (int i = 0; i < length; ++i) {
+        lo = hi = NULL;
+        lo_ptr = hi_ptr = NULL;
+
+        // a shifted container can spill into keys k and k+1; only request the
+        // halves whose key is still representable
+        k = ra_get_key_at_index(bm_ra, i)+container_offset;
+        if (k >= 0 && k < (1 << 16)) {
+            lo_ptr = &lo;
+        }
+        if (k+1 >= 0 && k+1 < (1 << 16)) {
+            hi_ptr = &hi;
+        }
+        if (lo_ptr == NULL && hi_ptr == NULL) {
+            continue;
+        }
+
+        c = ra_get_container_at_index(bm_ra, i, &t);
+        c = container_unwrap_shared(c, &t);
+
+        container_add_offset(c, t, lo_ptr, hi_ptr, in_offset);
+        if (lo != NULL) {
+            // the previous iteration may already have appended key k as its
+            // "hi" half, hence the merge
+            offset_append_with_merge(ans_ra, k, lo, t);
+        }
+        if (hi != NULL) {
+            ra_append(ans_ra, k+1, hi, t);
+        }
+    }
+
+    return answer;
+}
+
+/**
+ * Computes the union of x1 and x2 into a new bitmap using the "lazy"
+ * container operations (container_lazy_or / container_lazy_ior).
+ *
+ * When `bitsetconversion` is true and neither matching container is a bitset,
+ * the x1 container is first converted to a bitset.
+ *
+ * NOTE(review): the result presumably has to go through
+ * roaring_bitmap_repair_after_lazy() before general use - confirm against
+ * the public header.
+ */
+roaring_bitmap_t *roaring_bitmap_lazy_or(const roaring_bitmap_t *x1,
+                                         const roaring_bitmap_t *x2,
+                                         const bool bitsetconversion) {
+    uint8_t result_type = 0;
+    const int length1 = x1->high_low_container.size,
+              length2 = x2->high_low_container.size;
+    if (0 == length1) {
+        return roaring_bitmap_copy(x2);
+    }
+    if (0 == length2) {
+        return roaring_bitmap_copy(x1);
+    }
+    roaring_bitmap_t *answer =
+        roaring_bitmap_create_with_capacity(length1 + length2);
+    roaring_bitmap_set_copy_on_write(answer, is_cow(x1) || is_cow(x2));
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+    // merge the two key sequences; the loop exits as soon as one is exhausted
+    while (true) {
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(
+                                    &x1->high_low_container, pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(
+                                    &x2->high_low_container, pos2, &type2);
+            container_t *c;
+            if (bitsetconversion &&
+                (get_container_type(c1, type1) != BITSET_CONTAINER_TYPE) &&
+                (get_container_type(c2, type2) != BITSET_CONTAINER_TYPE)
+            ){
+                container_t *newc1 =
+                    container_mutable_unwrap_shared(c1, &type1);
+                newc1 = container_to_bitset(newc1, type1);
+                type1 = BITSET_CONTAINER_TYPE;
+                c = container_lazy_ior(newc1, type1, c2, type2,
+                                       &result_type);
+                if (c != newc1) {  // should not happen
+                    container_free(newc1, type1);
+                }
+            } else {
+                c = container_lazy_or(c1, type1, c2, type2, &result_type);
+            }
+            // since we assume that the initial containers are non-empty,
+            // the
+            // result here
+            // can only be non-empty
+            ra_append(&answer->high_low_container, s1, c, result_type);
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            container_t *c1 = ra_get_container_at_index(
+                                    &x1->high_low_container, pos1, &type1);
+            c1 = get_copy_of_container(c1, &type1, is_cow(x1));
+            if (is_cow(x1)) {
+                // with copy-on-write the source slot is updated to the
+                // container returned by get_copy_of_container
+                ra_set_container_at_index(&x1->high_low_container, pos1, c1,
+                                          type1);
+            }
+            ra_append(&answer->high_low_container, s1, c1, type1);
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+
+        } else {  // s1 > s2
+            container_t *c2 = ra_get_container_at_index(
+                                    &x2->high_low_container, pos2, &type2);
+            c2 = get_copy_of_container(c2, &type2, is_cow(x2));
+            if (is_cow(x2)) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          type2);
+            }
+            ra_append(&answer->high_low_container, s2, c2, type2);
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+        }
+    }
+    // copy whatever remains of the input that was not exhausted
+    if (pos1 == length1) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x2->high_low_container, pos2, length2,
+                             is_cow(x2));
+    } else if (pos2 == length2) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x1->high_low_container, pos1, length1,
+                             is_cow(x1));
+    }
+    return answer;
+}
+
+/**
+ * In-place variant of roaring_bitmap_lazy_or: x1 becomes the lazy union of
+ * x1 and x2. Containers of x1 that are already full are left untouched.
+ */
+void roaring_bitmap_lazy_or_inplace(roaring_bitmap_t *x1,
+                                    const roaring_bitmap_t *x2,
+                                    const bool bitsetconversion) {
+    uint8_t result_type = 0;
+    int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+
+    if (0 == length2) return;
+
+    if (0 == length1) {
+        roaring_bitmap_overwrite(x1, x2);
+        return;
+    }
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+    while (true) {
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(
+                                    &x1->high_low_container, pos1, &type1);
+            // a full container cannot gain anything from the union
+            if (!container_is_full(c1, type1)) {
+                if ((bitsetconversion == false) ||
+                    (get_container_type(c1, type1) == BITSET_CONTAINER_TYPE)
+                ){
+                    c1 = get_writable_copy_if_shared(c1, &type1);
+                } else {
+                    // convert to bitset
+                    container_t *old_c1 = c1;
+                    uint8_t old_type1 = type1;
+                    c1 = container_mutable_unwrap_shared(c1, &type1);
+                    c1 = container_to_bitset(c1, type1);
+                    container_free(old_c1, old_type1);
+                    type1 = BITSET_CONTAINER_TYPE;
+                }
+
+                container_t *c2 = ra_get_container_at_index(
+                                        &x2->high_low_container, pos2, &type2);
+                container_t *c = container_lazy_ior(c1, type1, c2, type2,
+                                                    &result_type);
+
+                if (c != c1) {  // in this instance a new container was created,
+                                // and we need to free the old one
+                    container_free(c1, type1);
+                }
+
+                ra_set_container_at_index(&x1->high_low_container, pos1, c,
+                                          result_type);
+            }
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+
+        } else {  // s1 > s2
+            container_t *c2 = ra_get_container_at_index(
+                                    &x2->high_low_container, pos2, &type2);
+            // container_t *c2_clone = container_clone(c2, type2);
+            c2 = get_copy_of_container(c2, &type2, is_cow(x2));
+            if (is_cow(x2)) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          type2);
+            }
+            ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
+                                       type2);
+            // x1 grew by one entry in front of the current position
+            pos1++;
+            length1++;
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+        }
+    }
+    if (pos1 == length1) {
+        ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
+                             pos2, length2, is_cow(x2));
+    }
+}
+
+/**
+ * Computes the symmetric difference of x1 and x2 into a new bitmap using
+ * container_lazy_xor. Containers whose XOR turns out empty are not added.
+ */
+roaring_bitmap_t *roaring_bitmap_lazy_xor(const roaring_bitmap_t *x1,
+                                          const roaring_bitmap_t *x2) {
+    uint8_t result_type = 0;
+    const int length1 = x1->high_low_container.size,
+              length2 = x2->high_low_container.size;
+    if (0 == length1) {
+        return roaring_bitmap_copy(x2);
+    }
+    if (0 == length2) {
+        return roaring_bitmap_copy(x1);
+    }
+    roaring_bitmap_t *answer =
+        roaring_bitmap_create_with_capacity(length1 + length2);
+    roaring_bitmap_set_copy_on_write(answer, is_cow(x1) || is_cow(x2));
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+    while (true) {
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(
+                                    &x1->high_low_container, pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(
+                                    &x2->high_low_container, pos2, &type2);
+            container_t *c = container_lazy_xor(
+                                    c1, type1, c2, type2, &result_type);
+
+            if (container_nonzero_cardinality(c, result_type)) {
+                ra_append(&answer->high_low_container, s1, c, result_type);
+            } else {
+                container_free(c, result_type);
+            }
+
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            container_t *c1 = ra_get_container_at_index(
+                                    &x1->high_low_container, pos1, &type1);
+            c1 = get_copy_of_container(c1, &type1, is_cow(x1));
+            if (is_cow(x1)) {
+                ra_set_container_at_index(&x1->high_low_container, pos1, c1,
+                                          type1);
+            }
+            ra_append(&answer->high_low_container, s1, c1, type1);
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+
+        } else {  // s1 > s2
+            container_t *c2 = ra_get_container_at_index(
+                                    &x2->high_low_container, pos2, &type2);
+            c2 = get_copy_of_container(c2, &type2, is_cow(x2));
+            if (is_cow(x2)) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          type2);
+            }
+            ra_append(&answer->high_low_container, s2, c2, type2);
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+        }
+    }
+    if (pos1 == length1) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x2->high_low_container, pos2, length2,
+                             is_cow(x2));
+    } else if (pos2 == length2) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x1->high_low_container, pos1, length1,
+                             is_cow(x1));
+    }
+    return answer;
+}
+
+/**
+ * In-place variant of roaring_bitmap_lazy_xor: x1 becomes the lazy symmetric
+ * difference of x1 and x2. x1 and x2 must be distinct bitmaps (asserted).
+ */
+void roaring_bitmap_lazy_xor_inplace(roaring_bitmap_t *x1,
+                                     const roaring_bitmap_t *x2) {
+    assert(x1 != x2);
+    uint8_t result_type = 0;
+    int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+
+    if (0 == length2) return;
+
+    if (0 == length1) {
+        roaring_bitmap_overwrite(x1, x2);
+        return;
+    }
+    int pos1 = 0, pos2 = 0;
+    uint8_t type1, type2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+    while (true) {
+        if (s1 == s2) {
+            container_t *c1 = ra_get_container_at_index(
+                                    &x1->high_low_container, pos1, &type1);
+            container_t *c2 = ra_get_container_at_index(
+                                    &x2->high_low_container, pos2, &type2);
+
+            // We do the computation "in place" only when c1 is not a shared container.
+            // Rationale: using a shared container safely with in place computation would
+            // require making a copy and then doing the computation in place which is likely
+            // less efficient than avoiding in place entirely and always generating a new
+            // container.
+
+            container_t *c;
+            if (type1 == SHARED_CONTAINER_TYPE) {
+                c = container_lazy_xor(c1, type1, c2, type2, &result_type);
+                shared_container_free(CAST_shared(c1));  // release
+            }
+            else {
+                c = container_lazy_ixor(c1, type1, c2, type2, &result_type);
+            }
+
+            if (container_nonzero_cardinality(c, result_type)) {
+                ra_set_container_at_index(&x1->high_low_container, pos1, c,
+                                          result_type);
+                ++pos1;
+            } else {
+                // the XOR cancelled everything: drop the entry; pos1 now
+                // already designates the next container
+                container_free(c, result_type);
+                ra_remove_at_index(&x1->high_low_container, pos1);
+                --length1;
+            }
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+
+        } else {  // s1 > s2
+            container_t *c2 = ra_get_container_at_index(
+                                    &x2->high_low_container, pos2, &type2);
+            // container_t *c2_clone = container_clone(c2, type2);
+            c2 = get_copy_of_container(c2, &type2, is_cow(x2));
+            if (is_cow(x2)) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          type2);
+            }
+            ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
+                                       type2);
+            pos1++;
+            length1++;
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+        }
+    }
+    if (pos1 == length1) {
+        ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
+                             pos2, length2, is_cow(x2));
+    }
+}
+
+/**
+ * Runs container_repair_after_lazy() on every container of r and stores the
+ * returned container and type code back into the array.
+ */
+void roaring_bitmap_repair_after_lazy(roaring_bitmap_t *r) {
+    roaring_array_t *ra = &r->high_low_container;
+
+    for (int i = 0; i < ra->size; ++i) {
+        const uint8_t old_type = ra->typecodes[i];
+        container_t *old_c = ra->containers[i];
+        uint8_t new_type = old_type;
+        container_t *new_c = container_repair_after_lazy(old_c, &new_type);
+        ra->containers[i] = new_c;
+        ra->typecodes[i] = new_type;
+    }
+}
+
+
+
+/**
+* roaring_bitmap_rank returns the number of integers that are smaller or equal
+* to x.
+*/
+uint64_t roaring_bitmap_rank(const roaring_bitmap_t *bm, uint32_t x) {
+    uint64_t size = 0;
+    uint32_t xhigh = x >> 16;
+    for (int i = 0; i < bm->high_low_container.size; i++) {
+        uint32_t key = bm->high_low_container.keys[i];
+        if (xhigh > key) {
+            // the whole container lies below x
+            size +=
+                container_get_cardinality(bm->high_low_container.containers[i],
+                                          bm->high_low_container.typecodes[i]);
+        } else if (xhigh == key) {
+            // x falls inside this container: add its partial rank and stop
+            return size + container_rank(bm->high_low_container.containers[i],
+                                         bm->high_low_container.typecodes[i],
+                                         x & 0xFFFF);
+        } else {
+            return size;
+        }
+    }
+    return size;
+}
+
+/**
+ * roaring_bitmap_get_index returns the index of x, or -1 if x is not present.
+ */
+int64_t roaring_bitmap_get_index(const roaring_bitmap_t *bm, uint32_t x) {
+    const uint16_t xhigh = x >> 16;
+    // ra_get_index locates the container for the high 16 bits; a negative
+    // result means no such container, hence x is absent.
+    const int32_t high_idx = ra_get_index(&bm->high_low_container, xhigh);
+    if (high_idx < 0) return -1;
+
+    // Position of x inside its own container; negative when x is absent.
+    const int32_t low_idx = container_get_index(
+        bm->high_low_container.containers[high_idx],
+        bm->high_low_container.typecodes[high_idx], x & 0xFFFF);
+    if (low_idx < 0) return -1;
+
+    // Keys are kept in ascending order (ra_get_index relies on it), so every
+    // container before high_idx holds only values smaller than x. The former
+    // "key > xhigh" and fall-through branches could never be reached once the
+    // lookup above had succeeded.
+    int64_t index = low_idx;
+    for (int32_t i = 0; i < high_idx; i++) {
+        index +=
+            container_get_cardinality(bm->high_low_container.containers[i],
+                                      bm->high_low_container.typecodes[i]);
+    }
+    return index;
+}
+
+/**
+* roaring_bitmap_minimum returns the smallest value in the set.
+* Returns UINT32_MAX if the set is empty.
+*/
+uint32_t roaring_bitmap_minimum(const roaring_bitmap_t *bm) {
+    if (bm->high_low_container.size <= 0) {
+        return UINT32_MAX;
+    }
+    // the first container carries the smallest key, so the minimum lives there
+    container_t *c = bm->high_low_container.containers[0];
+    uint8_t type = bm->high_low_container.typecodes[0];
+    uint32_t key = bm->high_low_container.keys[0];
+    uint32_t lowvalue = container_minimum(c, type);
+    return lowvalue | (key << 16);
+}
+
+/**
+* roaring_bitmap_maximum returns the greatest value in the set.
+* Returns 0 if the set is empty.
+*/ +uint32_t roaring_bitmap_maximum(const roaring_bitmap_t *bm) { + if (bm->high_low_container.size > 0) { + container_t *container = + bm->high_low_container.containers[bm->high_low_container.size - 1]; + uint8_t typecode = + bm->high_low_container.typecodes[bm->high_low_container.size - 1]; + uint32_t key = + bm->high_low_container.keys[bm->high_low_container.size - 1]; + uint32_t lowvalue = container_maximum(container, typecode); + return lowvalue | (key << 16); + } + return 0; +} + +bool roaring_bitmap_select(const roaring_bitmap_t *bm, uint32_t rank, + uint32_t *element) { + container_t *container; + uint8_t typecode; + uint16_t key; + uint32_t start_rank = 0; + int i = 0; + bool valid = false; + while (!valid && i < bm->high_low_container.size) { + container = bm->high_low_container.containers[i]; + typecode = bm->high_low_container.typecodes[i]; + valid = + container_select(container, typecode, &start_rank, rank, element); + i++; + } + + if (valid) { + key = bm->high_low_container.keys[i - 1]; + *element |= (((uint32_t)key) << 16); // w/o cast, key promotes signed + return true; + } else + return false; +} + +bool roaring_bitmap_intersect(const roaring_bitmap_t *x1, + const roaring_bitmap_t *x2) { + const int length1 = x1->high_low_container.size, + length2 = x2->high_low_container.size; + uint64_t answer = 0; + int pos1 = 0, pos2 = 0; + + while (pos1 < length1 && pos2 < length2) { + const uint16_t s1 = ra_get_key_at_index(& x1->high_low_container, pos1); + const uint16_t s2 = ra_get_key_at_index(& x2->high_low_container, pos2); + + if (s1 == s2) { + uint8_t type1, type2; + container_t *c1 = ra_get_container_at_index( + &x1->high_low_container, pos1, &type1); + container_t *c2 = ra_get_container_at_index( + &x2->high_low_container, pos2, &type2); + if (container_intersect(c1, type1, c2, type2)) + return true; + ++pos1; + ++pos2; + } else if (s1 < s2) { // s1 < s2 + pos1 = ra_advance_until(& x1->high_low_container, s2, pos1); + } else { // s1 > s2 + pos2 = 
ra_advance_until(& x2->high_low_container, s1, pos2); + } + } + return answer != 0; +} + +bool roaring_bitmap_intersect_with_range(const roaring_bitmap_t *bm, + uint64_t x, uint64_t y) { + if (x >= y) { + // Empty range. + return false; + } + roaring_uint32_iterator_t it; + roaring_init_iterator(bm, &it); + if (!roaring_move_uint32_iterator_equalorlarger(&it, x)) { + // No values above x. + return false; + } + if (it.current_value >= y) { + // No values below y. + return false; + } + return true; +} + + +uint64_t roaring_bitmap_and_cardinality(const roaring_bitmap_t *x1, + const roaring_bitmap_t *x2) { + const int length1 = x1->high_low_container.size, + length2 = x2->high_low_container.size; + uint64_t answer = 0; + int pos1 = 0, pos2 = 0; + while (pos1 < length1 && pos2 < length2) { + const uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1); + const uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2); + + if (s1 == s2) { + uint8_t type1, type2; + container_t *c1 = ra_get_container_at_index( + &x1->high_low_container, pos1, &type1); + container_t *c2 = ra_get_container_at_index( + &x2->high_low_container, pos2, &type2); + answer += container_and_cardinality(c1, type1, c2, type2); + ++pos1; + ++pos2; + } else if (s1 < s2) { // s1 < s2 + pos1 = ra_advance_until(&x1->high_low_container, s2, pos1); + } else { // s1 > s2 + pos2 = ra_advance_until(&x2->high_low_container, s1, pos2); + } + } + return answer; +} + +double roaring_bitmap_jaccard_index(const roaring_bitmap_t *x1, + const roaring_bitmap_t *x2) { + const uint64_t c1 = roaring_bitmap_get_cardinality(x1); + const uint64_t c2 = roaring_bitmap_get_cardinality(x2); + const uint64_t inter = roaring_bitmap_and_cardinality(x1, x2); + return (double)inter / (double)(c1 + c2 - inter); +} + +uint64_t roaring_bitmap_or_cardinality(const roaring_bitmap_t *x1, + const roaring_bitmap_t *x2) { + const uint64_t c1 = roaring_bitmap_get_cardinality(x1); + const uint64_t c2 = 
roaring_bitmap_get_cardinality(x2); + const uint64_t inter = roaring_bitmap_and_cardinality(x1, x2); + return c1 + c2 - inter; +} + +uint64_t roaring_bitmap_andnot_cardinality(const roaring_bitmap_t *x1, + const roaring_bitmap_t *x2) { + const uint64_t c1 = roaring_bitmap_get_cardinality(x1); + const uint64_t inter = roaring_bitmap_and_cardinality(x1, x2); + return c1 - inter; +} + +uint64_t roaring_bitmap_xor_cardinality(const roaring_bitmap_t *x1, + const roaring_bitmap_t *x2) { + const uint64_t c1 = roaring_bitmap_get_cardinality(x1); + const uint64_t c2 = roaring_bitmap_get_cardinality(x2); + const uint64_t inter = roaring_bitmap_and_cardinality(x1, x2); + return c1 + c2 - 2 * inter; +} + + +bool roaring_bitmap_contains(const roaring_bitmap_t *r, uint32_t val) { + const uint16_t hb = val >> 16; + /* + * the next function call involves a binary search and lots of branching. + */ + int32_t i = ra_get_index(&r->high_low_container, hb); + if (i < 0) return false; + + uint8_t typecode; + // next call ought to be cheap + container_t *container = + ra_get_container_at_index(&r->high_low_container, i, &typecode); + // rest might be a tad expensive, possibly involving another round of binary search + return container_contains(container, val & 0xFFFF, typecode); +} + + +/** + * Check whether a range of values from range_start (included) to range_end (excluded) is present + */ +bool roaring_bitmap_contains_range(const roaring_bitmap_t *r, uint64_t range_start, uint64_t range_end) { + if(range_end >= UINT64_C(0x100000000)) { + range_end = UINT64_C(0x100000000); + } + if (range_start >= range_end) return true; // empty range are always contained! 
+ if (range_end - range_start == 1) return roaring_bitmap_contains(r, (uint32_t)range_start); + uint16_t hb_rs = (uint16_t)(range_start >> 16); + uint16_t hb_re = (uint16_t)((range_end - 1) >> 16); + const int32_t span = hb_re - hb_rs; + const int32_t hlc_sz = ra_get_size(&r->high_low_container); + if (hlc_sz < span + 1) { + return false; + } + int32_t is = ra_get_index(&r->high_low_container, hb_rs); + int32_t ie = ra_get_index(&r->high_low_container, hb_re); + if ((ie < 0) || (is < 0) || ((ie - is) != span) || ie >= hlc_sz) { + return false; + } + const uint32_t lb_rs = range_start & 0xFFFF; + const uint32_t lb_re = ((range_end - 1) & 0xFFFF) + 1; + uint8_t type; + container_t *c = ra_get_container_at_index(&r->high_low_container, is, + &type); + if (hb_rs == hb_re) { + return container_contains_range(c, lb_rs, lb_re, type); + } + if (!container_contains_range(c, lb_rs, 1 << 16, type)) { + return false; + } + c = ra_get_container_at_index(&r->high_low_container, ie, &type); + if (!container_contains_range(c, 0, lb_re, type)) { + return false; + } + for (int32_t i = is + 1; i < ie; ++i) { + c = ra_get_container_at_index(&r->high_low_container, i, &type); + if (!container_is_full(c, type) ) { + return false; + } + } + return true; +} + + +bool roaring_bitmap_is_strict_subset(const roaring_bitmap_t *r1, + const roaring_bitmap_t *r2) { + return (roaring_bitmap_get_cardinality(r2) > + roaring_bitmap_get_cardinality(r1) && + roaring_bitmap_is_subset(r1, r2)); +} + + +/* + * FROZEN SERIALIZATION FORMAT DESCRIPTION + * + * -- (beginning must be aligned by 32 bytes) -- + * uint64_t[BITSET_CONTAINER_SIZE_IN_WORDS * num_bitset_containers] + * rle16_t[total number of rle elements in all run containers] + * uint16_t[total number of array elements in all array containers] + * uint16_t[num_containers] + * uint16_t[num_containers] + * uint8_t[num_containers] + *
uint32_t + * + *
The header is a 4-byte value which is a bit union of FROZEN_COOKIE (15 bits) + * and the number of containers (17 bits). + * + * The counts array stores the number of elements for every container. + * Its meaning depends on the container type. + * For array and bitset containers, this value is the container cardinality minus one. + * For run containers, it is the number of rle16_t elements (n_runs). + * + * The bitset, run and array data zones are flat arrays of the elements of + * all containers of the respective type. + * + * The data zones and the keys are kept close together because they are not accessed + * during deserialization. This may reduce IO in case of large mmapped bitmaps. + * All members have their native alignment during deserialization except the header
/*
 * Bump-pointer helper: returns the current position of *arena and moves
 * *arena forward by num_bytes. It performs no bounds or alignment checks;
 * the caller is responsible for having sized the underlying buffer.
 */
inline static void *arena_alloc(char **arena, size_t num_bytes) {
    char *const chunk_start = *arena;
    *arena = chunk_start + num_bytes;
    return (void *)chunk_start;
}
+ */ + + const roaring_array_t *ra = &rb->high_low_container; + + size_t bitset_zone_size = 0; + size_t run_zone_size = 0; + size_t array_zone_size = 0; + for (int32_t i = 0; i < ra->size; i++) { + switch (ra->typecodes[i]) { + case BITSET_CONTAINER_TYPE: { + bitset_zone_size += + BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t); + break; + } + case RUN_CONTAINER_TYPE: { + const run_container_t *rc = const_CAST_run(ra->containers[i]); + run_zone_size += rc->n_runs * sizeof(rle16_t); + break; + } + case ARRAY_CONTAINER_TYPE: { + const array_container_t *ac = + const_CAST_array(ra->containers[i]); + array_zone_size += ac->cardinality * sizeof(uint16_t); + break; + } + default: + roaring_unreachable; + } + } + + uint64_t *bitset_zone = (uint64_t *)arena_alloc(&buf, bitset_zone_size); + rle16_t *run_zone = (rle16_t *)arena_alloc(&buf, run_zone_size); + uint16_t *array_zone = (uint16_t *)arena_alloc(&buf, array_zone_size); + uint16_t *key_zone = (uint16_t *)arena_alloc(&buf, 2*ra->size); + uint16_t *count_zone = (uint16_t *)arena_alloc(&buf, 2*ra->size); + uint8_t *typecode_zone = (uint8_t *)arena_alloc(&buf, ra->size); + uint32_t *header_zone = (uint32_t *)arena_alloc(&buf, 4); + + for (int32_t i = 0; i < ra->size; i++) { + uint16_t count; + switch (ra->typecodes[i]) { + case BITSET_CONTAINER_TYPE: { + const bitset_container_t *bc = + const_CAST_bitset(ra->containers[i]); + memcpy(bitset_zone, bc->words, + BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t)); + bitset_zone += BITSET_CONTAINER_SIZE_IN_WORDS; + if (bc->cardinality != BITSET_UNKNOWN_CARDINALITY) { + count = bc->cardinality - 1; + } else { + count = bitset_container_compute_cardinality(bc) - 1; + } + break; + } + case RUN_CONTAINER_TYPE: { + const run_container_t *rc = const_CAST_run(ra->containers[i]); + size_t num_bytes = rc->n_runs * sizeof(rle16_t); + memcpy(run_zone, rc->runs, num_bytes); + run_zone += rc->n_runs; + count = rc->n_runs; + break; + } + case ARRAY_CONTAINER_TYPE: { + const 
array_container_t *ac = + const_CAST_array(ra->containers[i]); + size_t num_bytes = ac->cardinality * sizeof(uint16_t); + memcpy(array_zone, ac->array, num_bytes); + array_zone += ac->cardinality; + count = ac->cardinality - 1; + break; + } + default: + roaring_unreachable; + } + memcpy(&count_zone[i], &count, 2); + } + memcpy(key_zone, ra->keys, ra->size * sizeof(uint16_t)); + memcpy(typecode_zone, ra->typecodes, ra->size * sizeof(uint8_t)); + uint32_t header = ((uint32_t)ra->size << 15) | FROZEN_COOKIE; + memcpy(header_zone, &header, 4); +} + +const roaring_bitmap_t * +roaring_bitmap_frozen_view(const char *buf, size_t length) { + if ((uintptr_t)buf % 32 != 0) { + return NULL; + } + + // cookie and num_containers + if (length < 4) { + return NULL; + } + uint32_t header; + memcpy(&header, buf + length - 4, 4); // header may be misaligned + if ((header & 0x7FFF) != FROZEN_COOKIE) { + return NULL; + } + int32_t num_containers = (header >> 15); + + // typecodes, counts and keys + if (length < 4 + (size_t)num_containers * (1 + 2 + 2)) { + return NULL; + } + uint16_t *keys = (uint16_t *)(buf + length - 4 - num_containers * 5); + uint16_t *counts = (uint16_t *)(buf + length - 4 - num_containers * 3); + uint8_t *typecodes = (uint8_t *)(buf + length - 4 - num_containers * 1); + + // {bitset,array,run}_zone + int32_t num_bitset_containers = 0; + int32_t num_run_containers = 0; + int32_t num_array_containers = 0; + size_t bitset_zone_size = 0; + size_t run_zone_size = 0; + size_t array_zone_size = 0; + for (int32_t i = 0; i < num_containers; i++) { + switch (typecodes[i]) { + case BITSET_CONTAINER_TYPE: + num_bitset_containers++; + bitset_zone_size += BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t); + break; + case RUN_CONTAINER_TYPE: + num_run_containers++; + run_zone_size += counts[i] * sizeof(rle16_t); + break; + case ARRAY_CONTAINER_TYPE: + num_array_containers++; + array_zone_size += (counts[i] + UINT32_C(1)) * sizeof(uint16_t); + break; + default: + return NULL; + } 
+ } + if (length != bitset_zone_size + run_zone_size + array_zone_size + + 5 * num_containers + 4) { + return NULL; + } + uint64_t *bitset_zone = (uint64_t*) (buf); + rle16_t *run_zone = (rle16_t*) (buf + bitset_zone_size); + uint16_t *array_zone = (uint16_t*) (buf + bitset_zone_size + run_zone_size); + + size_t alloc_size = 0; + alloc_size += sizeof(roaring_bitmap_t); + alloc_size += num_containers * sizeof(container_t*); + alloc_size += num_bitset_containers * sizeof(bitset_container_t); + alloc_size += num_run_containers * sizeof(run_container_t); + alloc_size += num_array_containers * sizeof(array_container_t); + + char *arena = (char *)roaring_malloc(alloc_size); + if (arena == NULL) { + return NULL; + } + + roaring_bitmap_t *rb = (roaring_bitmap_t *) + arena_alloc(&arena, sizeof(roaring_bitmap_t)); + rb->high_low_container.flags = ROARING_FLAG_FROZEN; + rb->high_low_container.allocation_size = num_containers; + rb->high_low_container.size = num_containers; + rb->high_low_container.keys = (uint16_t *)keys; + rb->high_low_container.typecodes = (uint8_t *)typecodes; + rb->high_low_container.containers = + (container_t **)arena_alloc(&arena, + sizeof(container_t*) * num_containers); + // Ensure offset of high_low_container.containers is known distance used in + // C++ wrapper. sizeof(roaring_bitmap_t) is used as it is the size of the + // only allocation that precedes high_low_container.containers. If this is + // changed (new allocation or changed order), this offset will also need to + // be changed in the C++ wrapper. 
+ assert(rb == + (roaring_bitmap_t *)((char *)rb->high_low_container.containers - + sizeof(roaring_bitmap_t))); + for (int32_t i = 0; i < num_containers; i++) { + switch (typecodes[i]) { + case BITSET_CONTAINER_TYPE: { + bitset_container_t *bitset = (bitset_container_t *) + arena_alloc(&arena, sizeof(bitset_container_t)); + bitset->words = bitset_zone; + bitset->cardinality = counts[i] + UINT32_C(1); + rb->high_low_container.containers[i] = bitset; + bitset_zone += BITSET_CONTAINER_SIZE_IN_WORDS; + break; + } + case RUN_CONTAINER_TYPE: { + run_container_t *run = (run_container_t *) + arena_alloc(&arena, sizeof(run_container_t)); + run->capacity = counts[i]; + run->n_runs = counts[i]; + run->runs = run_zone; + rb->high_low_container.containers[i] = run; + run_zone += run->n_runs; + break; + } + case ARRAY_CONTAINER_TYPE: { + array_container_t *array = (array_container_t *) + arena_alloc(&arena, sizeof(array_container_t)); + array->capacity = counts[i] + UINT32_C(1); + array->cardinality = counts[i] + UINT32_C(1); + array->array = array_zone; + rb->high_low_container.containers[i] = array; + array_zone += counts[i] + UINT32_C(1); + break; + } + default: + roaring_free(arena); + return NULL; + } + } + + return rb; +} + +ALLOW_UNALIGNED +roaring_bitmap_t *roaring_bitmap_portable_deserialize_frozen(const char *buf) { + char *start_of_buf = (char *) buf; + uint32_t cookie; + int32_t num_containers; + uint16_t *descriptive_headers; + uint32_t *offset_headers = NULL; + const char *run_flag_bitset = NULL; + bool hasrun = false; + + // deserialize cookie + memcpy(&cookie, buf, sizeof(uint32_t)); + buf += sizeof(uint32_t); + if (cookie == SERIAL_COOKIE_NO_RUNCONTAINER) { + memcpy(&num_containers, buf, sizeof(int32_t)); + buf += sizeof(int32_t); + descriptive_headers = (uint16_t *) buf; + buf += num_containers * 2 * sizeof(uint16_t); + offset_headers = (uint32_t *) buf; + buf += num_containers * sizeof(uint32_t); + } else if ((cookie & 0xFFFF) == SERIAL_COOKIE) { + 
num_containers = (cookie >> 16) + 1; + hasrun = true; + int32_t run_flag_bitset_size = (num_containers + 7) / 8; + run_flag_bitset = buf; + buf += run_flag_bitset_size; + descriptive_headers = (uint16_t *) buf; + buf += num_containers * 2 * sizeof(uint16_t); + if(num_containers >= NO_OFFSET_THRESHOLD) { + offset_headers = (uint32_t *) buf; + buf += num_containers * sizeof(uint32_t); + } + } else { + return NULL; + } + + // calculate total size for allocation + int32_t num_bitset_containers = 0; + int32_t num_run_containers = 0; + int32_t num_array_containers = 0; + + for (int32_t i = 0; i < num_containers; i++) { + uint16_t tmp; + memcpy(&tmp, descriptive_headers + 2*i+1, sizeof(tmp)); + uint32_t cardinality = tmp + 1; + bool isbitmap = (cardinality > DEFAULT_MAX_SIZE); + bool isrun = false; + if(hasrun) { + if((run_flag_bitset[i / 8] & (1 << (i % 8))) != 0) { + isbitmap = false; + isrun = true; + } + } + + if (isbitmap) { + num_bitset_containers++; + } else if (isrun) { + num_run_containers++; + } else { + num_array_containers++; + } + } + + size_t alloc_size = 0; + alloc_size += sizeof(roaring_bitmap_t); + alloc_size += num_containers * sizeof(container_t*); + alloc_size += num_bitset_containers * sizeof(bitset_container_t); + alloc_size += num_run_containers * sizeof(run_container_t); + alloc_size += num_array_containers * sizeof(array_container_t); + alloc_size += num_containers * sizeof(uint16_t); // keys + alloc_size += num_containers * sizeof(uint8_t); // typecodes + + // allocate bitmap and construct containers + char *arena = (char *)roaring_malloc(alloc_size); + if (arena == NULL) { + return NULL; + } + + roaring_bitmap_t *rb = (roaring_bitmap_t *) + arena_alloc(&arena, sizeof(roaring_bitmap_t)); + rb->high_low_container.flags = ROARING_FLAG_FROZEN; + rb->high_low_container.allocation_size = num_containers; + rb->high_low_container.size = num_containers; + rb->high_low_container.containers = + (container_t **)arena_alloc(&arena, + sizeof(container_t*) * 
num_containers); + + uint16_t *keys = (uint16_t *)arena_alloc(&arena, num_containers * sizeof(uint16_t)); + uint8_t *typecodes = (uint8_t *)arena_alloc(&arena, num_containers * sizeof(uint8_t)); + + rb->high_low_container.keys = keys; + rb->high_low_container.typecodes = typecodes; + + for (int32_t i = 0; i < num_containers; i++) { + uint16_t tmp; + memcpy(&tmp, descriptive_headers + 2*i+1, sizeof(tmp)); + int32_t cardinality = tmp + 1; + bool isbitmap = (cardinality > DEFAULT_MAX_SIZE); + bool isrun = false; + if(hasrun) { + if((run_flag_bitset[i / 8] & (1 << (i % 8))) != 0) { + isbitmap = false; + isrun = true; + } + } + + keys[i] = descriptive_headers[2*i]; + + if (isbitmap) { + typecodes[i] = BITSET_CONTAINER_TYPE; + bitset_container_t *c = (bitset_container_t *)arena_alloc(&arena, sizeof(bitset_container_t)); + c->cardinality = cardinality; + if(offset_headers != NULL) { + c->words = (uint64_t *) (start_of_buf + offset_headers[i]); + } else { + c->words = (uint64_t *) buf; + buf += BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t); + } + rb->high_low_container.containers[i] = c; + } else if (isrun) { + typecodes[i] = RUN_CONTAINER_TYPE; + run_container_t *c = (run_container_t *)arena_alloc(&arena, sizeof(run_container_t)); + c->capacity = cardinality; + uint16_t n_runs; + if(offset_headers != NULL) { + memcpy(&n_runs, start_of_buf + offset_headers[i], sizeof(uint16_t)); + c->n_runs = n_runs; + c->runs = (rle16_t *) (start_of_buf + offset_headers[i] + sizeof(uint16_t)); + } else { + memcpy(&n_runs, buf, sizeof(uint16_t)); + c->n_runs = n_runs; + buf += sizeof(uint16_t); + c->runs = (rle16_t *) buf; + buf += c->n_runs * sizeof(rle16_t); + } + rb->high_low_container.containers[i] = c; + } else { + typecodes[i] = ARRAY_CONTAINER_TYPE; + array_container_t *c = (array_container_t *)arena_alloc(&arena, sizeof(array_container_t)); + c->cardinality = cardinality; + c->capacity = cardinality; + if(offset_headers != NULL) { + c->array = (uint16_t *) (start_of_buf + 
offset_headers[i]); + } else { + c->array = (uint16_t *) buf; + buf += cardinality * sizeof(uint16_t); + } + rb->high_low_container.containers[i] = c; + } + } + + return rb; +} + +bool roaring_bitmap_to_bitset(const roaring_bitmap_t *r, bitset_t * bitset) { + uint32_t max_value = roaring_bitmap_maximum(r); + size_t new_array_size = (size_t)(((uint64_t)max_value + 63)/64); + bool resize_ok = bitset_resize(bitset, new_array_size, true); + if(!resize_ok) { return false; } + const roaring_array_t *ra = &r->high_low_container; + for (int i = 0; i < ra->size; ++i) { + uint64_t* words = bitset->array + (ra->keys[i]<<10); + uint8_t type = ra->typecodes[i]; + const container_t *c = ra->containers[i]; + if(type == SHARED_CONTAINER_TYPE) { + c = container_unwrap_shared(c, &type); + } + switch (type) { + case BITSET_CONTAINER_TYPE: + { + size_t max_word_index = new_array_size - (ra->keys[i]<<10); + if(max_word_index > 1024) { max_word_index = 1024; } + const bitset_container_t *src = const_CAST_bitset(c); + memcpy(words, src->words, max_word_index * sizeof(uint64_t)); + } + break; + case ARRAY_CONTAINER_TYPE: + { + const array_container_t *src = const_CAST_array(c); + bitset_set_list(words, src->array, src->cardinality); + } + break; + case RUN_CONTAINER_TYPE: + { + const run_container_t *src = const_CAST_run(c); + for (int32_t rlepos = 0; rlepos < src->n_runs; ++rlepos) { + rle16_t rle = src->runs[rlepos]; + bitset_set_lenrange(words, rle.value, rle.length); + } + } + break; + default: + roaring_unreachable; + } + } + return true; +} + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { +#endif +/* end file src/roaring.c */ +/* begin file src/roaring_array.c */ +#include +#include +#include +#include +#include +#include + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif + +// Convention: [0,ra->size) all elements are initialized +// [ra->size, ra->allocation_size) is junk and contains nothing needing freeing + +extern inline 
/*
 * Re-create the backing storage of `ra` with room for `new_capacity` entries.
 *
 * The three parallel arrays (containers, keys, typecodes) live back to back
 * in one allocation, which is why C's realloc() cannot be used: growing the
 * block would leave keys and typecodes at the wrong offsets
 * (https://github.com/RoaringBitmap/CRoaring/issues/256).
 *
 * The first ra->size entries are copied over. A capacity of 0 releases the
 * storage. Returns false, leaving `ra` untouched, if the allocation fails.
 */
static bool realloc_array(roaring_array_t *ra, int32_t new_capacity) {
    if (new_capacity == 0) {
        roaring_free(ra->containers);
        ra->allocation_size = 0;
        ra->typecodes = NULL;
        ra->keys = NULL;
        ra->containers = NULL;
        return true;
    }

    const size_t bytes_per_entry =
        sizeof(uint16_t) + sizeof(container_t *) + sizeof(uint8_t);
    const size_t total_bytes = new_capacity * bytes_per_entry;
    char *block = (char *)roaring_malloc(total_bytes);
    if (block == NULL) {
        return false;
    }

    // Layout inside the block: containers | keys | typecodes
    container_t **containers_out = (container_t **)block;
    uint16_t *keys_out = (uint16_t *)(containers_out + new_capacity);
    uint8_t *typecodes_out = (uint8_t *)(keys_out + new_capacity);
    assert((char *)(typecodes_out + new_capacity) == block + total_bytes);

    void *previous_block = ra->containers;
    const int32_t live = ra->size;
    if (live > 0) {
        memcpy(containers_out, ra->containers, sizeof(container_t *) * live);
        memcpy(keys_out, ra->keys, sizeof(uint16_t) * live);
        memcpy(typecodes_out, ra->typecodes, sizeof(uint8_t) * live);
    }

    ra->allocation_size = new_capacity;
    ra->typecodes = typecodes_out;
    ra->keys = keys_out;
    ra->containers = containers_out;
    // keys and typecodes shared the old block with containers
    roaring_free(previous_block);
    return true;
}
uint32_t cap) { + if (!new_ra) return false; + ra_init(new_ra); + + if (cap > INT32_MAX) { return false; } + + if(cap > 0) { + void *bigalloc = roaring_malloc(cap * + (sizeof(uint16_t) + sizeof(container_t *) + sizeof(uint8_t))); + if( bigalloc == NULL ) return false; + new_ra->containers = (container_t **)bigalloc; + new_ra->keys = (uint16_t *)(new_ra->containers + cap); + new_ra->typecodes = (uint8_t *)(new_ra->keys + cap); + // Narrowing is safe because of above check + new_ra->allocation_size = (int32_t)cap; + } + return true; +} + +int ra_shrink_to_fit(roaring_array_t *ra) { + int savings = (ra->allocation_size - ra->size) * + (sizeof(uint16_t) + sizeof(container_t *) + sizeof(uint8_t)); + if (!realloc_array(ra, ra->size)) { + return 0; + } + ra->allocation_size = ra->size; + return savings; +} + +void ra_init(roaring_array_t *new_ra) { + if (!new_ra) { return; } + new_ra->keys = NULL; + new_ra->containers = NULL; + new_ra->typecodes = NULL; + + new_ra->allocation_size = 0; + new_ra->size = 0; + new_ra->flags = 0; +} + +bool ra_overwrite(const roaring_array_t *source, roaring_array_t *dest, + bool copy_on_write) { + ra_clear_containers(dest); // we are going to overwrite them + if (source->size == 0) { // Note: can't call memcpy(NULL), even w/size + dest->size = 0; // <--- This is important. + return true; // output was just cleared, so they match + } + if (dest->allocation_size < source->size) { + if (!realloc_array(dest, source->size)) { + return false; + } + } + dest->size = source->size; + memcpy(dest->keys, source->keys, dest->size * sizeof(uint16_t)); + // we go through the containers, turning them into shared containers... 
+ if (copy_on_write) { + for (int32_t i = 0; i < dest->size; ++i) { + source->containers[i] = get_copy_of_container( + source->containers[i], &source->typecodes[i], copy_on_write); + } + // we do a shallow copy to the other bitmap + memcpy(dest->containers, source->containers, + dest->size * sizeof(container_t *)); + memcpy(dest->typecodes, source->typecodes, + dest->size * sizeof(uint8_t)); + } else { + memcpy(dest->typecodes, source->typecodes, + dest->size * sizeof(uint8_t)); + for (int32_t i = 0; i < dest->size; i++) { + dest->containers[i] = + container_clone(source->containers[i], source->typecodes[i]); + if (dest->containers[i] == NULL) { + for (int32_t j = 0; j < i; j++) { + container_free(dest->containers[j], dest->typecodes[j]); + } + ra_clear_without_containers(dest); + return false; + } + } + } + return true; +} + +void ra_clear_containers(roaring_array_t *ra) { + for (int32_t i = 0; i < ra->size; ++i) { + container_free(ra->containers[i], ra->typecodes[i]); + } +} + +void ra_reset(roaring_array_t *ra) { + ra_clear_containers(ra); + ra->size = 0; + ra_shrink_to_fit(ra); +} + +void ra_clear_without_containers(roaring_array_t *ra) { + roaring_free(ra->containers); // keys and typecodes are allocated with containers + ra->size = 0; + ra->allocation_size = 0; + ra->containers = NULL; + ra->keys = NULL; + ra->typecodes = NULL; +} + +void ra_clear(roaring_array_t *ra) { + ra_clear_containers(ra); + ra_clear_without_containers(ra); +} + +bool extend_array(roaring_array_t *ra, int32_t k) { + int32_t desired_size = ra->size + k; + const int32_t max_containers = 65536; + assert(desired_size <= max_containers); + if (desired_size > ra->allocation_size) { + int32_t new_capacity = + (ra->size < 1024) ? 
2 * desired_size : 5 * desired_size / 4; + if (new_capacity > max_containers) { + new_capacity = max_containers; + } + + return realloc_array(ra, new_capacity); + } + return true; +} + +void ra_append( + roaring_array_t *ra, uint16_t key, + container_t *c, uint8_t typecode +){ + extend_array(ra, 1); + const int32_t pos = ra->size; + + ra->keys[pos] = key; + ra->containers[pos] = c; + ra->typecodes[pos] = typecode; + ra->size++; +} + +void ra_append_copy(roaring_array_t *ra, const roaring_array_t *sa, + uint16_t index, bool copy_on_write) { + extend_array(ra, 1); + const int32_t pos = ra->size; + + // old contents is junk not needing freeing + ra->keys[pos] = sa->keys[index]; + // the shared container will be in two bitmaps + if (copy_on_write) { + sa->containers[index] = get_copy_of_container( + sa->containers[index], &sa->typecodes[index], copy_on_write); + ra->containers[pos] = sa->containers[index]; + ra->typecodes[pos] = sa->typecodes[index]; + } else { + ra->containers[pos] = + container_clone(sa->containers[index], sa->typecodes[index]); + ra->typecodes[pos] = sa->typecodes[index]; + } + ra->size++; +} + +void ra_append_copies_until(roaring_array_t *ra, const roaring_array_t *sa, + uint16_t stopping_key, bool copy_on_write) { + for (int32_t i = 0; i < sa->size; ++i) { + if (sa->keys[i] >= stopping_key) break; + ra_append_copy(ra, sa, i, copy_on_write); + } +} + +void ra_append_copy_range(roaring_array_t *ra, const roaring_array_t *sa, + int32_t start_index, int32_t end_index, + bool copy_on_write) { + extend_array(ra, end_index - start_index); + for (int32_t i = start_index; i < end_index; ++i) { + const int32_t pos = ra->size; + ra->keys[pos] = sa->keys[i]; + if (copy_on_write) { + sa->containers[i] = get_copy_of_container( + sa->containers[i], &sa->typecodes[i], copy_on_write); + ra->containers[pos] = sa->containers[i]; + ra->typecodes[pos] = sa->typecodes[i]; + } else { + ra->containers[pos] = + container_clone(sa->containers[i], sa->typecodes[i]); + 
ra->typecodes[pos] = sa->typecodes[i]; + } + ra->size++; + } +} + +void ra_append_copies_after(roaring_array_t *ra, const roaring_array_t *sa, + uint16_t before_start, bool copy_on_write) { + int start_location = ra_get_index(sa, before_start); + if (start_location >= 0) + ++start_location; + else + start_location = -start_location - 1; + ra_append_copy_range(ra, sa, start_location, sa->size, copy_on_write); +} + +void ra_append_move_range(roaring_array_t *ra, roaring_array_t *sa, + int32_t start_index, int32_t end_index) { + extend_array(ra, end_index - start_index); + + for (int32_t i = start_index; i < end_index; ++i) { + const int32_t pos = ra->size; + + ra->keys[pos] = sa->keys[i]; + ra->containers[pos] = sa->containers[i]; + ra->typecodes[pos] = sa->typecodes[i]; + ra->size++; + } +} + +void ra_append_range(roaring_array_t *ra, roaring_array_t *sa, + int32_t start_index, int32_t end_index, + bool copy_on_write) { + extend_array(ra, end_index - start_index); + + for (int32_t i = start_index; i < end_index; ++i) { + const int32_t pos = ra->size; + ra->keys[pos] = sa->keys[i]; + if (copy_on_write) { + sa->containers[i] = get_copy_of_container( + sa->containers[i], &sa->typecodes[i], copy_on_write); + ra->containers[pos] = sa->containers[i]; + ra->typecodes[pos] = sa->typecodes[i]; + } else { + ra->containers[pos] = + container_clone(sa->containers[i], sa->typecodes[i]); + ra->typecodes[pos] = sa->typecodes[i]; + } + ra->size++; + } +} + +container_t *ra_get_container( + roaring_array_t *ra, uint16_t x, uint8_t *typecode +){ + int i = binarySearch(ra->keys, (int32_t)ra->size, x); + if (i < 0) return NULL; + *typecode = ra->typecodes[i]; + return ra->containers[i]; +} + +extern inline container_t *ra_get_container_at_index( + const roaring_array_t *ra, uint16_t i, + uint8_t *typecode); + +extern inline uint16_t ra_get_key_at_index(const roaring_array_t *ra, + uint16_t i); + +extern inline int32_t ra_get_index(const roaring_array_t *ra, uint16_t x); + +extern 
inline int32_t ra_advance_until(const roaring_array_t *ra, uint16_t x, + int32_t pos); + +// everything skipped over is freed +int32_t ra_advance_until_freeing(roaring_array_t *ra, uint16_t x, int32_t pos) { + while (pos < ra->size && ra->keys[pos] < x) { + container_free(ra->containers[pos], ra->typecodes[pos]); + ++pos; + } + return pos; +} + +void ra_insert_new_key_value_at( + roaring_array_t *ra, int32_t i, uint16_t key, + container_t *c, uint8_t typecode +){ + extend_array(ra, 1); + // May be an optimization opportunity with DIY memmove + memmove(&(ra->keys[i + 1]), &(ra->keys[i]), + sizeof(uint16_t) * (ra->size - i)); + memmove(&(ra->containers[i + 1]), &(ra->containers[i]), + sizeof(container_t *) * (ra->size - i)); + memmove(&(ra->typecodes[i + 1]), &(ra->typecodes[i]), + sizeof(uint8_t) * (ra->size - i)); + ra->keys[i] = key; + ra->containers[i] = c; + ra->typecodes[i] = typecode; + ra->size++; +} + +// note: Java routine set things to 0, enabling GC. +// Java called it "resize" but it was always used to downsize. +// Allowing upsize would break the conventions about +// valid containers below ra->size. + +void ra_downsize(roaring_array_t *ra, int32_t new_length) { + assert(new_length <= ra->size); + ra->size = new_length; +} + +void ra_remove_at_index(roaring_array_t *ra, int32_t i) { + memmove(&(ra->containers[i]), &(ra->containers[i + 1]), + sizeof(container_t *) * (ra->size - i - 1)); + memmove(&(ra->keys[i]), &(ra->keys[i + 1]), + sizeof(uint16_t) * (ra->size - i - 1)); + memmove(&(ra->typecodes[i]), &(ra->typecodes[i + 1]), + sizeof(uint8_t) * (ra->size - i - 1)); + ra->size--; +} + +void ra_remove_at_index_and_free(roaring_array_t *ra, int32_t i) { + container_free(ra->containers[i], ra->typecodes[i]); + ra_remove_at_index(ra, i); +} + +// used in inplace andNot only, to slide left the containers from +// the mutated RoaringBitmap that are after the largest container of +// the argument RoaringBitmap. 
In use it should be followed by a call to +// downsize. +// +void ra_copy_range(roaring_array_t *ra, uint32_t begin, uint32_t end, + uint32_t new_begin) { + assert(begin <= end); + assert(new_begin < begin); + + const int range = end - begin; + + // We ensure to previously have freed overwritten containers + // that are not copied elsewhere + + memmove(&(ra->containers[new_begin]), &(ra->containers[begin]), + sizeof(container_t *) * range); + memmove(&(ra->keys[new_begin]), &(ra->keys[begin]), + sizeof(uint16_t) * range); + memmove(&(ra->typecodes[new_begin]), &(ra->typecodes[begin]), + sizeof(uint8_t) * range); +} + +void ra_shift_tail(roaring_array_t *ra, int32_t count, int32_t distance) { + if (distance > 0) { + extend_array(ra, distance); + } + int32_t srcpos = ra->size - count; + int32_t dstpos = srcpos + distance; + memmove(&(ra->keys[dstpos]), &(ra->keys[srcpos]), + sizeof(uint16_t) * count); + memmove(&(ra->containers[dstpos]), &(ra->containers[srcpos]), + sizeof(container_t *) * count); + memmove(&(ra->typecodes[dstpos]), &(ra->typecodes[srcpos]), + sizeof(uint8_t) * count); + ra->size += distance; +} + + +void ra_to_uint32_array(const roaring_array_t *ra, uint32_t *ans) { + size_t ctr = 0; + for (int32_t i = 0; i < ra->size; ++i) { + int num_added = container_to_uint32_array( + ans + ctr, ra->containers[i], ra->typecodes[i], + ((uint32_t)ra->keys[i]) << 16); + ctr += num_added; + } +} + +bool ra_range_uint32_array(const roaring_array_t *ra, size_t offset, size_t limit, uint32_t *ans) { + size_t ctr = 0; + size_t dtr = 0; + + size_t t_limit = 0; + + bool first = false; + size_t first_skip = 0; + + uint32_t *t_ans = NULL; + size_t cur_len = 0; + + for (int i = 0; i < ra->size; ++i) { + + const container_t *c = container_unwrap_shared( + ra->containers[i], &ra->typecodes[i]); + switch (ra->typecodes[i]) { + case BITSET_CONTAINER_TYPE: + t_limit = (const_CAST_bitset(c))->cardinality; + break; + case ARRAY_CONTAINER_TYPE: + t_limit = 
(const_CAST_array(c))->cardinality; + break; + case RUN_CONTAINER_TYPE: + t_limit = run_container_cardinality(const_CAST_run(c)); + break; + } + if (ctr + t_limit - 1 >= offset && ctr < offset + limit){ + if (!first){ + //first_skip = t_limit - (ctr + t_limit - offset); + first_skip = offset - ctr; + first = true; + t_ans = (uint32_t *)roaring_malloc(sizeof(*t_ans) * (first_skip + limit)); + if(t_ans == NULL) { + return false; + } + memset(t_ans, 0, sizeof(*t_ans) * (first_skip + limit)) ; + cur_len = first_skip + limit; + } + if (dtr + t_limit > cur_len){ + uint32_t * append_ans = (uint32_t *)roaring_malloc(sizeof(*append_ans) * (cur_len + t_limit)); + if(append_ans == NULL) { + if(t_ans != NULL) roaring_free(t_ans); + return false; + } + memset(append_ans, 0, sizeof(*append_ans) * (cur_len + t_limit)); + cur_len = cur_len + t_limit; + memcpy(append_ans, t_ans, dtr * sizeof(uint32_t)); + roaring_free(t_ans); + t_ans = append_ans; + } + switch (ra->typecodes[i]) { + case BITSET_CONTAINER_TYPE: + container_to_uint32_array( + t_ans + dtr, + const_CAST_bitset(c), ra->typecodes[i], + ((uint32_t)ra->keys[i]) << 16); + break; + case ARRAY_CONTAINER_TYPE: + container_to_uint32_array( + t_ans + dtr, + const_CAST_array(c), ra->typecodes[i], + ((uint32_t)ra->keys[i]) << 16); + break; + case RUN_CONTAINER_TYPE: + container_to_uint32_array( + t_ans + dtr, + const_CAST_run(c), ra->typecodes[i], + ((uint32_t)ra->keys[i]) << 16); + break; + } + dtr += t_limit; + } + ctr += t_limit; + if (dtr-first_skip >= limit) break; + } + if(t_ans != NULL) { + memcpy(ans, t_ans+first_skip, limit * sizeof(uint32_t)); + free(t_ans); + } + return true; +} + +bool ra_has_run_container(const roaring_array_t *ra) { + for (int32_t k = 0; k < ra->size; ++k) { + if (get_container_type(ra->containers[k], ra->typecodes[k]) == + RUN_CONTAINER_TYPE) + return true; + } + return false; +} + +uint32_t ra_portable_header_size(const roaring_array_t *ra) { + if (ra_has_run_container(ra)) { + if (ra->size < + 
NO_OFFSET_THRESHOLD) { // for small bitmaps, we omit the offsets + return 4 + (ra->size + 7) / 8 + 4 * ra->size; + } + return 4 + (ra->size + 7) / 8 + + 8 * ra->size; // - 4 because we pack the size with the cookie + } else { + return 4 + 4 + 8 * ra->size; + } +} + +size_t ra_portable_size_in_bytes(const roaring_array_t *ra) { + size_t count = ra_portable_header_size(ra); + + for (int32_t k = 0; k < ra->size; ++k) { + count += container_size_in_bytes(ra->containers[k], ra->typecodes[k]); + } + return count; +} + +// This function is endian-sensitive. +size_t ra_portable_serialize(const roaring_array_t *ra, char *buf) { + char *initbuf = buf; + uint32_t startOffset = 0; + bool hasrun = ra_has_run_container(ra); + if (hasrun) { + uint32_t cookie = SERIAL_COOKIE | ((ra->size - 1) << 16); + memcpy(buf, &cookie, sizeof(cookie)); + buf += sizeof(cookie); + uint32_t s = (ra->size + 7) / 8; + uint8_t *bitmapOfRunContainers = (uint8_t *)roaring_calloc(s, 1); + assert(bitmapOfRunContainers != NULL); // todo: handle + for (int32_t i = 0; i < ra->size; ++i) { + if (get_container_type(ra->containers[i], ra->typecodes[i]) == + RUN_CONTAINER_TYPE) { + bitmapOfRunContainers[i / 8] |= (1 << (i % 8)); + } + } + memcpy(buf, bitmapOfRunContainers, s); + buf += s; + roaring_free(bitmapOfRunContainers); + if (ra->size < NO_OFFSET_THRESHOLD) { + startOffset = 4 + 4 * ra->size + s; + } else { + startOffset = 4 + 8 * ra->size + s; + } + } else { // backwards compatibility + uint32_t cookie = SERIAL_COOKIE_NO_RUNCONTAINER; + + memcpy(buf, &cookie, sizeof(cookie)); + buf += sizeof(cookie); + memcpy(buf, &ra->size, sizeof(ra->size)); + buf += sizeof(ra->size); + + startOffset = 4 + 4 + 4 * ra->size + 4 * ra->size; + } + for (int32_t k = 0; k < ra->size; ++k) { + memcpy(buf, &ra->keys[k], sizeof(ra->keys[k])); + buf += sizeof(ra->keys[k]); + // get_cardinality returns a value in [1,1<<16], subtracting one + // we get [0,1<<16 - 1] which fits in 16 bits + uint16_t card = (uint16_t)( + 
container_get_cardinality(ra->containers[k], ra->typecodes[k]) - 1); + memcpy(buf, &card, sizeof(card)); + buf += sizeof(card); + } + if ((!hasrun) || (ra->size >= NO_OFFSET_THRESHOLD)) { + // writing the containers offsets + for (int32_t k = 0; k < ra->size; k++) { + memcpy(buf, &startOffset, sizeof(startOffset)); + buf += sizeof(startOffset); + startOffset = + startOffset + + container_size_in_bytes(ra->containers[k], ra->typecodes[k]); + } + } + for (int32_t k = 0; k < ra->size; ++k) { + buf += container_write(ra->containers[k], ra->typecodes[k], buf); + } + return buf - initbuf; +} + +// Quickly checks whether there is a serialized bitmap at the pointer, +// not exceeding size "maxbytes" in bytes. This function does not allocate +// memory dynamically. +// +// This function returns 0 if and only if no valid bitmap is found. +// Otherwise, it returns how many bytes are occupied. +// +size_t ra_portable_deserialize_size(const char *buf, const size_t maxbytes) { + size_t bytestotal = sizeof(int32_t);// for cookie + if(bytestotal > maxbytes) return 0; + uint32_t cookie; + memcpy(&cookie, buf, sizeof(int32_t)); + buf += sizeof(uint32_t); + if ((cookie & 0xFFFF) != SERIAL_COOKIE && + cookie != SERIAL_COOKIE_NO_RUNCONTAINER) { + return 0; + } + int32_t size; + + if ((cookie & 0xFFFF) == SERIAL_COOKIE) + size = (cookie >> 16) + 1; + else { + bytestotal += sizeof(int32_t); + if(bytestotal > maxbytes) return 0; + memcpy(&size, buf, sizeof(int32_t)); + buf += sizeof(uint32_t); + } + if (size > (1<<16)) { + return 0; // logically impossible + } + char *bitmapOfRunContainers = NULL; + bool hasrun = (cookie & 0xFFFF) == SERIAL_COOKIE; + if (hasrun) { + int32_t s = (size + 7) / 8; + bytestotal += s; + if(bytestotal > maxbytes) return 0; + bitmapOfRunContainers = (char *)buf; + buf += s; + } + bytestotal += size * 2 * sizeof(uint16_t); + if(bytestotal > maxbytes) return 0; + uint16_t *keyscards = (uint16_t *)buf; + buf += size * 2 * sizeof(uint16_t); + if ((!hasrun) || (size 
>= NO_OFFSET_THRESHOLD)) { + // skipping the offsets + bytestotal += size * 4; + if(bytestotal > maxbytes) return 0; + buf += size * 4; + } + // Reading the containers + for (int32_t k = 0; k < size; ++k) { + uint16_t tmp; + memcpy(&tmp, keyscards + 2*k+1, sizeof(tmp)); + uint32_t thiscard = tmp + 1; + bool isbitmap = (thiscard > DEFAULT_MAX_SIZE); + bool isrun = false; + if(hasrun) { + if((bitmapOfRunContainers[k / 8] & (1 << (k % 8))) != 0) { + isbitmap = false; + isrun = true; + } + } + if (isbitmap) { + size_t containersize = BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t); + bytestotal += containersize; + if(bytestotal > maxbytes) return 0; + buf += containersize; + } else if (isrun) { + bytestotal += sizeof(uint16_t); + if(bytestotal > maxbytes) return 0; + uint16_t n_runs; + memcpy(&n_runs, buf, sizeof(uint16_t)); + buf += sizeof(uint16_t); + size_t containersize = n_runs * sizeof(rle16_t); + bytestotal += containersize; + if(bytestotal > maxbytes) return 0; + buf += containersize; + } else { + size_t containersize = thiscard * sizeof(uint16_t); + bytestotal += containersize; + if(bytestotal > maxbytes) return 0; + buf += containersize; + } + } + return bytestotal; +} + +// this function populates answer from the content of buf (reading up to maxbytes bytes). +// The function returns false if a properly serialized bitmap cannot be found. +// if it returns true, readbytes is populated by how many bytes were read, we have that *readbytes <= maxbytes. +// +// This function is endian-sensitive. 
+bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const size_t maxbytes, size_t * readbytes) { + *readbytes = sizeof(int32_t);// for cookie + if(*readbytes > maxbytes) { + fprintf(stderr, "Ran out of bytes while reading first 4 bytes.\n"); + return false; + } + uint32_t cookie; + memcpy(&cookie, buf, sizeof(int32_t)); + buf += sizeof(uint32_t); + if ((cookie & 0xFFFF) != SERIAL_COOKIE && + cookie != SERIAL_COOKIE_NO_RUNCONTAINER) { + fprintf(stderr, "I failed to find one of the right cookies. Found %" PRIu32 "\n", + cookie); + return false; + } + int32_t size; + + if ((cookie & 0xFFFF) == SERIAL_COOKIE) + size = (cookie >> 16) + 1; + else { + *readbytes += sizeof(int32_t); + if(*readbytes > maxbytes) { + fprintf(stderr, "Ran out of bytes while reading second part of the cookie.\n"); + return false; + } + memcpy(&size, buf, sizeof(int32_t)); + buf += sizeof(uint32_t); + } + if (size < 0) { + fprintf(stderr, "You cannot have a negative number of containers, the data must be corrupted: %" PRId32 "\n", + size); + return false; // logically impossible + } + if (size > (1<<16)) { + fprintf(stderr, "You cannot have so many containers, the data must be corrupted: %" PRId32 "\n", + size); + return false; // logically impossible + } + const char *bitmapOfRunContainers = NULL; + bool hasrun = (cookie & 0xFFFF) == SERIAL_COOKIE; + if (hasrun) { + int32_t s = (size + 7) / 8; + *readbytes += s; + if(*readbytes > maxbytes) {// data is corrupted? + fprintf(stderr, "Ran out of bytes while reading run bitmap.\n"); + return false; + } + bitmapOfRunContainers = buf; + buf += s; + } + uint16_t *keyscards = (uint16_t *)buf; + + *readbytes += size * 2 * sizeof(uint16_t); + if(*readbytes > maxbytes) { + fprintf(stderr, "Ran out of bytes while reading key-cardinality array.\n"); + return false; + } + buf += size * 2 * sizeof(uint16_t); + + bool is_ok = ra_init_with_capacity(answer, size); + if (!is_ok) { + fprintf(stderr, "Failed to allocate memory for roaring array. 
Bailing out.\n"); + return false; + } + + for (int32_t k = 0; k < size; ++k) { + uint16_t tmp; + memcpy(&tmp, keyscards + 2*k, sizeof(tmp)); + answer->keys[k] = tmp; + } + if ((!hasrun) || (size >= NO_OFFSET_THRESHOLD)) { + *readbytes += size * 4; + if(*readbytes > maxbytes) {// data is corrupted? + fprintf(stderr, "Ran out of bytes while reading offsets.\n"); + ra_clear(answer);// we need to clear the containers already allocated, and the roaring array + return false; + } + + // skipping the offsets + buf += size * 4; + } + // Reading the containers + for (int32_t k = 0; k < size; ++k) { + uint16_t tmp; + memcpy(&tmp, keyscards + 2*k+1, sizeof(tmp)); + uint32_t thiscard = tmp + 1; + bool isbitmap = (thiscard > DEFAULT_MAX_SIZE); + bool isrun = false; + if(hasrun) { + if((bitmapOfRunContainers[k / 8] & (1 << (k % 8))) != 0) { + isbitmap = false; + isrun = true; + } + } + if (isbitmap) { + // we check that the read is allowed + size_t containersize = BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t); + *readbytes += containersize; + if(*readbytes > maxbytes) { + fprintf(stderr, "Running out of bytes while reading a bitset container.\n"); + ra_clear(answer);// we need to clear the containers already allocated, and the roaring array + return false; + } + // it is now safe to read + bitset_container_t *c = bitset_container_create(); + if(c == NULL) {// memory allocation failure + fprintf(stderr, "Failed to allocate memory for a bitset container.\n"); + ra_clear(answer);// we need to clear the containers already allocated, and the roaring array + return false; + } + answer->size++; + buf += bitset_container_read(thiscard, c, buf); + answer->containers[k] = c; + answer->typecodes[k] = BITSET_CONTAINER_TYPE; + } else if (isrun) { + // we check that the read is allowed + *readbytes += sizeof(uint16_t); + if(*readbytes > maxbytes) { + fprintf(stderr, "Running out of bytes while reading a run container (header).\n"); + ra_clear(answer);// we need to clear the containers 
already allocated, and the roaring array + return false; + } + uint16_t n_runs; + memcpy(&n_runs, buf, sizeof(uint16_t)); + size_t containersize = n_runs * sizeof(rle16_t); + *readbytes += containersize; + if(*readbytes > maxbytes) {// data is corrupted? + fprintf(stderr, "Running out of bytes while reading a run container.\n"); + ra_clear(answer);// we need to clear the containers already allocated, and the roaring array + return false; + } + // it is now safe to read + + run_container_t *c = run_container_create(); + if(c == NULL) {// memory allocation failure + fprintf(stderr, "Failed to allocate memory for a run container.\n"); + ra_clear(answer);// we need to clear the containers already allocated, and the roaring array + return false; + } + answer->size++; + buf += run_container_read(thiscard, c, buf); + answer->containers[k] = c; + answer->typecodes[k] = RUN_CONTAINER_TYPE; + } else { + // we check that the read is allowed + size_t containersize = thiscard * sizeof(uint16_t); + *readbytes += containersize; + if(*readbytes > maxbytes) {// data is corrupted? 
+ fprintf(stderr, "Running out of bytes while reading an array container.\n"); + ra_clear(answer);// we need to clear the containers already allocated, and the roaring array + return false; + } + // it is now safe to read + array_container_t *c = + array_container_create_given_capacity(thiscard); + if(c == NULL) {// memory allocation failure + fprintf(stderr, "Failed to allocate memory for an array container.\n"); + ra_clear(answer);// we need to clear the containers already allocated, and the roaring array + return false; + } + answer->size++; + buf += array_container_read(thiscard, c, buf); + answer->containers[k] = c; + answer->typecodes[k] = ARRAY_CONTAINER_TYPE; + } + } + return true; +} + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif +/* end file src/roaring_array.c */ +/* begin file src/roaring_priority_queue.c */ + + +#ifdef __cplusplus +using namespace ::roaring::internal; + +extern "C" { namespace roaring { namespace api { +#endif + +struct roaring_pq_element_s { + uint64_t size; + bool is_temporary; + roaring_bitmap_t *bitmap; +}; + +typedef struct roaring_pq_element_s roaring_pq_element_t; + +struct roaring_pq_s { + roaring_pq_element_t *elements; + uint64_t size; +}; + +typedef struct roaring_pq_s roaring_pq_t; + +static inline bool compare(roaring_pq_element_t *t1, roaring_pq_element_t *t2) { + return t1->size < t2->size; +} + +static void pq_add(roaring_pq_t *pq, roaring_pq_element_t *t) { + uint64_t i = pq->size; + pq->elements[pq->size++] = *t; + while (i > 0) { + uint64_t p = (i - 1) >> 1; + roaring_pq_element_t ap = pq->elements[p]; + if (!compare(t, &ap)) break; + pq->elements[i] = ap; + i = p; + } + pq->elements[i] = *t; +} + +static void pq_free(roaring_pq_t *pq) { + roaring_free(pq); +} + +static void percolate_down(roaring_pq_t *pq, uint32_t i) { + uint32_t size = (uint32_t)pq->size; + uint32_t hsize = size >> 1; + roaring_pq_element_t ai = pq->elements[i]; + while (i < hsize) { + uint32_t l = (i 
<< 1) + 1; + uint32_t r = l + 1; + roaring_pq_element_t bestc = pq->elements[l]; + if (r < size) { + if (compare(pq->elements + r, &bestc)) { + l = r; + bestc = pq->elements[r]; + } + } + if (!compare(&bestc, &ai)) { + break; + } + pq->elements[i] = bestc; + i = l; + } + pq->elements[i] = ai; +} + +static roaring_pq_t *create_pq(const roaring_bitmap_t **arr, uint32_t length) { + size_t alloc_size = sizeof(roaring_pq_t) + sizeof(roaring_pq_element_t) * length; + roaring_pq_t *answer = (roaring_pq_t *)roaring_malloc(alloc_size); + answer->elements = (roaring_pq_element_t *)(answer + 1); + answer->size = length; + for (uint32_t i = 0; i < length; i++) { + answer->elements[i].bitmap = (roaring_bitmap_t *)arr[i]; + answer->elements[i].is_temporary = false; + answer->elements[i].size = + roaring_bitmap_portable_size_in_bytes(arr[i]); + } + for (int32_t i = (length >> 1); i >= 0; i--) { + percolate_down(answer, i); + } + return answer; +} + +static roaring_pq_element_t pq_poll(roaring_pq_t *pq) { + roaring_pq_element_t ans = *pq->elements; + if (pq->size > 1) { + pq->elements[0] = pq->elements[--pq->size]; + percolate_down(pq, 0); + } else + --pq->size; + // memmove(pq->elements,pq->elements+1,(pq->size-1)*sizeof(roaring_pq_element_t));--pq->size; + return ans; +} + +// this function consumes and frees the inputs +static roaring_bitmap_t *lazy_or_from_lazy_inputs(roaring_bitmap_t *x1, + roaring_bitmap_t *x2) { + uint8_t result_type = 0; + const int length1 = ra_get_size(&x1->high_low_container), + length2 = ra_get_size(&x2->high_low_container); + if (0 == length1) { + roaring_bitmap_free(x1); + return x2; + } + if (0 == length2) { + roaring_bitmap_free(x2); + return x1; + } + uint32_t neededcap = length1 > length2 ? 
length2 : length1; + roaring_bitmap_t *answer = roaring_bitmap_create_with_capacity(neededcap); + int pos1 = 0, pos2 = 0; + uint8_t type1, type2; + uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1); + uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2); + while (true) { + if (s1 == s2) { + // todo: unsharing can be inefficient as it may create a clone where + // none + // is needed, but it has the benefit of being easy to reason about. + + ra_unshare_container_at_index(&x1->high_low_container, pos1); + container_t *c1 = ra_get_container_at_index( + &x1->high_low_container, pos1, &type1); + assert(type1 != SHARED_CONTAINER_TYPE); + + ra_unshare_container_at_index(&x2->high_low_container, pos2); + container_t *c2 = ra_get_container_at_index( + &x2->high_low_container, pos2, &type2); + assert(type2 != SHARED_CONTAINER_TYPE); + + container_t *c; + + if ((type2 == BITSET_CONTAINER_TYPE) && + (type1 != BITSET_CONTAINER_TYPE) + ){ + c = container_lazy_ior(c2, type2, c1, type1, &result_type); + container_free(c1, type1); + if (c != c2) { + container_free(c2, type2); + } + } else { + c = container_lazy_ior(c1, type1, c2, type2, &result_type); + container_free(c2, type2); + if (c != c1) { + container_free(c1, type1); + } + } + // since we assume that the initial containers are non-empty, the + // result here + // can only be non-empty + ra_append(&answer->high_low_container, s1, c, result_type); + ++pos1; + ++pos2; + if (pos1 == length1) break; + if (pos2 == length2) break; + s1 = ra_get_key_at_index(&x1->high_low_container, pos1); + s2 = ra_get_key_at_index(&x2->high_low_container, pos2); + + } else if (s1 < s2) { // s1 < s2 + container_t *c1 = ra_get_container_at_index( + &x1->high_low_container, pos1, &type1); + ra_append(&answer->high_low_container, s1, c1, type1); + pos1++; + if (pos1 == length1) break; + s1 = ra_get_key_at_index(&x1->high_low_container, pos1); + + } else { // s1 > s2 + container_t *c2 = ra_get_container_at_index( + 
&x2->high_low_container, pos2, &type2); + ra_append(&answer->high_low_container, s2, c2, type2); + pos2++; + if (pos2 == length2) break; + s2 = ra_get_key_at_index(&x2->high_low_container, pos2); + } + } + if (pos1 == length1) { + ra_append_move_range(&answer->high_low_container, + &x2->high_low_container, pos2, length2); + } else if (pos2 == length2) { + ra_append_move_range(&answer->high_low_container, + &x1->high_low_container, pos1, length1); + } + ra_clear_without_containers(&x1->high_low_container); + ra_clear_without_containers(&x2->high_low_container); + roaring_free(x1); + roaring_free(x2); + return answer; +} + +/** + * Compute the union of 'number' bitmaps using a heap. This can + * sometimes be faster than roaring_bitmap_or_many which uses + * a naive algorithm. Caller is responsible for freeing the + * result. + */ +roaring_bitmap_t *roaring_bitmap_or_many_heap(uint32_t number, + const roaring_bitmap_t **x) { + if (number == 0) { + return roaring_bitmap_create(); + } + if (number == 1) { + return roaring_bitmap_copy(x[0]); + } + roaring_pq_t *pq = create_pq(x, number); + while (pq->size > 1) { + roaring_pq_element_t x1 = pq_poll(pq); + roaring_pq_element_t x2 = pq_poll(pq); + + if (x1.is_temporary && x2.is_temporary) { + roaring_bitmap_t *newb = + lazy_or_from_lazy_inputs(x1.bitmap, x2.bitmap); + // should normally return a fresh new bitmap *except* that + // it can return x1.bitmap or x2.bitmap in degenerate cases + bool temporary = !((newb == x1.bitmap) && (newb == x2.bitmap)); + uint64_t bsize = roaring_bitmap_portable_size_in_bytes(newb); + roaring_pq_element_t newelement = { + .size = bsize, .is_temporary = temporary, .bitmap = newb}; + pq_add(pq, &newelement); + } else if (x2.is_temporary) { + roaring_bitmap_lazy_or_inplace(x2.bitmap, x1.bitmap, false); + x2.size = roaring_bitmap_portable_size_in_bytes(x2.bitmap); + pq_add(pq, &x2); + } else if (x1.is_temporary) { + roaring_bitmap_lazy_or_inplace(x1.bitmap, x2.bitmap, false); + x1.size = 
roaring_bitmap_portable_size_in_bytes(x1.bitmap); + + pq_add(pq, &x1); + } else { + roaring_bitmap_t *newb = + roaring_bitmap_lazy_or(x1.bitmap, x2.bitmap, false); + uint64_t bsize = roaring_bitmap_portable_size_in_bytes(newb); + roaring_pq_element_t newelement = { + .size = bsize, .is_temporary = true, .bitmap = newb}; + + pq_add(pq, &newelement); + } + } + roaring_pq_element_t X = pq_poll(pq); + roaring_bitmap_t *answer = X.bitmap; + roaring_bitmap_repair_after_lazy(answer); + pq_free(pq); + return answer; +} + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace api { +#endif +/* end file src/roaring_priority_queue.c */ diff --git a/src/roaring.h b/src/roaring.h new file mode 100644 index 00000000..4b1769f1 --- /dev/null +++ b/src/roaring.h @@ -0,0 +1,2017 @@ +// !!! DO NOT EDIT - THIS IS AN AUTO-GENERATED FILE !!! +// Created by amalgamation.sh on 2023-06-08T13:19:03Z + +/* + * The CRoaring project is under a dual license (Apache/MIT). + * Users of the library may choose one or the other license. + */ +/* + * Copyright 2016-2022 The CRoaring authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * MIT License + * + * Copyright 2016-2022 The CRoaring authors + * + * Permission is hereby granted, free of charge, to any + * person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the + * Software without restriction, including without + * limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software + * is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice + * shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF + * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED + * TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT + * SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR + * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * SPDX-License-Identifier: MIT + */ + +/* begin file include/roaring/roaring_version.h */ +// /include/roaring/roaring_version.h automatically generated by release.py, do not change by hand +#ifndef ROARING_INCLUDE_ROARING_VERSION +#define ROARING_INCLUDE_ROARING_VERSION +#define ROARING_VERSION "1.1.5" +enum { + ROARING_VERSION_MAJOR = 1, + ROARING_VERSION_MINOR = 1, + ROARING_VERSION_REVISION = 5 +}; +#endif // ROARING_INCLUDE_ROARING_VERSION +/* end file include/roaring/roaring_version.h */ +/* begin file include/roaring/roaring_types.h */ +/* + Typedefs used by various components +*/ + +#ifndef ROARING_TYPES_H +#define ROARING_TYPES_H + +#include +#include + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace api { +#endif + + +/** + * When building .c files as C++, there's added compile-time checking if the + * container types are derived from a `container_t` base class. So long as + * such a base class is empty, the struct will behave compatibly with C structs + * despite the derivation. This is due to the Empty Base Class Optimization: + * + * https://en.cppreference.com/w/cpp/language/ebo + * + * But since C isn't namespaced, taking `container_t` globally might collide + * with other projects. So roaring.h uses ROARING_CONTAINER_T, while internal + * code #undefs that after declaring `typedef ROARING_CONTAINER_T container_t;` + */ +#if defined(__cplusplus) + extern "C++" { + struct container_s {}; + } + #define ROARING_CONTAINER_T ::roaring::api::container_s +#else + #define ROARING_CONTAINER_T void // no compile-time checking +#endif + +#define ROARING_FLAG_COW UINT8_C(0x1) +#define ROARING_FLAG_FROZEN UINT8_C(0x2) + +/** + * Roaring arrays are array-based key-value pairs having containers as values + * and 16-bit integer keys. A roaring bitmap might be implemented as such. + */ + +// parallel arrays. Element sizes quite different. +// Alternative is array +// of structs. 
Which would have better +// cache performance through binary searches? + +typedef struct roaring_array_s { + int32_t size; + int32_t allocation_size; + ROARING_CONTAINER_T **containers; // Use container_t in non-API files! + uint16_t *keys; + uint8_t *typecodes; + uint8_t flags; +} roaring_array_t; + + +typedef bool (*roaring_iterator)(uint32_t value, void *param); +typedef bool (*roaring_iterator64)(uint64_t value, void *param); + +/** +* (For advanced users.) +* The roaring_statistics_t can be used to collect detailed statistics about +* the composition of a roaring bitmap. +*/ +typedef struct roaring_statistics_s { + uint32_t n_containers; /* number of containers */ + + uint32_t n_array_containers; /* number of array containers */ + uint32_t n_run_containers; /* number of run containers */ + uint32_t n_bitset_containers; /* number of bitmap containers */ + + uint32_t + n_values_array_containers; /* number of values in array containers */ + uint32_t n_values_run_containers; /* number of values in run containers */ + uint32_t + n_values_bitset_containers; /* number of values in bitmap containers */ + + uint32_t n_bytes_array_containers; /* number of allocated bytes in array + containers */ + uint32_t n_bytes_run_containers; /* number of allocated bytes in run + containers */ + uint32_t n_bytes_bitset_containers; /* number of allocated bytes in bitmap + containers */ + + uint32_t + max_value; /* the maximal value, undefined if cardinality is zero */ + uint32_t + min_value; /* the minimal value, undefined if cardinality is zero */ + uint64_t sum_value; /* the sum of all values (could be used to compute + average) */ + + uint64_t cardinality; /* total number of values stored in the bitmap */ + + // and n_values_arrays, n_values_rle, n_values_bitmap +} roaring_statistics_t; + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace api { +#endif + +#endif /* ROARING_TYPES_H */ +/* end file include/roaring/roaring_types.h */ +/* begin file 
include/roaring/portability.h */ +/* + * portability.h + * + */ + + /** + * All macros should be prefixed with either CROARING or ROARING. + * The library uses both ROARING_... + * as well as CROAIRING_ as prefixes. The ROARING_ prefix is for + * macros that are provided by the build system or that are closely + * related to the format. The header macros may also use ROARING_. + * The CROARING_ prefix is for internal macros that a user is unlikely + * to ever interact with. + */ + +#ifndef INCLUDE_PORTABILITY_H_ +#define INCLUDE_PORTABILITY_H_ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif // _GNU_SOURCE +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS 1 +#endif // __STDC_FORMAT_MACROS + +#ifdef _MSC_VER +#define CROARING_VISUAL_STUDIO 1 +/** + * We want to differentiate carefully between + * clang under visual studio and regular visual + * studio. + */ +#ifdef __clang__ +// clang under visual studio +#define CROARING_CLANG_VISUAL_STUDIO 1 +#else +// just regular visual studio (best guess) +#define CROARING_REGULAR_VISUAL_STUDIO 1 +#endif // __clang__ +#endif // _MSC_VER +#ifndef CROARING_VISUAL_STUDIO +#define CROARING_VISUAL_STUDIO 0 +#endif +#ifndef CROARING_CLANG_VISUAL_STUDIO +#define CROARING_CLANG_VISUAL_STUDIO 0 +#endif +#ifndef CROARING_REGULAR_VISUAL_STUDIO +#define CROARING_REGULAR_VISUAL_STUDIO 0 +#endif + +#if defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE < 200809L) +#undef _POSIX_C_SOURCE +#endif + +#ifndef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200809L +#endif // !(defined(_POSIX_C_SOURCE)) || (_POSIX_C_SOURCE < 200809L) +#if !(defined(_XOPEN_SOURCE)) || (_XOPEN_SOURCE < 700) +#define _XOPEN_SOURCE 700 +#endif // !(defined(_XOPEN_SOURCE)) || (_XOPEN_SOURCE < 700) + +#include +#include +#include // will provide posix_memalign with _POSIX_C_SOURCE as defined above +#ifdef __GLIBC__ +#include // this should never be needed but there are some reports that it is needed. 
+#endif + +#ifdef __cplusplus +extern "C" { // portability definitions are in global scope, not a namespace +#endif + +#if defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ != 8 +#error This code assumes 64-bit long longs (by use of the GCC intrinsics). Your system is not currently supported. +#endif + +#if CROARING_REGULAR_VISUAL_STUDIO +#ifndef __restrict__ +#define __restrict__ __restrict +#endif // __restrict__ +#endif // CROARING_REGULAR_VISUAL_STUDIO + + + +#if defined(__x86_64__) || defined(_M_X64) +// we have an x64 processor +#define CROARING_IS_X64 1 + +#if defined(_MSC_VER) && (_MSC_VER < 1910) +// Old visual studio systems won't support AVX2 well. +#undef CROARING_IS_X64 +#endif + +#if defined(__clang_major__) && (__clang_major__<= 8) && !defined(__AVX2__) +// Older versions of clang have a bug affecting us +// https://stackoverflow.com/questions/57228537/how-does-one-use-pragma-clang-attribute-push-with-c-namespaces +#undef CROARING_IS_X64 +#endif + +#ifdef ROARING_DISABLE_X64 +#undef CROARING_IS_X64 +#endif +// we include the intrinsic header +#if !CROARING_REGULAR_VISUAL_STUDIO +/* Non-Microsoft C/C++-compatible compiler */ +#include // on some recent GCC, this will declare posix_memalign + + + +#if CROARING_CLANG_VISUAL_STUDIO + +/** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + * e.g., if __AVX2__ is set... in turn, we normally set these + * macros by compiling against the corresponding architecture + * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole + * software with these advanced instructions. These headers would + * normally guard against such usage, but we carefully included + * (or ) before, so the headers + * are fooled. 
+ */ +#include // for _blsr_u64 +#include // for __lzcnt64 +#include // for most things (AVX2, AVX512, _popcnt64) +#include +#include +#include +#include +#include +#if _MSC_VER >= 1920 +// Important: we need the AVX-512 headers: +#include +#include +#include +#include +#include +#include +#include +#include +#endif // _MSC_VER >= 1920 +// unfortunately, we may not get _blsr_u64, but, thankfully, clang +// has it as a macro. +#ifndef _blsr_u64 +// we roll our own +#define _blsr_u64(n) ((n - 1) & n) +#endif // _blsr_u64 +#endif // SIMDJSON_CLANG_VISUAL_STUDIO + + +#endif // CROARING_REGULAR_VISUAL_STUDIO +#endif // defined(__x86_64__) || defined(_M_X64) + +#if !defined(CROARING_USENEON) && !defined(DISABLENEON) && defined(__ARM_NEON) +# define CROARING_USENEON +#endif +#if defined(CROARING_USENEON) +# include +#endif + +#if !CROARING_REGULAR_VISUAL_STUDIO +/* Non-Microsoft C/C++-compatible compiler, assumes that it supports inline + * assembly */ +#define CROARING_INLINE_ASM 1 +#endif // _MSC_VER + +#if CROARING_REGULAR_VISUAL_STUDIO +/* Microsoft C/C++-compatible compiler */ +#include + +#ifndef __clang__ // if one compiles with MSVC *with* clang, then these + // intrinsics are defined!!! +#define CROARING_INTRINSICS 1 +// sadly there is no way to check whether we are missing these intrinsics +// specifically. + +/* wrappers for Visual Studio built-ins that look like gcc built-ins __builtin_ctzll */ +/* result might be undefined when input_num is zero */ +static inline int roaring_trailing_zeroes(unsigned long long input_num) { + unsigned long index; +#ifdef _WIN64 // highly recommended!!! 
+ _BitScanForward64(&index, input_num); +#else // if we must support 32-bit Windows + if ((uint32_t)input_num != 0) { + _BitScanForward(&index, (uint32_t)input_num); + } else { + _BitScanForward(&index, (uint32_t)(input_num >> 32)); + index += 32; + } +#endif // _WIN64 + return index; +} + +/* wrappers for Visual Studio built-ins that look like gcc built-ins __builtin_clzll */ +/* result might be undefined when input_num is zero */ +inline int roaring_leading_zeroes(unsigned long long input_num) { + unsigned long index; +#ifdef _WIN64 // highly recommended!!! + _BitScanReverse64(&index, input_num); +#else // if we must support 32-bit Windows + if (input_num > 0xFFFFFFFF) { + _BitScanReverse(&index, (uint32_t)(input_num >> 32)); + index += 32; + } else { + _BitScanReverse(&index, (uint32_t)(input_num)); + } +#endif // _WIN64 + return 63 - index; +} + +/* Use #define so this is effective even under /Ob0 (no inline) */ +#define roaring_unreachable __assume(0) +#endif // __clang__ + +#endif // CROARING_REGULAR_VISUAL_STUDIO + +#ifndef CROARING_INTRINSICS +#define CROARING_INTRINSICS 1 +#define roaring_unreachable __builtin_unreachable() +static inline int roaring_trailing_zeroes(unsigned long long input_num) { return __builtin_ctzll(input_num); } +static inline int roaring_leading_zeroes(unsigned long long input_num) { return __builtin_clzll(input_num); } +#endif + +#if CROARING_REGULAR_VISUAL_STUDIO +#define ALIGNED(x) __declspec(align(x)) +#elif defined(__GNUC__) || defined(__clang__) +#define ALIGNED(x) __attribute__((aligned(x))) +#else +#warning "Warning. Unrecognized compiler." +#define ALIGNED(x) +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define WARN_UNUSED __attribute__((warn_unused_result)) +#else +#define WARN_UNUSED +#endif + +#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100) + +#ifdef CROARING_USENEON +// we can always compute the popcount fast. 
+#elif (defined(_M_ARM) || defined(_M_ARM64)) && ((defined(_WIN64) || defined(_WIN32)) && defined(CROARING_REGULAR_VISUAL_STUDIO) && CROARING_REGULAR_VISUAL_STUDIO) +// we will need this function: +static inline int roaring_hamming_backup(uint64_t x) { + uint64_t c1 = UINT64_C(0x5555555555555555); + uint64_t c2 = UINT64_C(0x3333333333333333); + uint64_t c4 = UINT64_C(0x0F0F0F0F0F0F0F0F); + x -= (x >> 1) & c1; + x = (( x >> 2) & c2) + (x & c2); x=(x +(x>>4))&c4; + x *= UINT64_C(0x0101010101010101); + return x >> 56; +} +#endif + + +static inline int roaring_hamming(uint64_t x) { +#if defined(_WIN64) && defined(CROARING_REGULAR_VISUAL_STUDIO) && CROARING_REGULAR_VISUAL_STUDIO +#ifdef CROARING_USENEON + return vaddv_u8(vcnt_u8(vcreate_u8(input_num))); +#elif defined(_M_ARM64) + return roaring_hamming_backup(x); + // (int) _CountOneBits64(x); is unavailable +#else // _M_ARM64 + return (int) __popcnt64(x); +#endif // _M_ARM64 +#elif defined(_WIN32) && defined(CROARING_REGULAR_VISUAL_STUDIO) && CROARING_REGULAR_VISUAL_STUDIO +#ifdef _M_ARM + return roaring_hamming_backup(x); + // _CountOneBits is unavailable +#else // _M_ARM + return (int) __popcnt(( unsigned int)x) + (int) __popcnt(( unsigned int)(x>>32)); +#endif // _M_ARM +#else + return __builtin_popcountll(x); +#endif +} + +#ifndef UINT64_C +#define UINT64_C(c) (c##ULL) +#endif // UINT64_C + +#ifndef UINT32_C +#define UINT32_C(c) (c##UL) +#endif // UINT32_C + +#ifdef __cplusplus +} // extern "C" { +#endif // __cplusplus + + +// this is almost standard? +#undef STRINGIFY_IMPLEMENTATION_ +#undef STRINGIFY +#define STRINGIFY_IMPLEMENTATION_(a) #a +#define STRINGIFY(a) STRINGIFY_IMPLEMENTATION_(a) + +// Our fast kernels require 64-bit systems. +// +// On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions. +// Furthermore, the number of SIMD registers is reduced. +// +// On 32-bit ARM, we would have smaller registers. +// +// The library should still have the fallback kernel. 
It is +// slower, but it should run everywhere. + +// +// Enable valid runtime implementations, and select CROARING_BUILTIN_IMPLEMENTATION +// + +// We are going to use runtime dispatch. +#if CROARING_IS_X64 +#ifdef __clang__ +// clang does not have GCC push pop +// warning: clang attribute push can't be used within a namespace in clang up +// til 8.0 so CROARING_TARGET_REGION and CROARING_UNTARGET_REGION must be *outside* of a +// namespace. +#define CROARING_TARGET_REGION(T) \ + _Pragma(STRINGIFY( \ + clang attribute push(__attribute__((target(T))), apply_to = function))) +#define CROARING_UNTARGET_REGION _Pragma("clang attribute pop") +#elif defined(__GNUC__) +// GCC is easier +#define CROARING_TARGET_REGION(T) \ + _Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T))) +#define CROARING_UNTARGET_REGION _Pragma("GCC pop_options") +#endif // clang then gcc + +#endif // CROARING_IS_X64 + +// Default target region macros don't do anything. +#ifndef CROARING_TARGET_REGION +#define CROARING_TARGET_REGION(T) +#define CROARING_UNTARGET_REGION +#endif + + +#define CROARING_TARGET_AVX2 CROARING_TARGET_REGION("avx2,bmi,pclmul,lzcnt,popcnt") +#define CROARING_TARGET_AVX512 CROARING_TARGET_REGION("avx2,bmi,bmi2,pclmul,lzcnt,popcnt,avx512f,avx512dq,avx512bw,avx512vbmi2,avx512bitalg,avx512vpopcntdq") +#define CROARING_UNTARGET_AVX2 CROARING_UNTARGET_REGION +#define CROARING_UNTARGET_AVX512 CROARING_UNTARGET_REGION + +#ifdef __AVX2__ +// No need for runtime dispatching. +// It is unnecessary and harmful to old clang to tag regions. +#undef CROARING_TARGET_AVX2 +#define CROARING_TARGET_AVX2 +#undef CROARING_UNTARGET_AVX2 +#define CROARING_UNTARGET_AVX2 +#endif + +#if defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VBMI2__) && defined(__AVX512BITALG__) && defined(__AVX512VPOPCNTDQ__) +// No need for runtime dispatching. +// It is unnecessary and harmful to old clang to tag regions. 
+#undef CROARING_TARGET_AVX512 +#define CROARING_TARGET_AVX512 +#undef CROARING_UNTARGET_AVX512 +#define CROARING_UNTARGET_AVX512 +#endif + +// Allow unaligned memory access +#if defined(__GNUC__) || defined(__clang__) +#define ALLOW_UNALIGNED __attribute__((no_sanitize("alignment"))) +#else +#define ALLOW_UNALIGNED +#endif + +#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) + #define CROARING_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#elif defined(_WIN32) + #define CROARING_IS_BIG_ENDIAN 0 + #else + #if defined(__APPLE__) || defined(__FreeBSD__) // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__ + #include <machine/endian.h> + #elif defined(sun) || defined(__sun) // defined(__APPLE__) || defined(__FreeBSD__) + #include <sys/byteorder.h> + #else // defined(__APPLE__) || defined(__FreeBSD__) + + #ifdef __has_include + #if __has_include(<endian.h>) + #include <endian.h> + #endif //__has_include(<endian.h>) + #endif //__has_include + + #endif // defined(__APPLE__) || defined(__FreeBSD__) + + + // Single #if/#elif chain so the macro is defined exactly once; + // assume little endian when the byte-order macros are unavailable. + #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) + #define CROARING_IS_BIG_ENDIAN 0 + #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + #define CROARING_IS_BIG_ENDIAN 0 + #else // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + #define CROARING_IS_BIG_ENDIAN 1 + #endif // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#endif + +// Defines for the possible CROARING atomic implementations +#define CROARING_ATOMIC_IMPL_NONE 1 +#define CROARING_ATOMIC_IMPL_CPP 2 +#define CROARING_ATOMIC_IMPL_C 3 +#define CROARING_ATOMIC_IMPL_C_WINDOWS 4 + +// If the user has forced a specific implementation, use that, otherwise, +// figure out the best implementation we can use.
+#if !defined(CROARING_ATOMIC_IMPL) + #if defined(__cplusplus) && __cplusplus >= 201103L + #ifdef __has_include + #if __has_include(<atomic>) + #define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_CPP + #endif //__has_include(<atomic>) + #else + // We lack __has_include to check: + #define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_CPP + #endif //__has_include + #elif __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_ATOMICS__) + #define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_C + #elif CROARING_REGULAR_VISUAL_STUDIO + // https://www.technetworkhub.com/c11-atomics-in-visual-studio-2022-version-17/ + #define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_C_WINDOWS + #endif +#endif // !defined(CROARING_ATOMIC_IMPL) + +#if !defined(CROARING_ATOMIC_IMPL) + #ifndef CROARING_SILENT_BUILD + #pragma message ( "No atomic implementation found, copy on write bitmaps will not be threadsafe" ) + #endif // CROARING_SILENT_BUILD + #define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_NONE +#endif + +#if CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_C +#include <stdatomic.h> +typedef _Atomic(uint32_t) croaring_refcount_t; + +static inline void croaring_refcount_inc(croaring_refcount_t *val) { + // Increasing the reference counter can always be done with + // memory_order_relaxed: New references to an object can only be formed from + // an existing reference, and passing an existing reference from one thread to + // another must already provide any required synchronization. + atomic_fetch_add_explicit(val, 1, memory_order_relaxed); +} + +static inline bool croaring_refcount_dec(croaring_refcount_t *val) { + // It is important to enforce any possible access to the object in one thread + // (through an existing reference) to happen before deleting the object in a + // different thread. This is achieved by a "release" operation after dropping + // a reference (any access to the object through this reference must obviously + // have happened before), and an "acquire" operation before deleting the object.
+ bool is_zero = atomic_fetch_sub_explicit(val, 1, memory_order_release) == 1; + if (is_zero) { + atomic_thread_fence(memory_order_acquire); + } + return is_zero; +} + +static inline uint32_t croaring_refcount_get(croaring_refcount_t *val) { + return atomic_load_explicit(val, memory_order_relaxed); +} +#elif CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_CPP +#include <atomic> +typedef std::atomic<uint32_t> croaring_refcount_t; + +static inline void croaring_refcount_inc(croaring_refcount_t *val) { + val->fetch_add(1, std::memory_order_relaxed); +} + +static inline bool croaring_refcount_dec(croaring_refcount_t *val) { + // See above comments on the c11 atomic implementation for memory ordering + bool is_zero = val->fetch_sub(1, std::memory_order_release) == 1; + if (is_zero) { + std::atomic_thread_fence(std::memory_order_acquire); + } + return is_zero; +} + +static inline uint32_t croaring_refcount_get(croaring_refcount_t *val) { + return val->load(std::memory_order_relaxed); +} +#elif CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_C_WINDOWS +#include <intrin.h> +#pragma intrinsic(_InterlockedIncrement) +#pragma intrinsic(_InterlockedDecrement) + +// _InterlockedIncrement and _InterlockedDecrement take a (signed) long, and +// overflow is defined to wrap, so we can pretend it is a uint32_t for our case +typedef volatile long croaring_refcount_t; + +static inline void croaring_refcount_inc(croaring_refcount_t *val) { + _InterlockedIncrement(val); +} + +static inline bool croaring_refcount_dec(croaring_refcount_t *val) { + return _InterlockedDecrement(val) == 0; +} + +static inline uint32_t croaring_refcount_get(croaring_refcount_t *val) { + // Per https://learn.microsoft.com/en-us/windows/win32/sync/interlocked-variable-access + // > Simple reads and writes to properly-aligned 32-bit variables are atomic + // > operations. In other words, you will not end up with only one portion + // > of the variable updated; all bits are updated in an atomic fashion.
+ return *val; +} +#elif CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_NONE +#include +typedef uint32_t croaring_refcount_t; + +static inline void croaring_refcount_inc(croaring_refcount_t *val) { + *val += 1; +} + +static inline bool croaring_refcount_dec(croaring_refcount_t *val) { + assert(*val > 0); + *val -= 1; + return val == 0; +} + +static inline uint32_t croaring_refcount_get(croaring_refcount_t *val) { + return *val; +} +#else +#error "Unknown atomic implementation" +#endif + + +// We need portability.h to be included first, +// but we also always want isadetection.h to be +// included (right after). +// See https://github.com/RoaringBitmap/CRoaring/issues/394 +// There is no scenario where we want portability.h to +// be included, but not isadetection.h: the latter is a +// strict requirement. +#endif /* INCLUDE_PORTABILITY_H_ */ +/* end file include/roaring/portability.h */ +/* begin file include/roaring/bitset/bitset.h */ +#ifndef CBITSET_BITSET_H +#define CBITSET_BITSET_H + +// For compatibility with MSVC with the use of `restrict` +#if (__STDC_VERSION__ >= 199901L) || \ + (defined(__GNUC__) && defined(__STDC_VERSION__)) +#define CBITSET_RESTRICT restrict +#else +#define CBITSET_RESTRICT +#endif // (__STDC_VERSION__ >= 199901L) || (defined(__GNUC__) && + // defined(__STDC_VERSION__ )) + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace api { +#endif + +struct bitset_s { + uint64_t *CBITSET_RESTRICT array; + /* For simplicity and performance, we prefer to have a size and a capacity that is a multiple of 64 bits. + * Thus we only track the size and the capacity in terms of 64-bit words allocated */ + size_t arraysize; + size_t capacity; +}; + +typedef struct bitset_s bitset_t; + +/* Create a new bitset. Return NULL in case of failure. */ +bitset_t *bitset_create(void); + +/* Create a new bitset able to contain size bits. Return NULL in case of + * failure. 
*/ +bitset_t *bitset_create_with_capacity(size_t size); + +/* Free memory. */ +void bitset_free(bitset_t *bitset); + +/* Set all bits to zero. */ +void bitset_clear(bitset_t *bitset); + +/* Set all bits to one. */ +void bitset_fill(bitset_t *bitset); + +/* Create a copy */ +bitset_t *bitset_copy(const bitset_t *bitset); + +/* For advanced users: Resize the bitset so that it can support newarraysize * 64 bits. + * Return true in case of success, false for failure. Pad + * with zeroes new buffer areas if requested. */ +bool bitset_resize(bitset_t *bitset, size_t newarraysize, bool padwithzeroes); + +/* returns how many bytes of memory the backend buffer uses */ +static inline size_t bitset_size_in_bytes(const bitset_t *bitset) { + return bitset->arraysize * sizeof(uint64_t); +} + +/* returns how many bits can be accessed */ +static inline size_t bitset_size_in_bits(const bitset_t *bitset) { + return bitset->arraysize * 64; +} + +/* returns how many words (64-bit) of memory the backend buffer uses */ +static inline size_t bitset_size_in_words(const bitset_t *bitset) { + return bitset->arraysize; +} + +/* For advanced users: Grow the bitset so that it can support newarraysize * 64 bits with padding. + * Return true in case of success, false for failure. */ +bool bitset_grow(bitset_t *bitset, size_t newarraysize); + +/* attempts to recover unused memory, return false in case of roaring_reallocation + * failure */ +bool bitset_trim(bitset_t *bitset); + +/* shifts all bits by 's' positions so that the bitset representing values + * 1,2,10 would represent values 1+s, 2+s, 10+s */ +void bitset_shift_left(bitset_t *bitset, size_t s); + +/* shifts all bits by 's' positions so that the bitset representing values + * 1,2,10 would represent values 1-s, 2-s, 10-s, negative values are deleted */ +void bitset_shift_right(bitset_t *bitset, size_t s); + +/* Set the ith bit. 
Attempts to resize the bitset if needed (may silently fail) + */ +static inline void bitset_set(bitset_t *bitset, size_t i) { + size_t shiftedi = i / 64; + if (shiftedi >= bitset->arraysize) { + if (!bitset_grow(bitset, shiftedi + 1)) { + return; + } + } + bitset->array[shiftedi] |= ((uint64_t)1) << (i % 64); +} + +/* Set the ith bit to the specified value. Attempts to resize the bitset if + * needed (may silently fail) */ +static inline void bitset_set_to_value(bitset_t *bitset, size_t i, bool flag) { + size_t shiftedi = i / 64; + uint64_t mask = ((uint64_t)1) << (i % 64); + uint64_t dynmask = ((uint64_t)flag) << (i % 64); + if (shiftedi >= bitset->arraysize) { + if (!bitset_grow(bitset, shiftedi + 1)) { + return; + } + } + uint64_t w = bitset->array[shiftedi]; + w &= ~mask; + w |= dynmask; + bitset->array[shiftedi] = w; +} + +/* Get the value of the ith bit. */ +static inline bool bitset_get(const bitset_t *bitset, size_t i) { + size_t shiftedi = i / 64; + if (shiftedi >= bitset->arraysize) { + return false; + } + return (bitset->array[shiftedi] & (((uint64_t)1) << (i % 64))) != 0; +} + +/* Count number of bits set. */ +size_t bitset_count(const bitset_t *bitset); + +/* Find the index of the first bit set. Or zero if the bitset is empty. */ +size_t bitset_minimum(const bitset_t *bitset); + +/* Find the index of the last bit set. Or zero if the bitset is empty. 
*/ +size_t bitset_maximum(const bitset_t *bitset); + +/* compute the union in-place (to b1), returns true if successful, to generate a + * new bitset first call bitset_copy */ +bool bitset_inplace_union(bitset_t *CBITSET_RESTRICT b1, + const bitset_t *CBITSET_RESTRICT b2); + +/* report the size of the union (without materializing it) */ +size_t bitset_union_count(const bitset_t *CBITSET_RESTRICT b1, + const bitset_t *CBITSET_RESTRICT b2); + +/* compute the intersection in-place (to b1), to generate a new bitset first + * call bitset_copy */ +void bitset_inplace_intersection(bitset_t *CBITSET_RESTRICT b1, + const bitset_t *CBITSET_RESTRICT b2); + +/* report the size of the intersection (without materializing it) */ +size_t bitset_intersection_count(const bitset_t *CBITSET_RESTRICT b1, + const bitset_t *CBITSET_RESTRICT b2); + +/* returns true if the bitsets contain no common elements */ +bool bitsets_disjoint(const bitset_t *CBITSET_RESTRICT b1, const bitset_t *CBITSET_RESTRICT b2); + +/* returns true if the bitsets contain any common elements */ +bool bitsets_intersect(const bitset_t *CBITSET_RESTRICT b1, const bitset_t *CBITSET_RESTRICT b2); + +/* returns true if b1 contains all of the set bits of b2 */ +bool bitset_contains_all(const bitset_t *CBITSET_RESTRICT b1, const bitset_t *CBITSET_RESTRICT b2); + +/* compute the difference in-place (to b1), to generate a new bitset first call + * bitset_copy */ +void bitset_inplace_difference(bitset_t *CBITSET_RESTRICT b1, + const bitset_t *CBITSET_RESTRICT b2); + +/* compute the size of the difference */ +size_t bitset_difference_count(const bitset_t *CBITSET_RESTRICT b1, + const bitset_t *CBITSET_RESTRICT b2); + +/* compute the symmetric difference in-place (to b1), return true if successful, + * to generate a new bitset first call bitset_copy */ +bool bitset_inplace_symmetric_difference(bitset_t *CBITSET_RESTRICT b1, + const bitset_t *CBITSET_RESTRICT b2); + +/* compute the size of the symmetric difference */ +size_t 
bitset_symmetric_difference_count(const bitset_t *CBITSET_RESTRICT b1, + const bitset_t *CBITSET_RESTRICT b2); + +/* iterate over the set bits + like so : + for(size_t i = 0; bitset_next_set_bit(b,&i) ; i++) { + //..... + } + */ +static inline bool bitset_next_set_bit(const bitset_t *bitset, size_t *i) { + size_t x = *i / 64; + if (x >= bitset->arraysize) { + return false; + } + uint64_t w = bitset->array[x]; + w >>= (*i & 63); + if (w != 0) { + *i += roaring_trailing_zeroes(w); + return true; + } + x++; + while (x < bitset->arraysize) { + w = bitset->array[x]; + if (w != 0) { + *i = x * 64 + roaring_trailing_zeroes(w); + return true; + } + x++; + } + return false; +} + +/* iterate over the set bits + like so : + size_t buffer[256]; + size_t howmany = 0; + for(size_t startfrom = 0; (howmany = bitset_next_set_bits(b,buffer,256, &startfrom)) > + 0 ; startfrom++) { + //..... + } + */ +static inline size_t bitset_next_set_bits(const bitset_t *bitset, size_t *buffer, + size_t capacity, size_t *startfrom) { + if (capacity == 0) return 0; // sanity check + size_t x = *startfrom / 64; + if (x >= bitset->arraysize) { + return 0; // nothing more to iterate over + } + uint64_t w = bitset->array[x]; + // clear the bits below *startfrom without shifting the word, so that + // 'r + base' below remains the absolute bit position + w &= ~((((uint64_t)1) << (*startfrom & 63)) - 1); + size_t howmany = 0; + size_t base = x << 6; + while (howmany < capacity) { + while (w != 0) { + uint64_t t = w & (~w + 1); + int r = roaring_trailing_zeroes(w); + buffer[howmany++] = r + base; + if (howmany == capacity) goto end; + w ^= t; + } + x += 1; + if (x == bitset->arraysize) { + break; + } + base += 64; + w = bitset->array[x]; + } +end: + if (howmany > 0) { + *startfrom = buffer[howmany - 1]; + } + return howmany; +} + +typedef bool (*bitset_iterator)(size_t value, void *param); + +// return true if uninterrupted +static inline bool bitset_for_each(const bitset_t *b, bitset_iterator iterator, + void *ptr) { + size_t base = 0; + for (size_t i = 0; i < b->arraysize; ++i) { + uint64_t w = b->array[i]; + while (w != 0) { + uint64_t t = w & (~w + 1);
+ int r = roaring_trailing_zeroes(w); + if (!iterator(r + base, ptr)) return false; + w ^= t; + } + base += 64; + } + return true; +} + +static inline void bitset_print(const bitset_t *b) { + printf("{"); + for (size_t i = 0; bitset_next_set_bit(b, &i); i++) { + printf("%zu, ", i); + } + printf("}"); +} + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace api { +#endif + +#endif +/* end file include/roaring/bitset/bitset.h */ +/* begin file include/roaring/roaring.h */ +/* + * An implementation of Roaring Bitmaps in C. + */ + +#ifndef ROARING_H +#define ROARING_H + +#include <stdbool.h> +#include <stdint.h> +#include <stddef.h> // for `size_t` + + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace api { +#endif + +typedef struct roaring_bitmap_s { + roaring_array_t high_low_container; +} roaring_bitmap_t; + +/** + * Dynamically allocates a new bitmap (initially empty). + * Returns NULL if the allocation fails. + * Capacity is a performance hint for how many "containers" the data will need. + * Client is responsible for calling `roaring_bitmap_free()`. + */ +roaring_bitmap_t *roaring_bitmap_create_with_capacity(uint32_t cap); + +/** + * Dynamically allocates a new bitmap (initially empty). + * Returns NULL if the allocation fails. + * Client is responsible for calling `roaring_bitmap_free()`. + */ +static inline roaring_bitmap_t *roaring_bitmap_create(void) + { return roaring_bitmap_create_with_capacity(0); } + +/** + * Initialize a roaring bitmap structure in memory controlled by client. + * Capacity is a performance hint for how many "containers" the data will need. + * Can return false if auxiliary allocations fail when capacity greater than 0. + */ +bool roaring_bitmap_init_with_capacity(roaring_bitmap_t *r, uint32_t cap); + +/** + * Initialize a roaring bitmap structure in memory controlled by client. + * The bitmap will be in a "clear" state, with no auxiliary allocations. + * Since this performs no allocations, the function will not fail.
+ */ +static inline void roaring_bitmap_init_cleared(roaring_bitmap_t *r) + { roaring_bitmap_init_with_capacity(r, 0); } + +/** + * Add all the values between min (included) and max (excluded) that are at a + * distance k*step from min. +*/ +roaring_bitmap_t *roaring_bitmap_from_range(uint64_t min, uint64_t max, + uint32_t step); + +/** + * Creates a new bitmap from a pointer of uint32_t integers + */ +roaring_bitmap_t *roaring_bitmap_of_ptr(size_t n_args, const uint32_t *vals); + +/* + * Whether you want to use copy-on-write. + * Saves memory and avoids copies, but needs more care in a threaded context. + * Most users should ignore this flag. + * + * Note: If you do turn this flag to 'true', enabling COW, then ensure that you + * do so for all of your bitmaps, since interactions between bitmaps with and + * without COW is unsafe. + */ +static inline bool roaring_bitmap_get_copy_on_write(const roaring_bitmap_t* r) { + return r->high_low_container.flags & ROARING_FLAG_COW; +} +static inline void roaring_bitmap_set_copy_on_write(roaring_bitmap_t* r, + bool cow) { + if (cow) { + r->high_low_container.flags |= ROARING_FLAG_COW; + } else { + r->high_low_container.flags &= ~ROARING_FLAG_COW; + } +} + +roaring_bitmap_t *roaring_bitmap_add_offset(const roaring_bitmap_t *bm, + int64_t offset); +/** + * Describe the inner structure of the bitmap. + */ +void roaring_bitmap_printf_describe(const roaring_bitmap_t *r); + +/** + * Creates a new bitmap from a list of uint32_t integers + */ +roaring_bitmap_t *roaring_bitmap_of(size_t n, ...); + +/** + * Copies a bitmap (this does memory allocation). + * The caller is responsible for memory management. + */ +roaring_bitmap_t *roaring_bitmap_copy(const roaring_bitmap_t *r); + +/** + * Copies a bitmap from src to dest. It is assumed that the pointer dest + * is to an already allocated bitmap. The content of the dest bitmap is + * freed/deleted. 
+ * + * It might be preferable and simpler to call roaring_bitmap_copy except + * that roaring_bitmap_overwrite can save on memory allocations. + */ +bool roaring_bitmap_overwrite(roaring_bitmap_t *dest, + const roaring_bitmap_t *src); + +/** + * Print the content of the bitmap. + */ +void roaring_bitmap_printf(const roaring_bitmap_t *r); + +/** + * Computes the intersection between two bitmaps and returns new bitmap. The + * caller is responsible for memory management. + * + * Performance hint: if you are computing the intersection between several + * bitmaps, two-by-two, it is best to start with the smallest bitmap. + * You may also rely on roaring_bitmap_and_inplace to avoid creating + * many temporary bitmaps. + */ +roaring_bitmap_t *roaring_bitmap_and(const roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * Computes the size of the intersection between two bitmaps. + */ +uint64_t roaring_bitmap_and_cardinality(const roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * Check whether two bitmaps intersect. + */ +bool roaring_bitmap_intersect(const roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * Check whether a bitmap and a closed range intersect. + */ +bool roaring_bitmap_intersect_with_range(const roaring_bitmap_t *bm, + uint64_t x, uint64_t y); + +/** + * Computes the Jaccard index between two bitmaps. (Also known as the Tanimoto + * distance, or the Jaccard similarity coefficient) + * + * The Jaccard index is undefined if both bitmaps are empty. + */ +double roaring_bitmap_jaccard_index(const roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * Computes the size of the union between two bitmaps. + */ +uint64_t roaring_bitmap_or_cardinality(const roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * Computes the size of the difference (andnot) between two bitmaps. 
+ */ +uint64_t roaring_bitmap_andnot_cardinality(const roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * Computes the size of the symmetric difference (xor) between two bitmaps. + */ +uint64_t roaring_bitmap_xor_cardinality(const roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * Inplace version of `roaring_bitmap_and()`, modifies r1 + * r1 == r2 is allowed. + * + * Performance hint: if you are computing the intersection between several + * bitmaps, two-by-two, it is best to start with the smallest bitmap. + */ +void roaring_bitmap_and_inplace(roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * Computes the union between two bitmaps and returns new bitmap. The caller is + * responsible for memory management. + */ +roaring_bitmap_t *roaring_bitmap_or(const roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * Inplace version of `roaring_bitmap_or(), modifies r1. + * TODO: decide whether r1 == r2 ok + */ +void roaring_bitmap_or_inplace(roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * Compute the union of 'number' bitmaps. + * Caller is responsible for freeing the result. + * See also `roaring_bitmap_or_many_heap()` + */ +roaring_bitmap_t *roaring_bitmap_or_many(size_t number, + const roaring_bitmap_t **rs); + +/** + * Compute the union of 'number' bitmaps using a heap. This can sometimes be + * faster than `roaring_bitmap_or_many() which uses a naive algorithm. + * Caller is responsible for freeing the result. + */ +roaring_bitmap_t *roaring_bitmap_or_many_heap(uint32_t number, + const roaring_bitmap_t **rs); + +/** + * Computes the symmetric difference (xor) between two bitmaps + * and returns new bitmap. The caller is responsible for memory management. + */ +roaring_bitmap_t *roaring_bitmap_xor(const roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * Inplace version of roaring_bitmap_xor, modifies r1, r1 != r2. 
+ */ +void roaring_bitmap_xor_inplace(roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * Compute the xor of 'number' bitmaps. + * Caller is responsible for freeing the result. + */ +roaring_bitmap_t *roaring_bitmap_xor_many(size_t number, + const roaring_bitmap_t **rs); + +/** + * Computes the difference (andnot) between two bitmaps and returns new bitmap. + * Caller is responsible for freeing the result. + */ +roaring_bitmap_t *roaring_bitmap_andnot(const roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * Inplace version of roaring_bitmap_andnot, modifies r1, r1 != r2. + */ +void roaring_bitmap_andnot_inplace(roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * TODO: consider implementing: + * + * "Compute the xor of 'number' bitmaps using a heap. This can sometimes be + * faster than roaring_bitmap_xor_many which uses a naive algorithm. Caller is + * responsible for freeing the result."" + * + * roaring_bitmap_t *roaring_bitmap_xor_many_heap(uint32_t number, + * const roaring_bitmap_t **rs); + */ + +/** + * Frees the memory. + */ +void roaring_bitmap_free(const roaring_bitmap_t *r); + +/** + * A bit of context usable with `roaring_bitmap_*_bulk()` functions + * + * Should be initialized with `{0}` (or `memset()` to all zeros). + * Callers should treat it as an opaque type. + * + * A context may only be used with a single bitmap + * (unless re-initialized to zero), and any modification to a bitmap + * (other than modifications performed with `_bulk()` functions with the context + * passed) will invalidate any contexts associated with that bitmap. + */ +typedef struct roaring_bulk_context_s { + ROARING_CONTAINER_T *container; + int idx; + uint16_t key; + uint8_t typecode; +} roaring_bulk_context_t; + +/** + * Add an item, using context from a previous insert for speed optimization. + * + * `context` will be used to store information between calls to make bulk + * operations faster. 
`*context` should be zero-initialized before the first + * call to this function. + * + * Modifying the bitmap in any way (other than `-bulk` suffixed functions) + * will invalidate the stored context, calling this function with a non-zero + * context after doing any modification invokes undefined behavior. + * + * In order to exploit this optimization, the caller should call this function + * with values with the same "key" (high 16 bits of the value) consecutively. + */ +void roaring_bitmap_add_bulk(roaring_bitmap_t *r, + roaring_bulk_context_t *context, uint32_t val); + +/** + * Add value n_args from pointer vals, faster than repeatedly calling + * `roaring_bitmap_add()` + * + * In order to exploit this optimization, the caller should attempt to keep + * values with the same "key" (high 16 bits of the value) as consecutive + * elements in `vals` + */ +void roaring_bitmap_add_many(roaring_bitmap_t *r, size_t n_args, + const uint32_t *vals); + +/** + * Add value x + */ +void roaring_bitmap_add(roaring_bitmap_t *r, uint32_t x); + +/** + * Add value x + * Returns true if a new value was added, false if the value already existed. 
+ */ +bool roaring_bitmap_add_checked(roaring_bitmap_t *r, uint32_t x); + +/** + * Add all values in range [min, max] + */ +void roaring_bitmap_add_range_closed(roaring_bitmap_t *r, + uint32_t min, uint32_t max); + +/** + * Add all values in range [min, max) + */ +static inline void roaring_bitmap_add_range(roaring_bitmap_t *r, + uint64_t min, uint64_t max) { + if(max <= min) return; + roaring_bitmap_add_range_closed(r, (uint32_t)min, (uint32_t)(max - 1)); +} + +/** + * Remove value x + */ +void roaring_bitmap_remove(roaring_bitmap_t *r, uint32_t x); + +/** + * Remove all values in range [min, max] + */ +void roaring_bitmap_remove_range_closed(roaring_bitmap_t *r, + uint32_t min, uint32_t max); + +/** + * Remove all values in range [min, max) + */ +static inline void roaring_bitmap_remove_range(roaring_bitmap_t *r, + uint64_t min, uint64_t max) { + if(max <= min) return; + roaring_bitmap_remove_range_closed(r, (uint32_t)min, (uint32_t)(max - 1)); +} + +/** + * Remove multiple values + */ +void roaring_bitmap_remove_many(roaring_bitmap_t *r, size_t n_args, + const uint32_t *vals); + +/** + * Remove value x + * Returns true if a new value was removed, false if the value was not existing. + */ +bool roaring_bitmap_remove_checked(roaring_bitmap_t *r, uint32_t x); + +/** + * Check if value is present + */ +bool roaring_bitmap_contains(const roaring_bitmap_t *r, uint32_t val); + +/** + * Check whether a range of values from range_start (included) + * to range_end (excluded) is present + */ +bool roaring_bitmap_contains_range(const roaring_bitmap_t *r, + uint64_t range_start, + uint64_t range_end); + +/** + * Check if an items is present, using context from a previous insert for speed + * optimization. + * + * `context` will be used to store information between calls to make bulk + * operations faster. `*context` should be zero-initialized before the first + * call to this function. 
+ * + * Modifying the bitmap in any way (other than `-bulk` suffixed functions) + * will invalidate the stored context, calling this function with a non-zero + * context after doing any modification invokes undefined behavior. + * + * In order to exploit this optimization, the caller should call this function + * with values with the same "key" (high 16 bits of the value) consecutively. + */ +bool roaring_bitmap_contains_bulk(const roaring_bitmap_t *r, + roaring_bulk_context_t *context, + uint32_t val); + +/** + * Get the cardinality of the bitmap (number of elements). + */ +uint64_t roaring_bitmap_get_cardinality(const roaring_bitmap_t *r); + +/** + * Returns the number of elements in the range [range_start, range_end). + */ +uint64_t roaring_bitmap_range_cardinality(const roaring_bitmap_t *r, + uint64_t range_start, + uint64_t range_end); + +/** +* Returns true if the bitmap is empty (cardinality is zero). +*/ +bool roaring_bitmap_is_empty(const roaring_bitmap_t *r); + + +/** + * Empties the bitmap. It will have no auxiliary allocations (so if the bitmap + * was initialized in client memory via roaring_bitmap_init(), then a call to + * roaring_bitmap_clear() would be enough to "free" it) + */ +void roaring_bitmap_clear(roaring_bitmap_t *r); + +/** + * Convert the bitmap to a sorted array, output in `ans`. + * + * Caller is responsible to ensure that there is enough memory allocated, e.g. + * + * ans = malloc(roaring_bitmap_get_cardinality(bitmap) * sizeof(uint32_t)); + */ +void roaring_bitmap_to_uint32_array(const roaring_bitmap_t *r, uint32_t *ans); + +/** + * Store the bitmap to a bitset. This can be useful for people + * who need the performance and simplicity of a standard bitset. + * We assume that the input bitset is originally empty (does not + * have any set bit). 
+ * + * bitset_t * out = bitset_create(); + * // if the bitset has content in it, call "bitset_clear(out)" + * bool success = roaring_bitmap_to_bitset(mybitmap, out); + * // on failure, success will be false. + * // You can then query the bitset: + * bool is_present = bitset_get(out, 10011 ); + * // you must free the memory: + * bitset_free(out); + * + */ +bool roaring_bitmap_to_bitset(const roaring_bitmap_t *r, bitset_t * bitset); + +/** + * Convert the bitmap to a sorted array from `offset` by `limit`, output in `ans`. + * + * Caller is responsible to ensure that there is enough memory allocated, e.g. + * + * ans = malloc(roaring_bitmap_get_cardinality(limit) * sizeof(uint32_t)); + * + * Return false in case of failure (e.g., insufficient memory) + */ +bool roaring_bitmap_range_uint32_array(const roaring_bitmap_t *r, + size_t offset, size_t limit, + uint32_t *ans); + +/** + * Remove run-length encoding even when it is more space efficient. + * Return whether a change was applied. + */ +bool roaring_bitmap_remove_run_compression(roaring_bitmap_t *r); + +/** + * Convert array and bitmap containers to run containers when it is more + * efficient; also convert from run containers when more space efficient. + * + * Returns true if the result has at least one run container. + * Additional savings might be possible by calling `shrinkToFit()`. + */ +bool roaring_bitmap_run_optimize(roaring_bitmap_t *r); + +/** + * If needed, reallocate memory to shrink the memory usage. + * Returns the number of bytes saved. + */ +size_t roaring_bitmap_shrink_to_fit(roaring_bitmap_t *r); + +/** + * Write the bitmap to an output pointer, this output buffer should refer to + * at least `roaring_bitmap_size_in_bytes(r)` allocated bytes. + * + * See `roaring_bitmap_portable_serialize()` if you want a format that's + * compatible with Java and Go implementations. This format can sometimes be + * more space efficient than the portable form, e.g. when the data is sparse. 
+ * + * Returns how many bytes written, should be `roaring_bitmap_size_in_bytes(r)`. + * + * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x), + * the data format is going to be big-endian and not compatible with little-endian systems. + */ +size_t roaring_bitmap_serialize(const roaring_bitmap_t *r, char *buf); + +/** + * Use with `roaring_bitmap_serialize()`. + * + * (See `roaring_bitmap_portable_deserialize()` if you want a format that's + * compatible with Java and Go implementations). + * + * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x), + * the data format is going to be big-endian and not compatible with little-endian systems. + */ +roaring_bitmap_t *roaring_bitmap_deserialize(const void *buf); + +/** + * Use with `roaring_bitmap_serialize()`. + * + * (See `roaring_bitmap_portable_deserialize_safe()` if you want a format that's + * compatible with Java and Go implementations). + * + * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x), + * the data format is going to be big-endian and not compatible with little-endian systems. + * + * The difference with `roaring_bitmap_deserialize()` is that this function checks that the input buffer + * is a valid bitmap. If the buffer is too small, NULL is returned. + */ +roaring_bitmap_t *roaring_bitmap_deserialize_safe(const void *buf, size_t maxbytes); + +/** + * How many bytes are required to serialize this bitmap (NOT compatible + * with Java and Go versions) + */ +size_t roaring_bitmap_size_in_bytes(const roaring_bitmap_t *r); + +/** + * Read bitmap from a serialized buffer. + * In case of failure, NULL is returned. + * + * This function is unsafe in the sense that if there is no valid serialized + * bitmap at the pointer, then many bytes could be read, possibly causing a + * buffer overflow. See also roaring_bitmap_portable_deserialize_safe(). 
+ * + * This is meant to be compatible with the Java and Go versions: + * https://github.com/RoaringBitmap/RoaringFormatSpec +* + * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x), + * the data format is going to be big-endian and not compatible with little-endian systems. + */ +roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf); + +/** + * Read bitmap from a serialized buffer safely (reading up to maxbytes). + * In case of failure, NULL is returned. + * + * This is meant to be compatible with the Java and Go versions: + * https://github.com/RoaringBitmap/RoaringFormatSpec + * + * The function itself is safe in the sense that it will not cause buffer overflows. + * However, for correct operations, it is assumed that the bitmap read was once + * serialized from a valid bitmap (i.e., it follows the format specification). + * If you provided an incorrect input (garbage), then the bitmap read may not be in + * a valid state and following operations may not lead to sensible results. + * In particular, the serialized array containers need to be in sorted order, and the + * run containers should be in sorted non-overlapping order. This is is guaranteed to + * happen when serializing an existing bitmap, but not for random inputs. + * + * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x), + * the data format is going to be big-endian and not compatible with little-endian systems. + */ +roaring_bitmap_t *roaring_bitmap_portable_deserialize_safe(const char *buf, + size_t maxbytes); + +/** + * Read bitmap from a serialized buffer. + * In case of failure, NULL is returned. + * + * Bitmap returned by this function can be used in all readonly contexts. + * Bitmap must be freed as usual, by calling roaring_bitmap_free(). + * Underlying buffer must not be freed or modified while it backs any bitmaps. 
+ * + * The function is unsafe in the following ways: + * 1) It may execute unaligned memory accesses. + * 2) A buffer overflow may occur if buf does not point to a valid serialized + * bitmap. + * + * This is meant to be compatible with the Java and Go versions: + * https://github.com/RoaringBitmap/RoaringFormatSpec + * + * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x), + * the data format is going to be big-endian and not compatible with little-endian systems. + */ +roaring_bitmap_t *roaring_bitmap_portable_deserialize_frozen(const char *buf); + +/** + * Check how many bytes would be read (up to maxbytes) at this pointer if there + * is a bitmap, returns zero if there is no valid bitmap. + * + * This is meant to be compatible with the Java and Go versions: + * https://github.com/RoaringBitmap/RoaringFormatSpec + */ +size_t roaring_bitmap_portable_deserialize_size(const char *buf, + size_t maxbytes); + +/** + * How many bytes are required to serialize this bitmap. + * + * This is meant to be compatible with the Java and Go versions: + * https://github.com/RoaringBitmap/RoaringFormatSpec + */ +size_t roaring_bitmap_portable_size_in_bytes(const roaring_bitmap_t *r); + +/** + * Write a bitmap to a char buffer. The output buffer should refer to at least + * `roaring_bitmap_portable_size_in_bytes(r)` bytes of allocated memory. + * + * Returns how many bytes were written which should match + * `roaring_bitmap_portable_size_in_bytes(r)`. + * + * This is meant to be compatible with the Java and Go versions: + * https://github.com/RoaringBitmap/RoaringFormatSpec + * + * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x), + * the data format is going to be big-endian and not compatible with little-endian systems. 
+ */ +size_t roaring_bitmap_portable_serialize(const roaring_bitmap_t *r, char *buf); + +/* + * "Frozen" serialization format imitates memory layout of roaring_bitmap_t. + * Deserialized bitmap is a constant view of the underlying buffer. + * This significantly reduces amount of allocations and copying required during + * deserialization. + * It can be used with memory mapped files. + * Example can be found in benchmarks/frozen_benchmark.c + * + * [#####] const roaring_bitmap_t * + * | | | + * +----+ | +-+ + * | | | + * [#####################################] underlying buffer + * + * Note that because frozen serialization format imitates C memory layout + * of roaring_bitmap_t, it is not fixed. It is different on big/little endian + * platforms and can be changed in future. + */ + +/** + * Returns number of bytes required to serialize bitmap using frozen format. + */ +size_t roaring_bitmap_frozen_size_in_bytes(const roaring_bitmap_t *r); + +/** + * Serializes bitmap using frozen format. + * Buffer size must be at least roaring_bitmap_frozen_size_in_bytes(). + * + * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x), + * the data format is going to be big-endian and not compatible with little-endian systems. + */ +void roaring_bitmap_frozen_serialize(const roaring_bitmap_t *r, char *buf); + +/** + * Creates constant bitmap that is a view of a given buffer. + * Buffer data should have been written by `roaring_bitmap_frozen_serialize()` + * Its beginning must also be aligned by 32 bytes. + * Length must be equal exactly to `roaring_bitmap_frozen_size_in_bytes()`. + * In case of failure, NULL is returned. + * + * Bitmap returned by this function can be used in all readonly contexts. + * Bitmap must be freed as usual, by calling roaring_bitmap_free(). + * Underlying buffer must not be freed or modified while it backs any bitmaps. + * + * This function is endian-sensitive. 
If you have a big-endian system (e.g., a mainframe IBM s390x), + * the data format is going to be big-endian and not compatible with little-endian systems. + */ +const roaring_bitmap_t *roaring_bitmap_frozen_view(const char *buf, + size_t length); + +/** + * Iterate over the bitmap elements. The function iterator is called once for + * all the values with ptr (can be NULL) as the second parameter of each call. + * + * `roaring_iterator` is simply a pointer to a function that returns bool + * (true means that the iteration should continue while false means that it + * should stop), and takes (uint32_t,void*) as inputs. + * + * Returns true if the roaring_iterator returned true throughout (so that all + * data points were necessarily visited). + * + * Iteration is ordered: from the smallest to the largest elements. + */ +bool roaring_iterate(const roaring_bitmap_t *r, roaring_iterator iterator, + void *ptr); + +bool roaring_iterate64(const roaring_bitmap_t *r, roaring_iterator64 iterator, + uint64_t high_bits, void *ptr); + +/** + * Return true if the two bitmaps contain the same elements. + */ +bool roaring_bitmap_equals(const roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * Return true if all the elements of r1 are also in r2. + */ +bool roaring_bitmap_is_subset(const roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * Return true if all the elements of r1 are also in r2, and r2 is strictly + * greater than r1. + */ +bool roaring_bitmap_is_strict_subset(const roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * (For expert users who seek high performance.) + * + * Computes the union between two bitmaps and returns new bitmap. The caller is + * responsible for memory management. + * + * The lazy version defers some computations such as the maintenance of the + * cardinality counts. Thus you must call `roaring_bitmap_repair_after_lazy()` + * after executing "lazy" computations. 
+ * + * It is safe to repeatedly call roaring_bitmap_lazy_or_inplace on the result. + * + * `bitsetconversion` is a flag which determines whether container-container + * operations force a bitset conversion. + */ +roaring_bitmap_t *roaring_bitmap_lazy_or(const roaring_bitmap_t *r1, + const roaring_bitmap_t *r2, + const bool bitsetconversion); + +/** + * (For expert users who seek high performance.) + * + * Inplace version of roaring_bitmap_lazy_or, modifies r1. + * + * `bitsetconversion` is a flag which determines whether container-container + * operations force a bitset conversion. + */ +void roaring_bitmap_lazy_or_inplace(roaring_bitmap_t *r1, + const roaring_bitmap_t *r2, + const bool bitsetconversion); + +/** + * (For expert users who seek high performance.) + * + * Execute maintenance on a bitmap created from `roaring_bitmap_lazy_or()` + * or modified with `roaring_bitmap_lazy_or_inplace()`. + */ +void roaring_bitmap_repair_after_lazy(roaring_bitmap_t *r1); + +/** + * Computes the symmetric difference between two bitmaps and returns new bitmap. + * The caller is responsible for memory management. + * + * The lazy version defers some computations such as the maintenance of the + * cardinality counts. Thus you must call `roaring_bitmap_repair_after_lazy()` + * after executing "lazy" computations. + * + * It is safe to repeatedly call `roaring_bitmap_lazy_xor_inplace()` on + * the result. + */ +roaring_bitmap_t *roaring_bitmap_lazy_xor(const roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * (For expert users who seek high performance.) + * + * Inplace version of roaring_bitmap_lazy_xor, modifies r1. r1 != r2 + */ +void roaring_bitmap_lazy_xor_inplace(roaring_bitmap_t *r1, + const roaring_bitmap_t *r2); + +/** + * Compute the negation of the bitmap in the interval [range_start, range_end). + * The number of negated values is range_end - range_start. + * Areas outside the range are passed through unchanged. 
+ */ +roaring_bitmap_t *roaring_bitmap_flip(const roaring_bitmap_t *r1, + uint64_t range_start, uint64_t range_end); + +/** + * compute (in place) the negation of the roaring bitmap within a specified + * interval: [range_start, range_end). The number of negated values is + * range_end - range_start. + * Areas outside the range are passed through unchanged. + */ +void roaring_bitmap_flip_inplace(roaring_bitmap_t *r1, uint64_t range_start, + uint64_t range_end); + +/** + * Selects the element at index 'rank' where the smallest element is at index 0. + * If the size of the roaring bitmap is strictly greater than rank, then this + * function returns true and sets element to the element of given rank. + * Otherwise, it returns false. + */ +bool roaring_bitmap_select(const roaring_bitmap_t *r, uint32_t rank, + uint32_t *element); + +/** + * roaring_bitmap_rank returns the number of integers that are smaller or equal + * to x. Thus if x is the first element, this function will return 1. If + * x is smaller than the smallest element, this function will return 0. + * + * The indexing convention differs between roaring_bitmap_select and + * roaring_bitmap_rank: roaring_bitmap_select refers to the smallest value + * as having index 0, whereas roaring_bitmap_rank returns 1 when ranking + * the smallest value. + */ +uint64_t roaring_bitmap_rank(const roaring_bitmap_t *r, uint32_t x); + +/** + * Returns the index of x in the given roaring bitmap. + * If the roaring bitmap doesn't contain x , this function will return -1. + * The difference with rank function is that this function will return -1 when x + * is not the element of roaring bitmap, but the rank function will return a + * non-negative number. + */ +int64_t roaring_bitmap_get_index(const roaring_bitmap_t *r, uint32_t x); + +/** + * Returns the smallest value in the set, or UINT32_MAX if the set is empty. 
+ */ +uint32_t roaring_bitmap_minimum(const roaring_bitmap_t *r); + +/** + * Returns the greatest value in the set, or 0 if the set is empty. + */ +uint32_t roaring_bitmap_maximum(const roaring_bitmap_t *r); + +/** + * (For advanced users.) + * + * Collect statistics about the bitmap, see roaring_types.h for + * a description of roaring_statistics_t + */ +void roaring_bitmap_statistics(const roaring_bitmap_t *r, + roaring_statistics_t *stat); + +/********************* +* What follows is code use to iterate through values in a roaring bitmap + +roaring_bitmap_t *r =... +roaring_uint32_iterator_t i; +roaring_create_iterator(r, &i); +while(i.has_value) { + printf("value = %d\n", i.current_value); + roaring_advance_uint32_iterator(&i); +} + +Obviously, if you modify the underlying bitmap, the iterator +becomes invalid. So don't. +*/ + +typedef struct roaring_uint32_iterator_s { + const roaring_bitmap_t *parent; // owner + int32_t container_index; // point to the current container index + int32_t in_container_index; // for bitset and array container, this is out + // index + int32_t run_index; // for run container, this points at the run + + uint32_t current_value; + bool has_value; + + const ROARING_CONTAINER_T + *container; // should be: + // parent->high_low_container.containers[container_index]; + uint8_t typecode; // should be: + // parent->high_low_container.typecodes[container_index]; + uint32_t highbits; // should be: + // parent->high_low_container.keys[container_index]) << + // 16; + +} roaring_uint32_iterator_t; + +bool loadfirstvalue_largeorequal(roaring_uint32_iterator_t *newit, uint32_t val); + +bool loadfirstvalue(roaring_uint32_iterator_t *newit); + +bool loadlastvalue(roaring_uint32_iterator_t* newit); + +/** + * Initialize an iterator object that can be used to iterate through the + * values. If there is a value, then this iterator points to the first value + * and `it->has_value` is true. The value is in `it->current_value`. 
+ */ +void roaring_init_iterator(const roaring_bitmap_t *r, + roaring_uint32_iterator_t *newit); + +/** + * Initialize an iterator object that can be used to iterate through the + * values. If there is a value, then this iterator points to the last value + * and `it->has_value` is true. The value is in `it->current_value`. + */ +void roaring_init_iterator_last(const roaring_bitmap_t *r, + roaring_uint32_iterator_t *newit); + +/** + * Create an iterator object that can be used to iterate through the values. + * Caller is responsible for calling `roaring_free_iterator()`. + * + * The iterator is initialized (this function calls `roaring_init_iterator()`) + * If there is a value, then this iterator points to the first value and + * `it->has_value` is true. The value is in `it->current_value`. + */ +roaring_uint32_iterator_t *roaring_create_iterator(const roaring_bitmap_t *r); + +/** +* Advance the iterator. If there is a new value, then `it->has_value` is true. +* The new value is in `it->current_value`. Values are traversed in increasing +* orders. For convenience, returns `it->has_value`. +*/ +bool roaring_advance_uint32_iterator(roaring_uint32_iterator_t *it); + +/** +* Decrement the iterator. If there's a new value, then `it->has_value` is true. +* The new value is in `it->current_value`. Values are traversed in decreasing +* order. For convenience, returns `it->has_value`. +*/ +bool roaring_previous_uint32_iterator(roaring_uint32_iterator_t *it); + +/** + * Move the iterator to the first value >= `val`. If there is a such a value, + * then `it->has_value` is true. The new value is in `it->current_value`. + * For convenience, returns `it->has_value`. + */ +bool roaring_move_uint32_iterator_equalorlarger(roaring_uint32_iterator_t *it, + uint32_t val); + +/** + * Creates a copy of an iterator. + * Caller must free it. 
+ */ +roaring_uint32_iterator_t *roaring_copy_uint32_iterator( + const roaring_uint32_iterator_t *it); + +/** + * Free memory following `roaring_create_iterator()` + */ +void roaring_free_uint32_iterator(roaring_uint32_iterator_t *it); + +/* + * Reads next ${count} values from iterator into user-supplied ${buf}. + * Returns the number of read elements. + * This number can be smaller than ${count}, which means that iterator is drained. + * + * This function satisfies semantics of iteration and can be used together with + * other iterator functions. + * - first value is copied from ${it}->current_value + * - after function returns, iterator is positioned at the next element + */ +uint32_t roaring_read_uint32_iterator(roaring_uint32_iterator_t *it, + uint32_t* buf, uint32_t count); + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace api { +#endif + +#endif /* ROARING_H */ + +#ifdef __cplusplus + /** + * Best practices for C++ headers is to avoid polluting global scope. + * But for C compatibility when just `roaring.h` is included building as + * C++, default to global access for the C public API. + * + * BUT when `roaring.hh` is included instead, it sets this flag. That way + * explicit namespacing must be used to get the C functions. + * + * This is outside the include guard so that if you include BOTH headers, + * the order won't matter; you still get the global definitions. 
+ */ + #if !defined(ROARING_API_NOT_IN_GLOBAL_NAMESPACE) + using namespace ::roaring::api; + #endif +#endif +/* end file include/roaring/roaring.h */ +/* begin file include/roaring/memory.h */ +#ifndef INCLUDE_ROARING_MEMORY_H_ +#define INCLUDE_ROARING_MEMORY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include // for size_t + +typedef void* (*roaring_malloc_p)(size_t); +typedef void* (*roaring_realloc_p)(void*, size_t); +typedef void* (*roaring_calloc_p)(size_t, size_t); +typedef void (*roaring_free_p)(void*); +typedef void* (*roaring_aligned_malloc_p)(size_t, size_t); +typedef void (*roaring_aligned_free_p)(void*); + +typedef struct roaring_memory_s { + roaring_malloc_p malloc; + roaring_realloc_p realloc; + roaring_calloc_p calloc; + roaring_free_p free; + roaring_aligned_malloc_p aligned_malloc; + roaring_aligned_free_p aligned_free; +} roaring_memory_t; + +void roaring_init_memory_hook(roaring_memory_t memory_hook); + +void* roaring_malloc(size_t); +void* roaring_realloc(void*, size_t); +void* roaring_calloc(size_t, size_t); +void roaring_free(void*); +void* roaring_aligned_malloc(size_t, size_t); +void roaring_aligned_free(void*); + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_ROARING_MEMORY_H_ +/* end file include/roaring/memory.h */ diff --git a/src/sylvan.h b/src/sylvan.h index b09aa136..c4b0155f 100644 --- a/src/sylvan.h +++ b/src/sylvan.h @@ -42,6 +42,7 @@ * Sylvan header files outside the namespace */ +#include #include #include @@ -56,10 +57,12 @@ namespace sylvan { #include #include #include +#include #include #include #include #include +#include #ifdef __cplusplus } diff --git a/src/sylvan_align.h b/src/sylvan_align.h index 5c498d7a..5a9eacba 100644 --- a/src/sylvan_align.h +++ b/src/sylvan_align.h @@ -20,6 +20,9 @@ #if SYLVAN_USE_MMAP #include // for mmap +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS 0x1000 /* allocated from memory, swap space */ +#endif #endif #ifndef SYLVAN_ALIGN_H diff --git a/src/sylvan_bitmap.c 
b/src/sylvan_bitmap.c new file mode 100644 index 00000000..a84914ec --- /dev/null +++ b/src/sylvan_bitmap.c @@ -0,0 +1,280 @@ +#include +#include +#include + +#include +#include + +/** + * @brief Return position of the first most significant 1-bit + */ +static inline bitmap_bucket_t get_first_msb_one_bit_pos(bitmap_bucket_t bucket, size_t word_idx) +{ + + return NBITS_PER_BUCKET * word_idx + __builtin_clzll(bucket); +} + +/** + * @brief Return position of the first least significant 1-bit + * + * Bits are numbered MSB-first (see BIT_MASK), so the least significant 1-bit is + * the last set position of the bucket. The bucket must be non-zero. + */ +static inline bitmap_bucket_t get_first_lsb_one_bit_pos(bitmap_bucket_t bucket, size_t word_idx) +{ + return NBITS_PER_BUCKET * word_idx + (NBITS_PER_BUCKET - 1 - __builtin_ctzll(bucket)); +} + +void bitmap_init(bitmap_t* bitmap, size_t new_size) +{ + bitmap_deinit(bitmap); + bitmap->buckets = (bitmap_bucket_t *) alloc_aligned(new_size); + // test the allocation result; the bitmap pointer itself was already dereferenced above + if (bitmap->buckets != NULL) bitmap->size = new_size; + else bitmap->size = 0; +} + + +void bitmap_deinit(bitmap_t *bitmap) +{ + if (bitmap->buckets != NULL) free_aligned(bitmap->buckets, bitmap->size); + bitmap->size = 0; + bitmap->buckets = NULL; +} + +inline void bitmap_set(bitmap_t *bitmap, size_t pos) +{ + bitmap->buckets[BUCKET_OFFSET(pos)] |= BIT_MASK(pos); +} + +inline void bitmap_clear(bitmap_t *bitmap, size_t pos) +{ + bitmap->buckets[BUCKET_OFFSET(pos)] &= ~BIT_MASK(pos); +} + +inline char bitmap_get(const bitmap_t *bitmap, size_t pos) +{ + return bitmap->buckets[BUCKET_OFFSET(pos)] & BIT_MASK(pos) ? 
1 : 0; +} + +inline void bitmap_clear_all(bitmap_t *bitmap) +{ + if (bitmap->buckets == NULL) { + bitmap->size = 0; + return; + } + if (bitmap->size == 0) return; + clear_aligned(bitmap->buckets, bitmap->size); +} + +inline size_t bitmap_first(bitmap_t *bitmap) +{ + return bitmap_first_from(bitmap, 0); +} + +size_t bitmap_first_from(bitmap_t *bitmap, size_t bucket_idx) +{ + size_t n_buckets = NBUCKETS(bitmap->size); + // find the first word which contains at least one 1-bit + while (bucket_idx < n_buckets && bitmap->buckets[bucket_idx] == 0) { + bucket_idx++; + } + if (bucket_idx >= n_buckets) { + return npos; // no 1-bit found (or bucket_idx was already out of range) + } else { + return get_first_msb_one_bit_pos(bitmap->buckets[bucket_idx], bucket_idx); + } +} + +size_t bitmap_next(bitmap_t *bitmap, size_t pos) +{ + if (pos == npos || (pos + 1) >= bitmap->size) return npos; + pos++; + // get word for pos++ + size_t word_idx = BUCKET_OFFSET(pos); + // check whether there are still any 1-bits in the current word + bitmap_bucket_t word = bitmap->buckets[word_idx] & (0xffffffffffffffffLL >> BIT_OFFSET(pos)); + if (word) { + // there exist some successor 1 bit in the word, thus return the pos directly + return get_first_msb_one_bit_pos(word, word_idx); + } else { + // the current word does not contain any successor 1-bits, + // thus now find the next word that is not 0 and return the first 1-bit pos + word_idx++; + return bitmap_first_from(bitmap, word_idx); + } +} + +inline size_t bitmap_last(bitmap_t *bitmap) +{ + if (bitmap->size == 0) return npos; // size - 1 would wrap around + return bitmap_last_from(bitmap, bitmap->size - 1); +} + +size_t bitmap_last_from(bitmap_t *bitmap, size_t pos) +{ + if (pos == npos || bitmap->size == 0) return npos; + size_t word_idx = BUCKET_OFFSET(pos); + // find the last word which contains at least one 1-bit (word 0 is a valid candidate too) + while (word_idx > 0 && bitmap->buckets[word_idx] == 0) word_idx--; + if (bitmap->buckets[word_idx] == 0) return npos; // no 1-bit found + else return get_first_lsb_one_bit_pos(bitmap->buckets[word_idx], word_idx); +} + +size_t bitmap_prev(bitmap_t *bitmap, size_t pos) +{ 
+ if (pos == 0 || pos == npos) return npos; + pos--; + size_t word_idx = BUCKET_OFFSET(pos); + // keep only the 1-bits at positions <= pos (bits are numbered MSB-first, see BIT_MASK) + bitmap_bucket_t word = bitmap->buckets[word_idx] & (0xffffffffffffffffULL << (NBITS_PER_BUCKET - 1 - BIT_OFFSET(pos))); + if (word) { + // there exists a predecessor 1-bit in the word, thus return its pos directly + return get_first_lsb_one_bit_pos(word, word_idx); + } else if (word_idx == 0) { + return npos; // there is no earlier word left to search + } else { + // the current word does not contain any predecessor 1-bits, + // thus search backwards from the last bit of the previous word + return bitmap_last_from(bitmap, word_idx * NBITS_PER_BUCKET - 1); + } +} + +size_t bitmap_count(bitmap_t *bitmap) +{ + return popcnt(bitmap->buckets, NBUCKETS(bitmap->size) * 8); +} + +inline size_t atomic_bitmap_first(atomic_bitmap_t *bitmap) +{ + return atomic_bitmap_first_from(bitmap, 0); +} + +void atomic_bitmap_init(atomic_bitmap_t* bitmap, size_t new_size) +{ + atomic_bitmap_deinit(bitmap); + bitmap->container = (_Atomic(bitmap_bucket_t) *) alloc_aligned(new_size); + // test the allocation result; the bitmap pointer itself was already dereferenced above + if (bitmap->container != NULL) bitmap->size = new_size; + else bitmap->size = 0; +} + +void atomic_bitmap_deinit(atomic_bitmap_t *bitmap) +{ + if (bitmap->container != NULL && bitmap->size > 0) free_aligned(bitmap->container, bitmap->size); + bitmap->size = 0; + bitmap->container = NULL; +} + +void atomic_bitmap_clear_all(atomic_bitmap_t *bitmap) +{ + if (bitmap->container == NULL) { + bitmap->size = 0; + return; + } + if (bitmap->size == 0) return; + clear_aligned(bitmap->container, bitmap->size); +} + +size_t atomic_bitmap_first_from(atomic_bitmap_t *bitmap, size_t word_idx) +{ + size_t nwords = NBUCKETS(bitmap->size); + bitmap_bucket_t word = 0; + // find the first word which contains at least one 1-bit (only words inside the bitmap are loaded) + while (word_idx < nwords) { + word = atomic_load_explicit(bitmap->container + word_idx, memory_order_relaxed); + if (word != 0) break; + word_idx++; + } + if (word_idx >= nwords) { + // we have 
reached the end of the bitmap + return npos; + } else { + // we have found the first word which contains at least one 1-bit + return get_first_msb_one_bit_pos(word, word_idx); + } +} + +size_t atomic_bitmap_next(atomic_bitmap_t *bitmap, size_t pos) +{ + if (pos == npos || (pos + 1) >= bitmap->size) return npos; + pos++; + // get word index for pos + size_t word_idx = BUCKET_OFFSET(pos); + // check whether there are still any successor 1-bits in the current word + bitmap_bucket_t word = atomic_load_explicit(bitmap->container + word_idx, memory_order_relaxed) & (0xffffffffffffffffLL >> BIT_OFFSET(pos)); + if (word) { + // there exist some successor 1 bit in the word, thus return the pos directly + return get_first_msb_one_bit_pos(word, word_idx); + } else { + // the current word does not contain any successor 1-bits, + // thus now find the next word that is not 0 and return the first 1-bit pos + word_idx++; + return atomic_bitmap_first_from(bitmap, word_idx); + } +} + +inline size_t atomic_bitmap_last(atomic_bitmap_t *bitmap) +{ + if (bitmap->size == 0) return npos; // size - 1 would wrap around + return atomic_bitmap_last_from(bitmap, bitmap->size - 1); +} + +size_t atomic_bitmap_last_from(atomic_bitmap_t *bitmap, size_t pos) +{ + if (pos == npos || bitmap->size == 0) return npos; + size_t word_idx = BUCKET_OFFSET(pos); + _Atomic(bitmap_bucket_t) *ptr = bitmap->container + word_idx; + bitmap_bucket_t word = atomic_load_explicit(ptr, memory_order_relaxed); + // find the last word which contains at least one 1-bit (word 0 is a valid candidate too) + while (word_idx > 0 && word == 0) { + word_idx--; + ptr = bitmap->container + word_idx; + word = atomic_load_explicit(ptr, memory_order_relaxed); + } + if (word == 0) { + // no 1-bit found + return npos; + } else { + // we have found the last word which contains at least one 1-bit + return get_first_lsb_one_bit_pos(word, word_idx); + } +} + +size_t atomic_bitmap_prev(atomic_bitmap_t *bitmap, size_t pos) +{ + if (pos == 0 || pos == npos) return npos; + pos--; + size_t word_idx = 
BUCKET_OFFSET(pos); + _Atomic(bitmap_bucket_t) *ptr = bitmap->container + word_idx; + // keep only the 1-bits at positions <= pos (bits are numbered MSB-first, see BIT_MASK) + bitmap_bucket_t word = atomic_load_explicit(ptr, memory_order_relaxed) & (0xffffffffffffffffULL << (NBITS_PER_BUCKET - 1 - BIT_OFFSET(pos))); + if (word) { + // there exists a predecessor 1-bit in the word, thus return its pos directly + return get_first_lsb_one_bit_pos(word, word_idx); + } else if (word_idx == 0) { + return npos; // there is no earlier word left to search + } else { + // the current word does not contain any predecessor 1-bits, + // thus search backwards from the last bit of the previous word + return atomic_bitmap_last_from(bitmap, word_idx * NBITS_PER_BUCKET - 1); + } +} + +void atomic_bitmap_set(atomic_bitmap_t *bitmap, size_t pos, memory_order ordering) +{ + uint64_t mask = BIT_MASK(pos); + atomic_fetch_or_explicit(bitmap->container + BUCKET_OFFSET(pos), mask, ordering); +} + +void atomic_bitmap_clear(atomic_bitmap_t *bitmap, size_t pos, memory_order ordering) +{ + uint64_t mask = BIT_MASK(pos); + atomic_fetch_and_explicit(bitmap->container + BUCKET_OFFSET(pos), ~mask, ordering); +} + +int atomic_bitmap_get(const atomic_bitmap_t *bitmap, size_t pos, memory_order ordering) +{ + bitmap_bucket_t word = atomic_load_explicit(bitmap->container + BUCKET_OFFSET(pos), ordering); + return word & BIT_MASK(pos) ? 
1 : 0; +} \ No newline at end of file diff --git a/src/sylvan_bitmap.h b/src/sylvan_bitmap.h new file mode 100644 index 00000000..b2cddfb3 --- /dev/null +++ b/src/sylvan_bitmap.h @@ -0,0 +1,172 @@ +#ifndef SYLVAN_BITMAP_H +#define SYLVAN_BITMAP_H + +#ifndef __cplusplus + #include + #define memory_order memory_order +#else + // Compatibility with C11 + #define memory_order std::memory_order +#endif + +#include +#include +#include // for CHAR_BIT + + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +// buckets are 64-bit words, so eight buckets fill the usual 64 bytes per cache line +typedef uint64_t bitmap_bucket_t; + +typedef struct bitmap_s { + bitmap_bucket_t *buckets; + size_t size; +} bitmap_t; + +typedef struct atomic_bitmap_s { + _Atomic (bitmap_bucket_t) *container; + size_t size; +} atomic_bitmap_t; + +static const size_t npos = (bitmap_bucket_t)-1; /* returned when no 1-bit is found */ +static const size_t NBITS_PER_BUCKET = sizeof(bitmap_bucket_t) * CHAR_BIT; + +#define BUCKET_OFFSET(b) ((b) / NBITS_PER_BUCKET) +#define BIT_OFFSET(b) ((b) % NBITS_PER_BUCKET) +#define BIT_MASK(b) (0x8000000000000000LL >> (BIT_OFFSET(b))) /* bit offset 0 is the most significant bit of its bucket */ +#define NBUCKETS(b) (((b) + NBITS_PER_BUCKET-1) / NBITS_PER_BUCKET) + +/* + * Allocate a new bitmap with the given size + */ +void bitmap_init(bitmap_t* bitmap, size_t new_size); + +/* + * Free the bitmap + */ +void bitmap_deinit(bitmap_t *bitmap); + +/** + * Set the bit at position pos to 1, if it was 0. + */ +void bitmap_set(bitmap_t *bitmap, size_t pos); + +/** + * Set the bit at position pos to 0, if it was 1. + */ +void bitmap_clear(bitmap_t *bitmap, size_t pos); + +/** + * Get the bit at position pos (returns 1 or 0). + */ +char bitmap_get(const bitmap_t *bitmap, size_t pos); + +/** + * Set all bits to 0. 
+ */ +void bitmap_clear_all(bitmap_t *bitmap); + +/** + * Get the position of the first bit set to 1, or npos if there is none + */ +size_t bitmap_first(bitmap_t *bitmap); + +/** + * Get the first 1-bit position, searching forwards from the given bucket (word) index + */ +size_t bitmap_first_from(bitmap_t *bitmap, size_t bucket_idx); + +/** + * Get the last bit set to 1 + */ +size_t bitmap_last(bitmap_t *bitmap); + +/** + * Get the last 1-bit position, searching backwards from the bucket that contains bit position pos + */ +size_t bitmap_last_from(bitmap_t *bitmap, size_t pos); + +/** + * Get the next bit set to 1 after position pos + */ +size_t bitmap_next(bitmap_t *bitmap, size_t pos); + +/** + * Get the previous bit set to 1 before position pos + */ +size_t bitmap_prev(bitmap_t *bitmap, size_t pos); + +/** + * Count the number of bits set to 1 + */ +size_t bitmap_count(bitmap_t *bitmap); + +/* + * Allocate a new bitmap with the given size (heap allocation) + */ +void atomic_bitmap_init(atomic_bitmap_t* bitmap, size_t new_size); + +/* + * Free the bitmap + */ +void atomic_bitmap_deinit(atomic_bitmap_t *bitmap); + +/** + * Set all bits to 0 + */ +void atomic_bitmap_clear_all(atomic_bitmap_t *bitmap); + +/** + * Get the first bit set to 1 (atomic version) + */ +size_t atomic_bitmap_first(atomic_bitmap_t *bitmap); + +/** + * Get the first 1-bit position from the given word index (atomic version) + */ +size_t atomic_bitmap_first_from(atomic_bitmap_t *bitmap, size_t word_idx); + +/** + * Get the last bit set to 1 (atomic version) + */ +size_t atomic_bitmap_last(atomic_bitmap_t *bitmap); + +/* + * Get the last 1-bit position, searching backwards from the word that contains bit position pos (atomic version) + */ +size_t atomic_bitmap_last_from(atomic_bitmap_t *bitmap, size_t pos); + +/** + * Get the next bit set to 1 after position pos (atomic version) + */ +size_t atomic_bitmap_next(atomic_bitmap_t *bitmap, size_t pos); + +/** + * Get the previous bit set to 1 before position pos (atomic version) + */ +size_t atomic_bitmap_prev(atomic_bitmap_t *bitmap, size_t pos); + +/** + * Set the bit at position pos to 1, if it was 0. 
(Atomic version) + */ +void atomic_bitmap_set(atomic_bitmap_t *bitmap, size_t pos, memory_order ordering); + +/** + * Set the bit at position n to 0, if it was 1. (Atomic version) + */ +void atomic_bitmap_clear(atomic_bitmap_t *bitmap, size_t pos, memory_order ordering); + +/** + * Get the bit at position n. (Atomic version) + */ +int atomic_bitmap_get(const atomic_bitmap_t *bitmap, size_t pos, memory_order ordering); + + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif // SYLVAN_BITMAP_H \ No newline at end of file diff --git a/src/sylvan_common.c b/src/sylvan_common.c index 65269930..41653b63 100644 --- a/src/sylvan_common.c +++ b/src/sylvan_common.c @@ -262,6 +262,11 @@ VOID_TASK_IMPL_0(sylvan_gc) } } +/** + * The reorder table used with dynamic variable reordering. + */ +reorder_db_t reorder_db; + /** * The unique table */ diff --git a/src/sylvan_config.h b/src/sylvan_config.h index f818c627..502a0473 100644 --- a/src/sylvan_config.h +++ b/src/sylvan_config.h @@ -33,3 +33,21 @@ #ifndef SYLVAN_AGGRESSIVE_RESIZE #define SYLVAN_AGGRESSIVE_RESIZE 1 #endif + +/* Either use chaining or linear implementation as a hash collision strategy */ +#ifndef SYLVAN_USE_LINEAR_PROBING +#define SYLVAN_USE_LINEAR_PROBING 0 +#endif + +// Variable ordering default parameter values +#define SYLVAN_REORDER_MAX_VAR 100 +#define SYLVAN_REORDER_MAX_SWAPS 10000 +#define SYLVAN_REORDER_GROWTH 1.2f +#define SYLVAN_REORDER_NODES_THRESHOLD 1 +#define SYLVAN_REORDER_TIME_LIMIT_MS (10000 * 60 * 1000) +#define SYLVAN_REORDER_SIZE_THRESHOLD 4000 +#define SYLVAN_REORDER_SIZE_RATIO (2) +#define SYLVAN_REORDER_LIMIT 10 // maximum number of reordering calls allowed +#define SYLVAN_REORDER_TYPE_DEFAULT SYLVAN_REORDER_BOUNDED_SIFT +#define SYLVAN_REORDER_PRINT_STAT 0 +#define SYLVAN_REORDER_MIN_MEM_REQ (0.9f) \ No newline at end of file diff --git a/src/sylvan_int.h b/src/sylvan_int.h index 84cc6ba9..3f43148f 100644 --- a/src/sylvan_int.h +++ b/src/sylvan_int.h @@ -30,9 +30,14 @@ namespace 
sylvan { * Sylvan internal header files inside the namespace */ +#include #include #include #include +#include +#include +#include +#include #ifndef SYLVAN_INT_H #define SYLVAN_INT_H @@ -41,6 +46,15 @@ namespace sylvan { extern "C" { #endif /* __cplusplus */ +/* 40 bits for the index, 24 bits for the hash */ +#define SYLVAN_TABLE_MASK_INDEX ((uint64_t)0x000000ffffffffff) +#define SYLVAN_TABLE_MASK_HASH ((uint64_t)0xffffff0000000000) + +/** + * The reorder table used with dynamic variable reordering. + */ +extern reorder_db_t reorder_db; + /** * Nodes table. */ diff --git a/src/sylvan_interact.c b/src/sylvan_interact.c new file mode 100644 index 00000000..5ecca090 --- /dev/null +++ b/src/sylvan_interact.c @@ -0,0 +1,186 @@ +#include +#include + +#include +#include +#include +#include + + +void interact_deinit(interact_t *self) +{ + atomic_bitmap_deinit(self); +} + +static inline size_t interact_get_nrows(const interact_t *self) +{ + double nrows = sqrt(self->size); + return nrows < 0 ? 0 : (size_t) nrows; +} + +inline void interact_set(interact_t *self, size_t row, size_t col) +{ + atomic_bitmap_set(self, (row * interact_get_nrows(self)) + col, memory_order_seq_cst); +} + +inline int interact_get(const interact_t *self, size_t row, size_t col) +{ + return atomic_bitmap_get(self, (row * interact_get_nrows(self)) + col, memory_order_relaxed); +} + +inline int interact_test(const interact_t *self, uint32_t x, uint32_t y) +{ + // ensure x < y + // this is because we only keep the upper triangle of the matrix + if (x > y) { + int tmp = x; + x = y; + y = tmp; + } + return interact_get(self, x, y); +} + +void interact_update(interact_t *self, atomic_bitmap_t *bitmap) +{ + size_t i, j; + size_t nrows = interact_get_nrows(self); + size_t ncols = nrows; + for (i = 0; i < nrows - 1; i++) { + if (atomic_bitmap_get(bitmap, i, memory_order_relaxed) == 1) { + atomic_bitmap_clear(bitmap, i, memory_order_relaxed); + for (j = i + 1; j < ncols; j++) { + if (atomic_bitmap_get(bitmap, 
j, memory_order_relaxed) == 1) { + interact_set(self, i, j); + } + } + } + } + atomic_bitmap_clear(bitmap, nrows - 1, memory_order_relaxed); +} + +void interact_print(const interact_t *self) +{ + size_t nrows = interact_get_nrows(self); + size_t ncols = nrows; + printf("Interaction matrix: \n"); + printf(" \t"); + for (size_t i = 0; i < nrows; ++i) printf("%zu ", i); + printf("\n"); + + for (size_t i = 0; i < nrows; ++i) { + printf("%zu \t", i); + for (size_t j = 0; j < ncols; ++j) { + printf("%d ", interact_test(self, i, j)); + if (j > 9) printf(" "); + if (j > 99) printf(" "); + if (j > 999) printf(" "); + } + printf("\n"); + } + + printf("\n"); +} + +/** + * + * @brief Find the support of f. (parallel) + * + * @sideeffect Accumulates in support the variables on which f depends. + * + * If F00 = F01 and F10 = F11, then F does not depend on . If this is the case + * for all the nodes of variable , we say that variables and do not interact. + * + * Performs a tree search on the BDD to accumulate the support array of the variables on which f depends. 
+ * + * (x)F + * / \ + * (y)F0 (y)F1 + * / \ / \ + * F00 F01 F10 F11 + */ +#define find_support(f, lvl_db, support, global, local) RUN(find_support, f, lvl_db, support, global, local) +VOID_TASK_5(find_support, MTBDD, f, levels_t*, lvl_db, atomic_bitmap_t*, support, atomic_bitmap_t*, global, + atomic_bitmap_t*, local) +{ + uint64_t index = f & SYLVAN_TABLE_MASK_INDEX; + if (index == 0 || index == 1 || index == sylvan_invalid) return; + if (f == mtbdd_true || f == mtbdd_false) return; + + if (atomic_bitmap_get(local, index, memory_order_relaxed)) return; + + BDDVAR var = mtbdd_getvar(f); + // set support bitmap: record the order of the variable labelling f + atomic_bitmap_set(support, lvl_db->level_to_order[var], memory_order_relaxed); + + if (!mtbdd_isleaf(f)) { + // visit all nodes reachable from f + MTBDD f1 = mtbdd_gethigh(f); + MTBDD f0 = mtbdd_getlow(f); + CALL(find_support, f1, lvl_db, support, global, local); + CALL(find_support, f0, lvl_db, support, global, local); + } + + // locally visited node used to avoid duplicate node visit for a given tree + atomic_bitmap_set(local, index, memory_order_relaxed); + // globally visited node used to determining root nodes + atomic_bitmap_set(global, index, memory_order_relaxed); +} + +VOID_TASK_IMPL_5(interact_init, interact_t*, self, levels_t*, lvl_db, mrc_t*, mrc, size_t, nvars, size_t, nnodes) +{ + atomic_bitmap_init(self, nvars * nvars); + + atomic_bitmap_t support = (atomic_bitmap_t) { + .container = NULL, + .size = 0 + }; // support bitmap + atomic_bitmap_t global = (atomic_bitmap_t) { + .container = NULL, + .size = 0 + }; // globally visited nodes bitmap (forest wise) + atomic_bitmap_t local = (atomic_bitmap_t) { + .container = NULL, + .size = 0 + }; // locally visited nodes bitmap (tree wise) + + atomic_bitmap_init(&support, nvars); + atomic_bitmap_init(&global, nnodes); + atomic_bitmap_init(&local, nnodes); + + // start the tree traversals only from nodes with external references + for (size_t index = 
atomic_bitmap_first(&mrc->ext_ref_nodes); + index < nodes->table_size; index = atomic_bitmap_next(&mrc->ext_ref_nodes, index)) { + // A node is a root of the DAG if it cannot be reached by nodes above it. + // If a node was never reached during the previous searches, + // then it is a root, and we start a new search from it. + mtbddnode_t node = MTBDD_GETNODE(index); + if (mtbddnode_isleaf(node)) { + // if the node was a leaf, job done + continue; + } + + if (atomic_bitmap_get(&global, index, memory_order_relaxed) == 1) { + // already visited node, thus can not be a root and we can skip it + continue; + } + + // visit all nodes reachable from + MTBDD f1 = mtbddnode_gethigh(node); + MTBDD f0 = mtbddnode_getlow(node); + CALL(find_support, f1, lvl_db, &support, &global, &local); + CALL(find_support, f0, lvl_db, &support, &global, &local); + + BDDVAR var = mtbddnode_getvariable(node); + // set support bitmap, contributes to the outcome of + atomic_bitmap_set(&support, lvl_db->level_to_order[var], memory_order_relaxed); + + // clear locally visited nodes bitmap, + atomic_bitmap_clear_all(&local); + // update interaction matrix + interact_update(self, &support); + } + + atomic_bitmap_deinit(&support); + atomic_bitmap_deinit(&global); + atomic_bitmap_deinit(&local); +} \ No newline at end of file diff --git a/src/sylvan_interact.h b/src/sylvan_interact.h new file mode 100644 index 00000000..3c9257fa --- /dev/null +++ b/src/sylvan_interact.h @@ -0,0 +1,46 @@ +#ifndef SYLVAN_SYLVAN_INTERACT_H +#define SYLVAN_SYLVAN_INTERACT_H + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +typedef atomic_bitmap_t interact_t; + +VOID_TASK_DECL_5(interact_init, interact_t*, levels_t*, mrc_t*, size_t, size_t) +/** + * @brief Initialize the variable interaction matrix. + * + * @details The interaction matrix is a bitmap of size n*n, where n is the number of variables. 
+ * + * @memory: # of variables * # of variables * 1 bit -> O(v^2) + */ +#define interact_init(i, l, m, v, n) RUN(interact_init, i, l, m, v, n) + +void interact_deinit(interact_t *self); + +void interact_set(interact_t *self, size_t row, size_t col); + +int interact_get(const interact_t *self, size_t row, size_t col); + +int interact_test(const interact_t *self, uint32_t x, uint32_t y); + +/** + @brief Marks as interacting all pairs of variables that appear in + support. + + @details If support[i] == support[j] == 1, sets the (i,j) entry + of the interaction matrix to 1. + + @sideeffect Clears support. + +*/ +void interact_update(interact_t *self, atomic_bitmap_t *support); + +void interact_print(const interact_t *self); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif //SYLVAN_SYLVAN_INTERACT_H diff --git a/src/sylvan_levels.c b/src/sylvan_levels.c new file mode 100644 index 00000000..e5c341fd --- /dev/null +++ b/src/sylvan_levels.c @@ -0,0 +1,169 @@ +#include +#include +#include // for errno + +static size_t levels_size = 0; // size of the arrays in levels_t used to realloc memory + +size_t levels_get(levels_t* self, uint64_t level) +{ + return self->table[self->level_to_order[level]]; +} + +size_t levels_get_count(levels_t* self) +{ + return self->count; +} + +uint64_t levels_new_one(levels_t* self) +{ + levels_new_many(1); + return self->table[levels_get_count(self) - 1]; +} + +int levels_new_many(size_t amount) +{ + if (reorder_db->levels.count + amount >= levels_size) { + // just round up to the next multiple of 64 value + // probably better than doubling anyhow... 
+ levels_size = (reorder_db->levels.count + amount + 63) & (~63LL); + reorder_db->levels.table = (_Atomic (uint64_t) *) realloc(reorder_db->levels.table, sizeof(uint64_t[levels_size])); + reorder_db->levels.level_to_order = (_Atomic (uint32_t) *) realloc(reorder_db->levels.level_to_order, sizeof(uint32_t[levels_size])); + reorder_db->levels.order_to_level = (_Atomic (uint32_t) *) realloc(reorder_db->levels.order_to_level, sizeof(uint32_t[levels_size])); + + if (reorder_db->levels.table == NULL || reorder_db->levels.level_to_order == NULL || reorder_db->levels.order_to_level == NULL) { + fprintf(stderr, "levels_new_many failed to realloc new memory: %s!\n", strerror(errno)); + exit(1); + } + } + for (size_t i = 0; i < amount; i++) { + reorder_db->levels.table[reorder_db->levels.count] = sylvan_invalid; + reorder_db->levels.level_to_order[reorder_db->levels.count] = reorder_db->levels.count; + reorder_db->levels.order_to_level[reorder_db->levels.count] = reorder_db->levels.count; + reorder_db->levels.count++; + } + return 1; +} + +uint64_t levels_new_node(levels_t* self, uint32_t level, uint64_t low, uint64_t high) +{ + if (level >= self->count) { + fprintf(stderr, "mtbdd_levels_makenode failed. 
Out of level bounds."); + return 0; + } + + BDDVAR order = self->level_to_order[level]; + self->table[order] = mtbdd_makenode(order, low, high); + + return self->table[order]; +} + +void levels_reset(levels_t* self) +{ + if (levels_size != 0) { + free(self->table); /* free(NULL) is a no-op; the former `if (!ptr)` guard only ever freed NULL and leaked the array */ + self->table = NULL; + + free(self->level_to_order); + self->level_to_order = NULL; + + free(self->order_to_level); + self->order_to_level = NULL; + + self->count = 0; + levels_size = 0; + } +} + +uint64_t levels_ithlevel(levels_t* self, uint32_t level) +{ + if (level < self->count) { + if (levels_get(self, level) == sylvan_invalid){ + levels_new_node(self, level, mtbdd_false, mtbdd_true); + } + if (!llmsset_is_marked(nodes, levels_get(self, level) & SYLVAN_TABLE_MASK_INDEX)) { + levels_new_node(self, level, mtbdd_false, mtbdd_true); + } + } else { + size_t amount = level - self->count + 1; + levels_new_many(amount); + levels_new_node(self, level, mtbdd_false, mtbdd_true); + } + + return levels_get(self, level); +} + +uint32_t levels_order_to_level(levels_t *self, uint32_t var) +{ + if (var < self->count) return self->order_to_level[var]; + else return var; +} + +uint32_t +levels_level_to_order(levels_t *self, uint32_t level) +{ + if (level < self->count) return self->level_to_order[level]; + else return level; +} + +uint32_t levels_swap(levels_t *self, uint32_t x, uint32_t y) +{ + if (x >= self->count || y >= self->count) return 0; + + self->order_to_level[self->level_to_order[x]] = y; + self->order_to_level[self->level_to_order[y]] = x; + + uint32_t tmp = self->level_to_order[x]; + + self->level_to_order[x] = self->level_to_order[y]; + self->level_to_order[y] = tmp; + + return 1; +} + + +/** + * This function is called during garbage collection and + * marks all managed level BDDs so they are kept. 
+ */ +VOID_TASK_0(mtbdd_gc_mark_managed_refs) +{ + for (size_t i = 0; i < reorder_db->levels.count; i++) { + if (reorder_db->levels.table[i] != sylvan_invalid){ + llmsset_mark(nodes, MTBDD_STRIPMARK(reorder_db->levels.table[i])); + } + } +} + +void levels_gc_add_mark_managed_refs(void) +{ + sylvan_gc_add_mark(TASK(mtbdd_gc_mark_managed_refs)); +} + +/** + * Sort level counts using gnome sort. + */ +void levels_gnome_sort(levels_t *self, int *levels_arr, const size_t *level_counts) +{ + unsigned int i = 1; + unsigned int j = 2; + while (i < self->count) { + long p = levels_arr[i - 1] == -1 ? -1 : (long) level_counts[self->level_to_order[levels_arr[i - 1]]]; + long q = levels_arr[i] == -1 ? -1 : (long) level_counts[self->level_to_order[levels_arr[i]]]; + if (p < q) { + int t = levels_arr[i]; + levels_arr[i] = levels_arr[i - 1]; + levels_arr[i - 1] = t; + if (--i) continue; + } + i = j++; + } +} + +// set levels below the threshold to -1 +void levels_mark_threshold(levels_t *self, int *level, const size_t *level_counts, uint32_t threshold) +{ + for (unsigned int i = 0; i < self->count; i++) { + if (level_counts[self->level_to_order[i]] < threshold) level[i] = -1; + else level[i] = i; + } +} \ No newline at end of file diff --git a/src/sylvan_levels.h b/src/sylvan_levels.h new file mode 100644 index 00000000..fc9a5f10 --- /dev/null +++ b/src/sylvan_levels.h @@ -0,0 +1,92 @@ +#ifndef SYLVAN_SYLVAN_LEVELS_H +#define SYLVAN_SYLVAN_LEVELS_H + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** + * When using dynamic variable reordering, it is strongly recommended to use + * "levels" instead of working directly with the internal variables. + * + * Dynamic variable reordering requires that variables are consecutive. + * Initially, variables are assigned linearly, starting with 0. 
+ */ +typedef struct levels_s { + _Atomic(uint64_t)* table; // array holding the 1-node BDD for each level + size_t count; // number of created levels + _Atomic(uint32_t)* level_to_order; // current level wise var permutation (level to variable label) + _Atomic(uint32_t)* order_to_level; // current variable wise level permutation (variable label to level) +} levels_t; + + +/** + * @brief Get the number of levels + */ +size_t levels_get_count(levels_t* self); + +/** + * @brief Create the next level and return the BDD representing the variable (ithlevel) + * @details The BDDs representing managed levels are always kept during garbage collection. + * NOTE: not currently thread-safe. + */ +uint64_t levels_new_one(levels_t* self); + +/** + * @brief Create the next levels + * @details The BDDs representing managed levels are always kept during garbage collection. Not currently thread-safe. + */ +int levels_new_many(size_t amount); + +/** + * @brief Insert a node at given level with given low and high nodes + */ +uint64_t levels_new_node(levels_t* self, uint32_t level, uint64_t low, uint64_t high); + +/** + * \brief Reset all levels. + */ +void levels_reset(levels_t* self); + + +/** + * \brief Get the BDD node representing "if level then true else false" + * \details Order a node does not change after a swap, meaning it is in the same level, + * however, after a swap they can point to a different variable + * \param level for which the BDD needs to be returned + */ +uint64_t levels_ithlevel(levels_t* self, uint32_t level); + +/** + * @brief Get the level of the given variable + */ +uint32_t levels_order_to_level(levels_t* self, uint32_t var); + +/** + * @brief Get the variable of the given level + */ +uint32_t levels_level_to_order(levels_t* self, uint32_t level); + +uint32_t levels_swap(levels_t *self, uint32_t x, uint32_t y); + +/** + * \brief Add callback to mark managed references during garbage collection. 
+ * \details This is used for the dynamic variable reordering. + */ +void levels_gc_add_mark_managed_refs(void); + +/** + * @brief Mark each level_count -1 which is below the threshold. + */ +void levels_mark_threshold(levels_t* self, int* level, const size_t* level_counts, uint32_t threshold); + +/** + * @brief Sort the levels in descending order according to the number of nodes. + */ +void levels_gnome_sort(levels_t* self, int *levels, const size_t* level_counts); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif //SYLVAN_SYLVAN_LEVELS_H diff --git a/src/sylvan_mrc.c b/src/sylvan_mrc.c new file mode 100644 index 00000000..5238f49d --- /dev/null +++ b/src/sylvan_mrc.c @@ -0,0 +1,363 @@ +#include +#include + +#include + +VOID_TASK_DECL_4(mrc_collect_node_ids_par, uint64_t, uint64_t, atomic_bitmap_t*, roaring_bitmap_t*) + +TASK_DECL_3(size_t, mrc_delete_node, mrc_t*, size_t, roaring_bitmap_t*) + +TASK_DECL_5(size_t, mrc_gc_go, mrc_t*, uint64_t, uint64_t, roaring_bitmap_t*, roaring_bitmap_t*) + +/** + * Atomic counters + */ +void atomic_counters16_init(atomic_counters16_t *self, size_t new_size) +{ + atomic_counters16_deinit(self); + self->container = (atomic_counter16_t *) alloc_aligned(sizeof(atomic_counter16_t[new_size])); + if (self->container == NULL) { + fprintf(stderr, "atomic_counter_realloc: Unable to allocate memory: %s!\n", strerror(errno)); + exit(1); + } + self->size = new_size; +} + +void atomic_counters16_deinit(atomic_counters16_t *self) +{ + if (self->container != NULL && self->size > 0) { + free_aligned(self->container, self->size); /* NOTE(review): alloc_aligned() above receives a byte count, this passes the element count - confirm free_aligned()'s unit */ + } + self->size = 0; + self->container = NULL; +} + +void atomic_counters16_add(atomic_counters16_t *self, size_t idx, int val) +{ + if (idx >= self->size) return; /* bounds check first: the read below indexes container[idx] */ + counter16_t curr = atomic_counters16_get(self, idx); + if (curr == 0 && val < 0) return; + if ((curr + val) >= COUNTER16_T_MAX) return; + atomic_counter16_t *ptr = self->container + idx; + atomic_fetch_add_explicit(ptr, val, 
memory_order_relaxed); +} + +counter16_t atomic_counters16_get(const atomic_counters16_t *self, size_t idx) +{ + atomic_counter16_t *ptr = self->container + idx; + return atomic_load_explicit(ptr, memory_order_relaxed); +} + +void atomic_counters32_init(atomic_counters32_t *self, size_t new_size) +{ + atomic_counters32_deinit(self); + self->container = (atomic_counter32_t *) alloc_aligned(sizeof(atomic_counter32_t[new_size])); + if (self->container == NULL) { + fprintf(stderr, "atomic_counter_realloc: Unable to allocate memory: %s!\n", strerror(errno)); + exit(1); + } + self->size = new_size; +} + +void atomic_counters32_deinit(atomic_counters32_t *self) +{ + if (self->container != NULL && self->size > 0) { + free_aligned(self->container, self->size); /* NOTE(review): alloc_aligned() above receives a byte count, this passes the element count - confirm free_aligned()'s unit */ + } + self->size = 0; + self->container = NULL; +} + +void atomic_counters32_add(atomic_counters32_t *self, size_t idx, int val) +{ + if (idx >= self->size) return; /* bounds check first: the read below indexes container[idx] */ + counter32_t curr = atomic_counters32_get(self, idx); + if (curr == 0 && val < 0) return; + if ((curr + val) >= COUNTER32_T_MAX) return; + atomic_counter32_t *ptr = self->container + idx; + atomic_fetch_add_explicit(ptr, val, memory_order_relaxed); +} + +counter32_t atomic_counters32_get(const atomic_counters32_t *self, size_t idx) +{ + atomic_counter32_t *ptr = self->container + idx; + return atomic_load_explicit(ptr, memory_order_relaxed); +} + +/** + * @brief MRC initialization. + * + * @details Traverse the forest and count the number of nodes and variables and their internal and external references. + * + * @preconditions + * - The forest must be initialized. 
+ */ +void mrc_init(mrc_t *self, size_t nvars, size_t nnodes) +{ + // memory usage: # of nodes * sizeof (counter32_t), i.e. one 32-bit counter per node + atomic_counters32_init(&self->ref_nodes, nnodes); + // memory usage: # of variables * sizeof (counter32_t), i.e. one 32-bit counter per variable + atomic_counters32_init(&self->var_nnodes, nvars); + // memory usage: # of nodes * 1 bit (n) + atomic_bitmap_init(&self->ext_ref_nodes, nnodes); + + mrc_nnodes_set(self, 2); + + roaring_uint32_iterator_t it; + roaring_init_iterator(self->node_ids, &it); + roaring_move_uint32_iterator_equalorlarger(&it, 2); + + while (it.has_value) { + size_t index = it.current_value; + roaring_advance_uint32_iterator(&it); + if (index == 0 || index == 1) continue; + mrc_nnodes_add(self, 1); + + mtbddnode_t node = MTBDD_GETNODE(index); + BDDVAR var = mtbddnode_getvariable(node); + mrc_var_nnodes_add(self, var, 1); + + if (mtbddnode_isleaf(node)) continue; + + MTBDD f1 = mtbddnode_gethigh(node); + size_t f1_index = f1 & SYLVAN_TABLE_MASK_INDEX; + if (f1 != sylvan_invalid && (f1_index) != 0 && (f1_index) != 1) { + mrc_ref_nodes_add(self, f1_index, 1); + } + + MTBDD f0 = mtbddnode_getlow(node); + size_t f0_index = f0 & SYLVAN_TABLE_MASK_INDEX; + if (f0 != sylvan_invalid && (f0_index) != 0 && (f0_index) != 1) { + mrc_ref_nodes_add(self, f0_index, 1); + } + } + + roaring_init_iterator(self->node_ids, &it); + roaring_move_uint32_iterator_equalorlarger(&it, 2); /* NOTE(review): the iterator is repositioned here but not read again in this function */ + + mtbdd_re_mark_external_refs(self->ext_ref_nodes.container); + mtbdd_re_mark_protected(self->ext_ref_nodes.container); +} + +void mrc_deinit(mrc_t *self) +{ + if (self->node_ids == NULL) roaring_bitmap_free(self->node_ids); /* NOTE(review): condition looks inverted - as written only NULL is ever passed, so node_ids is never released here; mrc_init does not allocate node_ids, so confirm who owns it before changing */ + atomic_counters32_deinit(&self->ref_nodes); + atomic_counters32_deinit(&self->var_nnodes); + atomic_bitmap_deinit(&self->ext_ref_nodes); +} + +void mrc_nnodes_set(mrc_t *self, int val) +{ + atomic_store_explicit(&self->nnodes, val, memory_order_relaxed); +} + +void mrc_ref_nodes_add(mrc_t *self, size_t idx, int val) +{ + atomic_counters32_add(&self->ref_nodes, idx, 
val); +} + +void mrc_var_nnodes_add(mrc_t *self, size_t idx, int val) +{ + atomic_counters32_add(&self->var_nnodes, idx, val); +} + +void mrc_nnodes_add(mrc_t *self, int val) +{ + size_t curr = mrc_nnodes_get(self); + if (curr == 0 && val < 0) return; + atomic_fetch_add_explicit(&self->nnodes, val, memory_order_relaxed); +} + +counter16_t mrc_ext_ref_nodes_get(const mrc_t *self, size_t idx) +{ + return atomic_bitmap_get(&self->ext_ref_nodes, idx, memory_order_relaxed); +} + +counter32_t mrc_ref_nodes_get(const mrc_t *self, size_t idx) +{ + return atomic_counters32_get(&self->ref_nodes, idx); +} + +counter32_t mrc_var_nnodes_get(const mrc_t *self, size_t idx) +{ + return atomic_counters32_get(&self->var_nnodes, idx); +} + +size_t mrc_nnodes_get(const mrc_t *self) +{ + return atomic_load_explicit(&self->nnodes, memory_order_relaxed); +} + +int mrc_is_node_dead(const mrc_t *self, size_t idx) +{ + counter32_t int_count = mrc_ref_nodes_get(self, idx); /* keep the full 32-bit count: a 16-bit local truncates multiples of 65536 to 0 */ + if (int_count > 0) return 0; + // mrc_ext_ref_nodes_get is an atomic bitmap call which is much more expensive than mrc_ref_nodes_get + // thus, invoke it only if really necessary + counter16_t ext_count = mrc_ext_ref_nodes_get(self, idx); + if (ext_count > 0) return 0; + return llmsset_is_marked(nodes, idx) == 1; +} + +VOID_TASK_IMPL_2(mrc_gc, mrc_t*, self, roaring_bitmap_t*, ids) +{ + roaring_bitmap_t dead_ids; + roaring_bitmap_init_with_capacity(&dead_ids, nodes->table_size); + + size_t deleted_nnodes = CALL(mrc_gc_go, self, 0, nodes->table_size, &dead_ids, ids); + if (deleted_nnodes == 0) return; + + // calling bitmap remove per each node is more expensive than calling it once with many ids + // thus, we group the ids into an array and let the bitmap delete them in one go + roaring_uint32_iterator_t it_old; + roaring_init_iterator(&dead_ids, &it_old); + uint32_t arr[deleted_nnodes]; + size_t x = 0; + while (it_old.has_value) { + arr[x] = it_old.current_value; + roaring_advance_uint32_iterator(&it_old); + x++; + } + 
roaring_bitmap_remove_many(self->node_ids, deleted_nnodes, arr); + +#if SYLVAN_USE_LINEAR_PROBING + sylvan_clear_and_mark(); + sylvan_rehash_all(); +#else + CALL(llmsset_reset_all_regions); +#endif +} + +#define index(x) ((x) & SYLVAN_TABLE_MASK_INDEX) + +TASK_IMPL_5(size_t, mrc_gc_go, mrc_t*, self, uint64_t, first, uint64_t, count, roaring_bitmap_t *, dead_ids, + roaring_bitmap_t *, ids) +{ + roaring_uint32_iterator_t it; + roaring_init_iterator(ids, &it); + if (!roaring_move_uint32_iterator_equalorlarger(&it, first)) return 0; + + size_t deleted = 0; + + const size_t end = first + count; + while (it.has_value && it.current_value < end) { + if (mrc_is_node_dead(self, it.current_value)) { + deleted += CALL(mrc_delete_node, self, it.current_value, dead_ids); + } + roaring_advance_uint32_iterator(&it); + } + if (deleted > 0) mrc_nnodes_add(self, -(int) deleted); + return deleted; +} + +TASK_IMPL_3(size_t, mrc_delete_node, mrc_t*, self, size_t, index, roaring_bitmap_t*, dead_ids) +{ + size_t deleted = 1; + mtbddnode_t f = MTBDD_GETNODE(index); + // roaring_bitmap_add does not allow concurrent writes, thus we invoke recursive mrc_delete_node function sequentially + roaring_bitmap_add(dead_ids, index); + mrc_var_nnodes_add(self, mtbddnode_getvariable(f), -1); + + if (!mtbddnode_isleaf(f)) { + MTBDD f1 = mtbddnode_gethigh(f); + if (f1 != sylvan_invalid && index(f1) != 0 && index(f1) != 1) { + mrc_ref_nodes_add(&reorder_db->mrc, index(f1), -1); + if (mrc_is_node_dead(self, index(f1))) { + deleted += CALL(mrc_delete_node, self, index(f1), dead_ids); + } + } + MTBDD f0 = mtbddnode_getlow(f); + if (f0 != sylvan_invalid && index(f0) != 0 && index(f0) != 1) { + mrc_ref_nodes_add(&reorder_db->mrc, index(f0), -1); + if (mrc_is_node_dead(self, index(f0))) { + deleted += CALL(mrc_delete_node, self, index(f0), dead_ids); + } + } + } +#if !SYLVAN_USE_LINEAR_PROBING + llmsset_clear_one_hash(nodes, index); + llmsset_clear_one_data(nodes, index); +#endif + return deleted; +} + 
+VOID_TASK_IMPL_2(mrc_collect_node_ids, mrc_t*, self, llmsset_t, dbs) +{ + atomic_bitmap_t bitmap = { + .container = dbs->bitmap2, + .size = dbs->table_size + }; + roaring_bitmap_clear(self->node_ids); + roaring_bitmap_init_with_capacity(self->node_ids, llmsset_count_marked(dbs)); + CALL(mrc_collect_node_ids_par, 0, dbs->table_size, &bitmap, self->node_ids); +} + +VOID_TASK_IMPL_4(mrc_collect_node_ids_par, uint64_t, first, uint64_t, count, atomic_bitmap_t*, bitmap, + roaring_bitmap_t *, collected_ids) +{ + if (count > 1024) { + // standard reduction pattern with local roaring bitmaps collecting new node indices + size_t split = count / 2; + roaring_bitmap_t a; + roaring_bitmap_init_cleared(&a); + SPAWN(mrc_collect_node_ids_par, first, split, bitmap, &a); + roaring_bitmap_t b; + roaring_bitmap_init_cleared(&b); + CALL(mrc_collect_node_ids_par, first + split, count - split, bitmap, &b); + roaring_bitmap_or_inplace(collected_ids, &b); + SYNC(mrc_collect_node_ids_par); + roaring_bitmap_or_inplace(collected_ids, &a); + roaring_bitmap_clear(&a); + roaring_bitmap_clear(&b); + return; + } + // skip buckets 0 and 1 + if (first < 2) { + count = count + first - 2; + first = 2; + } + + const size_t end = first + count; + for (first = atomic_bitmap_next(bitmap, first - 1); first < end; first = atomic_bitmap_next(bitmap, first)) { + roaring_bitmap_add(collected_ids, first); + } +} + +MTBDD mrc_make_node(mrc_t *self, BDDVAR var, MTBDD low, MTBDD high, int *created, int add_id) +{ + MTBDD new = mtbdd_varswap_makenode(var, low, high, created); + if (new == mtbdd_invalid) { + return mtbdd_invalid; + } + if (*created) { + mrc_nnodes_add(self, 1); + mrc_var_nnodes_add(self, var, 1); + if (add_id) roaring_bitmap_add(self->node_ids, index(new)); + mrc_ref_nodes_add(self, index(new), 1); + mrc_ref_nodes_add(self, index(high), 1); + mrc_ref_nodes_add(self, index(low), 1); + } else { + mrc_ref_nodes_add(self, index(new), 1); + } + return new; +} + +MTBDD mrc_make_mapnode(mrc_t *self, 
BDDVAR var, MTBDD low, MTBDD high, int *created, int add_id) +{ + MTBDD new = mtbdd_varswap_makemapnode(var, low, high, created); + if (new == mtbdd_invalid) { + return mtbdd_invalid; + } + if (*created) { + mrc_nnodes_add(self, 1); + mrc_var_nnodes_add(self, var, 1); + if (add_id) roaring_bitmap_add(self->node_ids, index(new)); + mrc_ref_nodes_add(self, index(new), 1); + mrc_ref_nodes_add(self, index(high), 1); + mrc_ref_nodes_add(self, index(low), 1); + } else { + mrc_ref_nodes_add(self, index(new), 1); + } + return new; +} \ No newline at end of file diff --git a/src/sylvan_mrc.h b/src/sylvan_mrc.h new file mode 100644 index 00000000..60eeb141 --- /dev/null +++ b/src/sylvan_mrc.h @@ -0,0 +1,123 @@ +#ifndef SYLVAN_BENCHMARKS_SYLVAN_MRC_H +#define SYLVAN_BENCHMARKS_SYLVAN_MRC_H + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#define COUNTER16_T_MAX UINT16_MAX +#define COUNTER32_T_MAX UINT32_MAX + +typedef uint16_t counter16_t; +typedef _Atomic(counter16_t) atomic_counter16_t; + +typedef struct atomic_counters16_s +{ + atomic_counter16_t *container; + size_t size; +} atomic_counters16_t; + +void atomic_counters16_init(atomic_counters16_t* self, size_t new_size); + +void atomic_counters16_deinit(atomic_counters16_t *self); + +void atomic_counters16_add(atomic_counters16_t* self, size_t idx, int val); + +counter16_t atomic_counters16_get(const atomic_counters16_t* self, size_t idx); + +typedef uint32_t counter32_t; +typedef _Atomic(counter32_t) atomic_counter32_t; + +typedef struct atomic_counters32_s +{ + atomic_counter32_t *container; + size_t size; +} atomic_counters32_t; + +void atomic_counters32_init(atomic_counters32_t* self, size_t new_size); + +void atomic_counters32_deinit(atomic_counters32_t *self); + +void atomic_counters32_add(atomic_counters32_t* self, size_t idx, int val); + +counter32_t atomic_counters32_get(const atomic_counters32_t* self, size_t idx); + +/** + * Manual Reference Counter (MRC) for the unique table nodes. 
+ * Used for tracking dead nodes during dynamic variable reordering and + * performing selective garbage collection. + */ +typedef struct mrc_s +{ + roaring_bitmap_t* node_ids; // indices of the nodes unique table + _Atomic(size_t) nnodes; // number of all nodes in DD + atomic_counters32_t ref_nodes; // number of internal references per node + atomic_counters32_t var_nnodes; // number of nodes per variable + atomic_bitmap_t ext_ref_nodes; // nodes with external references +} mrc_t; + +/** + * init/ deinit functions. + */ +void mrc_init(mrc_t* self, size_t nvars, size_t nnodes); + +void mrc_deinit(mrc_t* self); + +/** + * setters + */ +void mrc_nnodes_set(mrc_t* self, int val); + +/** + * adders + */ +void mrc_ref_nodes_add(mrc_t* self, size_t idx, int val); + +void mrc_var_nnodes_add(mrc_t* self, size_t idx, int val); + +void mrc_nnodes_add(mrc_t* self, int val); + +/** + * getters + */ +counter16_t mrc_ext_ref_nodes_get(const mrc_t* self, size_t idx); + +counter32_t mrc_ref_nodes_get(const mrc_t* self, size_t idx); + +counter32_t mrc_var_nnodes_get(const mrc_t* self, size_t idx); + +size_t mrc_nnodes_get(const mrc_t* self); + +/** + * @brief Perform selective garbage collection. + * + * @details This function performs selective garbage collection on the unique table nodes. + * For every node with .ref_count == 0 perform delete and decrease ref count of its children. + * If the children become dead, delete them as well, repeat until no more dead nodes exist. + */ +#define mrc_gc(...) RUN(mrc_gc, __VA_ARGS__) +VOID_TASK_DECL_2(mrc_gc, mrc_t*, roaring_bitmap_t*) + +int mrc_is_node_dead(const mrc_t* self, size_t idx); + +void mrc_delete_node(mrc_t *self, size_t index, roaring_bitmap_t *old_ids); + +/** + * @brief Create a new node in the unique table. (with == 1, not thread-safe!) + */ +MTBDD mrc_make_node(mrc_t *self, BDDVAR var, MTBDD low, MTBDD high, int* created, int add_id); + +/** + * @brief Create a new mapnode in the unique table. (with == 1, not thread-safe!) 
+ */ +MTBDD mrc_make_mapnode(mrc_t *self, BDDVAR var, MTBDD low, MTBDD high, int *created, int add_id); + +#define mrc_collect_node_ids(...) CALL(mrc_collect_node_ids, __VA_ARGS__) +VOID_TASK_DECL_2(mrc_collect_node_ids, mrc_t*, llmsset_t) + + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif //SYLVAN_BENCHMARKS_SYLVAN_MRC_H diff --git a/src/sylvan_mtbdd.c b/src/sylvan_mtbdd.c index 67434f39..c42692ed 100644 --- a/src/sylvan_mtbdd.c +++ b/src/sylvan_mtbdd.c @@ -70,7 +70,7 @@ int64_t mtbdd_getint64(MTBDD leaf) { uint64_t value = mtbdd_getvalue(leaf); - return *(int64_t*)&value; + return *(int64_t *) &value; } // for leaf type 1 (double) @@ -78,7 +78,7 @@ double mtbdd_getdouble(MTBDD leaf) { uint64_t value = mtbdd_getvalue(leaf); - return *(double*)&value; + return *(double *) &value; } /** @@ -109,6 +109,32 @@ refs_table_t mtbdd_refs; refs_table_t mtbdd_protected; static int mtbdd_protected_created = 0; +/* Called during dynamic variable reordering */ +VOID_TASK_IMPL_1(mtbdd_re_mark_external_refs, _Atomic (uint64_t)*, bitmap) +{ + uint64_t *it = refs_iter(&mtbdd_refs, 0, mtbdd_refs.refs_size); + while (it != NULL) { + MTBDD dd = refs_next(&mtbdd_refs, &it, mtbdd_refs.refs_size); + size_t index = (dd & SYLVAN_TABLE_MASK_INDEX); + _Atomic (uint64_t) *ptr = bitmap + BUCKET_OFFSET(index); + uint64_t mask = BIT_MASK(index); + atomic_fetch_or_explicit(ptr, mask, memory_order_relaxed); + } +} + +/* Called during dynamic variable reordering */ +VOID_TASK_IMPL_1(mtbdd_re_mark_protected, _Atomic (uint64_t)*, bitmap) +{ + uint64_t *it = protect_iter(&mtbdd_protected, 0, mtbdd_protected.refs_size); + while (it != NULL) { + BDD *dd = (BDD *) protect_next(&mtbdd_protected, &it, mtbdd_protected.refs_size); + size_t index = (*dd & SYLVAN_TABLE_MASK_INDEX); + _Atomic (uint64_t) *ptr = bitmap + BUCKET_OFFSET(index); + uint64_t mask = BIT_MASK(index); + atomic_fetch_or_explicit(ptr, mask, memory_order_relaxed); + } +} + MDD mtbdd_ref(MDD a) { @@ -138,13 +164,13 @@ 
mtbdd_protect(MTBDD *a) protect_create(&mtbdd_protected, 4096); mtbdd_protected_created = 1; } - protect_up(&mtbdd_protected, (size_t)a); + protect_up(&mtbdd_protected, (size_t) a); } void mtbdd_unprotect(MTBDD *a) { - if (mtbdd_protected.refs_table != NULL) protect_down(&mtbdd_protected, (size_t)a); + if (mtbdd_protected.refs_table != NULL) protect_down(&mtbdd_protected, (size_t) a); } size_t @@ -157,7 +183,7 @@ mtbdd_count_protected() VOID_TASK_0(mtbdd_gc_mark_external_refs) { // iterate through refs hash table, mark all found - size_t count=0; + size_t count = 0; uint64_t *it = refs_iter(&mtbdd_refs, 0, mtbdd_refs.refs_size); while (it != NULL) { SPAWN(mtbdd_gc_mark_rec, refs_next(&mtbdd_refs, &it, mtbdd_refs.refs_size)); @@ -171,10 +197,10 @@ VOID_TASK_0(mtbdd_gc_mark_external_refs) VOID_TASK_0(mtbdd_gc_mark_protected) { // iterate through refs hash table, mark all found - size_t count=0; + size_t count = 0; uint64_t *it = protect_iter(&mtbdd_protected, 0, mtbdd_protected.refs_size); while (it != NULL) { - BDD *to_mark = (BDD*)protect_next(&mtbdd_protected, &it, mtbdd_protected.refs_size); + BDD *to_mark = (BDD *) protect_next(&mtbdd_protected, &it, mtbdd_protected.refs_size); SPAWN(mtbdd_gc_mark_rec, *to_mark); count++; } @@ -234,7 +260,7 @@ VOID_TASK_2(mtbdd_refs_mark_s_par, mtbdd_refs_task_t, begin, size_t, count) Task *t = begin->t; if (!TASK_IS_STOLEN(t)) return; if (t->f == begin->f && TASK_IS_COMPLETED(t)) { - mtbdd_gc_mark_rec(*(MTBDD*)TASK_RESULT(t)); + mtbdd_gc_mark_rec(*(MTBDD *) TASK_RESULT(t)); } begin += 1; count -= 1; @@ -250,9 +276,9 @@ VOID_TASK_2(mtbdd_refs_mark_s_par, mtbdd_refs_task_t, begin, size_t, count) VOID_TASK_0(mtbdd_refs_mark_task) { LOCALIZE_THREAD_LOCAL(mtbdd_refs_key, mtbdd_refs_internal_t); - SPAWN(mtbdd_refs_mark_p_par, mtbdd_refs_key->pbegin, mtbdd_refs_key->pcur-mtbdd_refs_key->pbegin); - SPAWN(mtbdd_refs_mark_r_par, mtbdd_refs_key->rbegin, mtbdd_refs_key->rcur-mtbdd_refs_key->rbegin); - CALL(mtbdd_refs_mark_s_par, 
mtbdd_refs_key->sbegin, mtbdd_refs_key->scur-mtbdd_refs_key->sbegin); + SPAWN(mtbdd_refs_mark_p_par, mtbdd_refs_key->pbegin, mtbdd_refs_key->pcur - mtbdd_refs_key->pbegin); + SPAWN(mtbdd_refs_mark_r_par, mtbdd_refs_key->rbegin, mtbdd_refs_key->rcur - mtbdd_refs_key->rbegin); + CALL(mtbdd_refs_mark_s_par, mtbdd_refs_key->sbegin, mtbdd_refs_key->scur - mtbdd_refs_key->sbegin); SYNC(mtbdd_refs_mark_r_par); SYNC(mtbdd_refs_mark_p_par); } @@ -266,12 +292,12 @@ void mtbdd_refs_init_key(void) { assert(lace_is_worker()); // only use inside Lace workers - mtbdd_refs_internal_t s = (mtbdd_refs_internal_t)malloc(sizeof(struct mtbdd_refs_internal)); - s->pcur = s->pbegin = (const MTBDD**)malloc(sizeof(MTBDD*) * 1024); + mtbdd_refs_internal_t s = (mtbdd_refs_internal_t) malloc(sizeof(struct mtbdd_refs_internal)); + s->pcur = s->pbegin = (const MTBDD **) malloc(sizeof(MTBDD *) * 1024); s->pend = s->pbegin + 1024; - s->rcur = s->rbegin = (MTBDD*)malloc(sizeof(MTBDD) * 1024); + s->rcur = s->rbegin = (MTBDD *) malloc(sizeof(MTBDD) * 1024); s->rend = s->rbegin + 1024; - s->scur = s->sbegin = (mtbdd_refs_task_t)malloc(sizeof(struct mtbdd_refs_task) * 1024); + s->scur = s->sbegin = (mtbdd_refs_task_t) malloc(sizeof(struct mtbdd_refs_task) * 1024); s->send = s->sbegin + 1024; SET_THREAD_LOCAL(mtbdd_refs_key, s); } @@ -293,7 +319,7 @@ mtbdd_refs_ptrs_up(mtbdd_refs_internal_t mtbdd_refs_key) { size_t cur = mtbdd_refs_key->pcur - mtbdd_refs_key->pbegin; size_t size = mtbdd_refs_key->pend - mtbdd_refs_key->pbegin; - mtbdd_refs_key->pbegin = (const MTBDD**)realloc(mtbdd_refs_key->pbegin, sizeof(MTBDD*) * size * 2); + mtbdd_refs_key->pbegin = (const MTBDD **) realloc(mtbdd_refs_key->pbegin, sizeof(MTBDD *) * size * 2); mtbdd_refs_key->pcur = mtbdd_refs_key->pbegin + cur; mtbdd_refs_key->pend = mtbdd_refs_key->pbegin + (size * 2); } @@ -302,7 +328,7 @@ MTBDD __attribute__((noinline)) mtbdd_refs_refs_up(mtbdd_refs_internal_t mtbdd_refs_key, MTBDD res) { long size = mtbdd_refs_key->rend - 
mtbdd_refs_key->rbegin; - mtbdd_refs_key->rbegin = (MTBDD*)realloc(mtbdd_refs_key->rbegin, sizeof(MTBDD) * size * 2); + mtbdd_refs_key->rbegin = (MTBDD *) realloc(mtbdd_refs_key->rbegin, sizeof(MTBDD) * size * 2); mtbdd_refs_key->rcur = mtbdd_refs_key->rbegin + size; mtbdd_refs_key->rend = mtbdd_refs_key->rbegin + (size * 2); return res; @@ -312,7 +338,8 @@ void __attribute__((noinline)) mtbdd_refs_tasks_up(mtbdd_refs_internal_t mtbdd_refs_key) { long size = mtbdd_refs_key->send - mtbdd_refs_key->sbegin; - mtbdd_refs_key->sbegin = (mtbdd_refs_task_t)realloc(mtbdd_refs_key->sbegin, sizeof(struct mtbdd_refs_task) * size * 2); + mtbdd_refs_key->sbegin = (mtbdd_refs_task_t) realloc(mtbdd_refs_key->sbegin, + sizeof(struct mtbdd_refs_task) * size * 2); mtbdd_refs_key->scur = mtbdd_refs_key->sbegin + size; mtbdd_refs_key->send = mtbdd_refs_key->sbegin + (size * 2); } @@ -433,7 +460,8 @@ mtbdd_makeleaf(uint32_t type, uint64_t value) index = custom ? llmsset_lookupc(nodes, n.a, n.b, &created) : llmsset_lookup(nodes, n.a, n.b, &created); if (index == 0) { - fprintf(stderr, "BDD Unique table full, %zu of %zu buckets filled!\n", llmsset_count_marked(nodes), llmsset_get_size(nodes)); + fprintf(stderr, "BDD Unique table full, %zu of %zu buckets filled!\n", llmsset_count_marked(nodes), + llmsset_get_size(nodes)); exit(1); } } @@ -441,7 +469,7 @@ mtbdd_makeleaf(uint32_t type, uint64_t value) if (created) sylvan_stats_count(BDD_NODES_CREATED); else sylvan_stats_count(BDD_NODES_REUSED); - return (MTBDD)index; + return (MTBDD) index; } void @@ -458,10 +486,36 @@ void __attribute__ ((noinline)) _mtbdd_makenode_exit(void) { - fprintf(stderr, "BDD Unique table full, %zu of %zu buckets filled!\n", llmsset_count_marked(nodes), llmsset_get_size(nodes)); + fprintf(stderr, "BDD Unique table full, %zu of %zu buckets filled!\n", llmsset_count_marked(nodes), + llmsset_get_size(nodes)); exit(1); } +MTBDD +_mtbdd_varswap_makenode(BDDVAR var, MTBDD low, MTBDD high, int *created) +{ + // 
Normalization to keep canonicity + // low will have no mark + MTBDD result = low & mtbdd_complement; + low ^= result; + high ^= result; + + struct mtbddnode n; + mtbddnode_makenode(&n, var, low, high); + + uint64_t index = llmsset_lookup(nodes, n.a, n.b, created); + if (index == 0) { + return mtbdd_invalid; + } + + + if (*created) sylvan_stats_count(BDD_NODES_CREATED); + else sylvan_stats_count(BDD_NODES_REUSED); + + result |= index; + return result; +} + MTBDD _mtbdd_makenode(uint32_t var, MTBDD low, MTBDD high) { @@ -490,6 +544,31 @@ _mtbdd_makenode(uint32_t var, MTBDD low, MTBDD high) return result; } +/** + * Custom makemapnode that doesn't trigger garbage collection. + * Instead, returns mtbdd_invalid if we can't create the node. + */ +MTBDD +mtbdd_varswap_makemapnode(BDDVAR var, MTBDD low, MTBDD high, int *created) +{ + struct mtbddnode n; + uint64_t index; + *created = 0; // reset the caller's flag, not the pointer itself + + // in an MTBDDMAP, the low edges eventually lead to 0 and cannot have a low mark + assert(!MTBDD_HASMARK(low)); + + mtbddnode_makemapnode(&n, var, low, high); + + index = llmsset_lookup(nodes, n.a, n.b, created); + if (index == 0) return mtbdd_invalid; + + if (*created) sylvan_stats_count(BDD_NODES_CREATED); + else sylvan_stats_count(BDD_NODES_REUSED); + + return index; +} + MTBDD mtbdd_makemapnode(uint32_t var, MTBDD low, MTBDD high) { @@ -510,7 +589,8 @@ mtbdd_makemapnode(uint32_t var, MTBDD low, MTBDD high) index = llmsset_lookup(nodes, n.a, n.b, &created); if (index == 0) { - fprintf(stderr, "BDD Unique table full, %zu of %zu buckets filled!\n", llmsset_count_marked(nodes), llmsset_get_size(nodes)); + fprintf(stderr, "BDD Unique table full, %zu of %zu buckets filled!\n", llmsset_count_marked(nodes), + llmsset_get_size(nodes)); exit(1); } } @@ -524,7 +604,11 @@ mtbdd_makemapnode(uint32_t var, MTBDD low, MTBDD high) MTBDD mtbdd_ithvar(uint32_t var) { - return mtbdd_makenode(var, mtbdd_false, mtbdd_true); + if (reorder_db != NULL && reorder_db->is_initialised) { + return 
levels_ithlevel(&reorder_db->levels, var); + } else { + return mtbdd_makenode(var, mtbdd_false, mtbdd_true); + } } /* Operations */ @@ -560,7 +644,7 @@ gcd(uint32_t u, uint32_t v) MTBDD mtbdd_int64(int64_t value) { - return mtbdd_makeleaf(0, *(uint64_t*)&value); + return mtbdd_makeleaf(0, *(uint64_t *) &value); } MTBDD @@ -568,7 +652,7 @@ mtbdd_double(double value) { // normalize all 0.0 to 0.0 if (value == 0.0) value = 0.0; - return mtbdd_makeleaf(1, *(uint64_t*)&value); + return mtbdd_makeleaf(1, *(uint64_t *) &value); } MTBDD @@ -578,8 +662,9 @@ mtbdd_fraction(int64_t nom, uint64_t denom) uint32_t c = gcd(nom < 0 ? -nom : nom, denom); nom /= c; denom /= c; - if (nom > 2147483647 || nom < -2147483647 || denom > 4294967295) fprintf(stderr, "mtbdd_fraction: fraction overflow\n"); - return mtbdd_makeleaf(2, (nom<<32)|denom); + if (nom > 2147483647 || nom < -2147483647 || denom > 4294967295) + fprintf(stderr, "mtbdd_fraction: fraction overflow\n"); + return mtbdd_makeleaf(2, (nom << 32) | denom); } /** @@ -596,31 +681,30 @@ mtbdd_cube(MTBDD variables, uint8_t *cube, MTBDD terminal) BDD result; switch (*cube) { - case 0: - result = mtbdd_cube(node_gethigh(variables, n), cube+1, terminal); - result = mtbdd_makenode(mtbddnode_getvariable(n), result, mtbdd_false); - return result; - case 1: - result = mtbdd_cube(node_gethigh(variables, n), cube+1, terminal); - result = mtbdd_makenode(mtbddnode_getvariable(n), mtbdd_false, result); - return result; - case 2: - return mtbdd_cube(node_gethigh(variables, n), cube+1, terminal); - case 3: - { - MTBDD variables2 = node_gethigh(variables, n); - mtbddnode_t n2 = MTBDD_GETNODE(variables2); - uint32_t var2 = mtbddnode_getvariable(n2); - result = mtbdd_cube(node_gethigh(variables2, n2), cube+2, terminal); - BDD low = mtbdd_makenode(var2, result, mtbdd_false); - mtbdd_refs_push(low); - BDD high = mtbdd_makenode(var2, mtbdd_false, result); - mtbdd_refs_pop(1); - result = mtbdd_makenode(mtbddnode_getvariable(n), low, high); - return 
result; - } - default: - return mtbdd_false; // ? + case 0: + result = mtbdd_cube(node_gethigh(variables, n), cube + 1, terminal); + result = mtbdd_makenode(mtbddnode_getvariable(n), result, mtbdd_false); + return result; + case 1: + result = mtbdd_cube(node_gethigh(variables, n), cube + 1, terminal); + result = mtbdd_makenode(mtbddnode_getvariable(n), mtbdd_false, result); + return result; + case 2: + return mtbdd_cube(node_gethigh(variables, n), cube + 1, terminal); + case 3: { + MTBDD variables2 = node_gethigh(variables, n); + mtbddnode_t n2 = MTBDD_GETNODE(variables2); + uint32_t var2 = mtbddnode_getvariable(n2); + result = mtbdd_cube(node_gethigh(variables2, n2), cube + 2, terminal); + BDD low = mtbdd_makenode(var2, result, mtbdd_false); + mtbdd_refs_push(low); + BDD high = mtbdd_makenode(var2, mtbdd_false, result); + mtbdd_refs_pop(1); + result = mtbdd_makenode(mtbddnode_getvariable(n), low, high); + return result; + } + default: + return mtbdd_false; // ? } } @@ -657,62 +741,54 @@ TASK_IMPL_4(MTBDD, mtbdd_union_cube, MTBDD, mtbdd, MTBDD, vars, uint8_t*, cube, MTBDD low = node_getlow(mtbdd, na); MTBDD high = node_gethigh(mtbdd, na); switch (*cube) { - case 0: - { - MTBDD new_low = mtbdd_union_cube(low, node_gethigh(vars, nv), cube+1, terminal); - if (new_low != low) return mtbdd_makenode(v, new_low, high); - else return mtbdd; - } - case 1: - { - MTBDD new_high = mtbdd_union_cube(high, node_gethigh(vars, nv), cube+1, terminal); - if (new_high != high) return mtbdd_makenode(v, low, new_high); - return mtbdd; - } - case 2: - { - mtbdd_refs_spawn(SPAWN(mtbdd_union_cube, high, node_gethigh(vars, nv), cube+1, terminal)); - MTBDD new_low = mtbdd_union_cube(low, node_gethigh(vars, nv), cube+1, terminal); - mtbdd_refs_push(new_low); - MTBDD new_high = mtbdd_refs_sync(SYNC(mtbdd_union_cube)); - mtbdd_refs_pop(1); - if (new_low != low || new_high != high) return mtbdd_makenode(v, new_low, new_high); - return mtbdd; - } - case 3: - { - return mtbdd_false; // currently 
not implemented - } - default: - return mtbdd_false; + case 0: { + MTBDD new_low = mtbdd_union_cube(low, node_gethigh(vars, nv), cube + 1, terminal); + if (new_low != low) return mtbdd_makenode(v, new_low, high); + else return mtbdd; + } + case 1: { + MTBDD new_high = mtbdd_union_cube(high, node_gethigh(vars, nv), cube + 1, terminal); + if (new_high != high) return mtbdd_makenode(v, low, new_high); + return mtbdd; + } + case 2: { + mtbdd_refs_spawn(SPAWN(mtbdd_union_cube, high, node_gethigh(vars, nv), cube + 1, terminal)); + MTBDD new_low = mtbdd_union_cube(low, node_gethigh(vars, nv), cube + 1, terminal); + mtbdd_refs_push(new_low); + MTBDD new_high = mtbdd_refs_sync(SYNC(mtbdd_union_cube)); + mtbdd_refs_pop(1); + if (new_low != low || new_high != high) return mtbdd_makenode(v, new_low, new_high); + return mtbdd; + } + case 3: { + return mtbdd_false; // currently not implemented + } + default: + return mtbdd_false; } } else /* va > v */ { switch (*cube) { - case 0: - { - MTBDD new_low = mtbdd_union_cube(mtbdd, node_gethigh(vars, nv), cube+1, terminal); - return mtbdd_makenode(v, new_low, mtbdd_false); - } - case 1: - { - MTBDD new_high = mtbdd_union_cube(mtbdd, node_gethigh(vars, nv), cube+1, terminal); - return mtbdd_makenode(v, mtbdd_false, new_high); - } - case 2: - { - mtbdd_refs_spawn(SPAWN(mtbdd_union_cube, mtbdd, node_gethigh(vars, nv), cube+1, terminal)); - MTBDD new_low = mtbdd_union_cube(mtbdd, node_gethigh(vars, nv), cube+1, terminal); - mtbdd_refs_push(new_low); - MTBDD new_high = mtbdd_refs_sync(SYNC(mtbdd_union_cube)); - mtbdd_refs_pop(1); - return mtbdd_makenode(v, new_low, new_high); - } - case 3: - { - return mtbdd_false; // currently not implemented - } - default: - return mtbdd_false; + case 0: { + MTBDD new_low = mtbdd_union_cube(mtbdd, node_gethigh(vars, nv), cube + 1, terminal); + return mtbdd_makenode(v, new_low, mtbdd_false); + } + case 1: { + MTBDD new_high = mtbdd_union_cube(mtbdd, node_gethigh(vars, nv), cube + 1, terminal); + return 
mtbdd_makenode(v, mtbdd_false, new_high); + } + case 2: { + mtbdd_refs_spawn(SPAWN(mtbdd_union_cube, mtbdd, node_gethigh(vars, nv), cube + 1, terminal)); + MTBDD new_low = mtbdd_union_cube(mtbdd, node_gethigh(vars, nv), cube + 1, terminal); + mtbdd_refs_push(new_low); + MTBDD new_high = mtbdd_refs_sync(SYNC(mtbdd_union_cube)); + mtbdd_refs_pop(1); + return mtbdd_makenode(v, new_low, new_high); + } + case 3: { + return mtbdd_false; // currently not implemented + } + default: + return mtbdd_false; } } } @@ -733,7 +809,7 @@ TASK_IMPL_3(MTBDD, mtbdd_apply, MTBDD, a, MTBDD, b, mtbdd_apply_op, op) sylvan_stats_count(MTBDD_APPLY); /* Check cache */ - if (cache_get3(CACHE_MTBDD_APPLY, a, b, (size_t)op, &result)) { + if (cache_get3(CACHE_MTBDD_APPLY, a, b, (size_t) op, &result)) { sylvan_stats_count(MTBDD_APPLY_CACHED); return result; } @@ -785,7 +861,7 @@ TASK_IMPL_3(MTBDD, mtbdd_apply, MTBDD, a, MTBDD, b, mtbdd_apply_op, op) result = mtbdd_makenode(v, low, high); /* Store in cache */ - if (cache_put3(CACHE_MTBDD_APPLY, a, b, (size_t)op, result)) { + if (cache_put3(CACHE_MTBDD_APPLY, a, b, (size_t) op, result)) { sylvan_stats_count(MTBDD_APPLY_CACHEDPUT); } @@ -879,7 +955,7 @@ TASK_IMPL_3(MTBDD, mtbdd_uapply, MTBDD, dd, mtbdd_uapply_op, op, size_t, param) /* Check cache */ MTBDD result; - if (cache_get3(CACHE_MTBDD_UAPPLY, dd, (size_t)op, param, &result)) { + if (cache_get3(CACHE_MTBDD_UAPPLY, dd, (size_t) op, param, &result)) { sylvan_stats_count(MTBDD_UAPPLY_CACHED); return result; } @@ -888,7 +964,7 @@ TASK_IMPL_3(MTBDD, mtbdd_uapply, MTBDD, dd, mtbdd_uapply_op, op, size_t, param) result = WRAP(op, dd, param); if (result != mtbdd_invalid) { /* Store in cache */ - if (cache_put3(CACHE_MTBDD_UAPPLY, dd, (size_t)op, param, result)) { + if (cache_put3(CACHE_MTBDD_UAPPLY, dd, (size_t) op, param, result)) { sylvan_stats_count(MTBDD_UAPPLY_CACHEDPUT); } @@ -908,7 +984,7 @@ TASK_IMPL_3(MTBDD, mtbdd_uapply, MTBDD, dd, mtbdd_uapply_op, op, size_t, param) result = 
mtbdd_makenode(mtbddnode_getvariable(ndd), low, high); /* Store in cache */ - if (cache_put3(CACHE_MTBDD_UAPPLY, dd, (size_t)op, param, result)) { + if (cache_put3(CACHE_MTBDD_UAPPLY, dd, (size_t) op, param, result)) { sylvan_stats_count(MTBDD_UAPPLY_CACHEDPUT); } @@ -926,16 +1002,16 @@ TASK_2(MTBDD, mtbdd_uop_times_uint, MTBDD, a, size_t, k) if (mtbddnode_isleaf(na)) { if (mtbddnode_gettype(na) == 0) { int64_t v = mtbdd_getint64(a); - return mtbdd_int64(v*k); + return mtbdd_int64(v * k); } else if (mtbddnode_gettype(na) == 1) { double d = mtbdd_getdouble(a); - return mtbdd_double(d*k); + return mtbdd_double(d * k); } else if (mtbddnode_gettype(na) == 2) { uint64_t v = mtbddnode_getvalue(na); - int64_t n = (int32_t)(v>>32); + int64_t n = (int32_t) (v >> 32); uint32_t d = v; - uint32_t c = gcd(d, (uint32_t)k); - return mtbdd_fraction(n*(k/c), d/c); + uint32_t c = gcd(d, (uint32_t) k); + return mtbdd_fraction(n * (k / c), d / c); } else { assert(0); // failure } @@ -961,7 +1037,7 @@ TASK_2(MTBDD, mtbdd_uop_pow_uint, MTBDD, a, size_t, k) return mtbdd_double(pow(d, k)); } else if (mtbddnode_gettype(na) == 2) { uint64_t v = mtbddnode_getvalue(na); - return mtbdd_fraction(pow((int32_t)(v>>32), k), (uint32_t)v); + return mtbdd_fraction(pow((int32_t) (v >> 32), k), (uint32_t) v); } else { assert(0); // failure } @@ -972,20 +1048,20 @@ TASK_2(MTBDD, mtbdd_uop_pow_uint, MTBDD, a, size_t, k) TASK_IMPL_3(MTBDD, mtbdd_abstract_op_plus, MTBDD, a, MTBDD, b, int, k) { - if (k==0) { + if (k == 0) { return mtbdd_apply(a, b, TASK(mtbdd_op_plus)); } else { - uint64_t factor = 1ULL<>32); - int64_t nom_b = (int32_t)(val_b>>32); - uint64_t denom_a = val_a&0xffffffff; - uint64_t denom_b = val_b&0xffffffff; + int64_t nom_a = (int32_t) (val_a >> 32); + int64_t nom_b = (int32_t) (val_b >> 32); + uint64_t denom_a = val_a & 0xffffffff; + uint64_t denom_b = val_b & 0xffffffff; // common cases if (nom_a == 0) return b; if (nom_b == 0) return a; // equalize denominators uint32_t c = gcd(denom_a, 
denom_b); - nom_a *= denom_b/c; - nom_b *= denom_a/c; - denom_a *= denom_b/c; + nom_a *= denom_b / c; + nom_b *= denom_a / c; + denom_a *= denom_b / c; // add return mtbdd_fraction(nom_a + nom_b, denom_a); } else { @@ -1171,23 +1247,23 @@ TASK_IMPL_2(MTBDD, mtbdd_op_minus, MTBDD*, pa, MTBDD*, pb) uint64_t val_b = mtbddnode_getvalue(nb); if (mtbddnode_gettype(na) == 0 && mtbddnode_gettype(nb) == 0) { // both integer - return mtbdd_int64(*(int64_t*)(&val_a) - *(int64_t*)(&val_b)); + return mtbdd_int64(*(int64_t *) (&val_a) - *(int64_t *) (&val_b)); } else if (mtbddnode_gettype(na) == 1 && mtbddnode_gettype(nb) == 1) { // both double - return mtbdd_double(*(double*)(&val_a) - *(double*)(&val_b)); + return mtbdd_double(*(double *) (&val_a) - *(double *) (&val_b)); } else if (mtbddnode_gettype(na) == 2 && mtbddnode_gettype(nb) == 2) { // both fraction - int64_t nom_a = (int32_t)(val_a>>32); - int64_t nom_b = (int32_t)(val_b>>32); - uint64_t denom_a = val_a&0xffffffff; - uint64_t denom_b = val_b&0xffffffff; + int64_t nom_a = (int32_t) (val_a >> 32); + int64_t nom_b = (int32_t) (val_b >> 32); + uint64_t denom_a = val_a & 0xffffffff; + uint64_t denom_b = val_b & 0xffffffff; // common cases if (nom_b == 0) return a; // equalize denominators uint32_t c = gcd(denom_a, denom_b); - nom_a *= denom_b/c; - nom_b *= denom_a/c; - denom_a *= denom_b/c; + nom_a *= denom_b / c; + nom_b *= denom_a / c; + denom_a *= denom_b / c; // subtract return mtbdd_fraction(nom_a - nom_b, denom_a); } else { @@ -1221,8 +1297,8 @@ TASK_IMPL_2(MTBDD, mtbdd_op_times, MTBDD*, pa, MTBDD*, pb) uint64_t val_b = mtbddnode_getvalue(nb); if (mtbddnode_gettype(na) == 0 && mtbddnode_gettype(nb) == 0) { // both integer - int64_t i_a = *(int64_t*)(&val_a); - int64_t i_b = *(int64_t*)(&val_b); + int64_t i_a = *(int64_t *) (&val_a); + int64_t i_b = *(int64_t *) (&val_b); if (i_a == 0) return a; if (i_b == 0) return b; if (i_a == 1) return b; @@ -1230,8 +1306,8 @@ TASK_IMPL_2(MTBDD, mtbdd_op_times, MTBDD*, pa, 
MTBDD*, pb) return mtbdd_int64(i_a * i_b); } else if (mtbddnode_gettype(na) == 1 && mtbddnode_gettype(nb) == 1) { // both double - double d_a = *(double*)(&val_a); - double d_b = *(double*)(&val_b); + double d_a = *(double *) (&val_a); + double d_b = *(double *) (&val_b); if (d_a == 0.0) return a; if (d_a == 1.0) return b; if (d_b == 0.0) return b; @@ -1239,10 +1315,10 @@ TASK_IMPL_2(MTBDD, mtbdd_op_times, MTBDD*, pa, MTBDD*, pb) return mtbdd_double(d_a * d_b); } else if (mtbddnode_gettype(na) == 2 && mtbddnode_gettype(nb) == 2) { // both fraction - int64_t nom_a = (int32_t)(val_a>>32); - int64_t nom_b = (int32_t)(val_b>>32); - uint64_t denom_a = val_a&0xffffffff; - uint64_t denom_b = val_b&0xffffffff; + int64_t nom_a = (int32_t) (val_a >> 32); + int64_t nom_b = (int32_t) (val_b >> 32); + uint64_t denom_a = val_a & 0xffffffff; + uint64_t denom_b = val_b & 0xffffffff; if (nom_a == 0) return a; if (nom_b == 0) return b; // multiply! @@ -1250,8 +1326,8 @@ TASK_IMPL_2(MTBDD, mtbdd_op_times, MTBDD*, pa, MTBDD*, pb) uint32_t d = gcd(nom_a < 0 ? -nom_a : nom_a, denom_b); nom_a /= d; denom_a /= c; - nom_a *= (nom_b/c); - denom_a *= (denom_b/d); + nom_a *= (nom_b / c); + denom_a *= (denom_b / d); return mtbdd_fraction(nom_a, denom_a); } else { assert(0); // failure @@ -1291,24 +1367,24 @@ TASK_IMPL_2(MTBDD, mtbdd_op_min, MTBDD*, pa, MTBDD*, pb) uint64_t val_b = mtbddnode_getvalue(nb); if (mtbddnode_gettype(na) == 0 && mtbddnode_gettype(nb) == 0) { // both integer - int64_t va = *(int64_t*)(&val_a); - int64_t vb = *(int64_t*)(&val_b); + int64_t va = *(int64_t *) (&val_a); + int64_t vb = *(int64_t *) (&val_b); return va < vb ? a : b; } else if (mtbddnode_gettype(na) == 1 && mtbddnode_gettype(nb) == 1) { // both double - double va = *(double*)&val_a; - double vb = *(double*)&val_b; + double va = *(double *) &val_a; + double vb = *(double *) &val_b; return va < vb ? 
a : b; } else if (mtbddnode_gettype(na) == 2 && mtbddnode_gettype(nb) == 2) { // both fraction - int64_t nom_a = (int32_t)(val_a>>32); - int64_t nom_b = (int32_t)(val_b>>32); - uint64_t denom_a = val_a&0xffffffff; - uint64_t denom_b = val_b&0xffffffff; + int64_t nom_a = (int32_t) (val_a >> 32); + int64_t nom_b = (int32_t) (val_b >> 32); + uint64_t denom_a = val_a & 0xffffffff; + uint64_t denom_b = val_b & 0xffffffff; // equalize denominators uint32_t c = gcd(denom_a, denom_b); - nom_a *= denom_b/c; - nom_b *= denom_a/c; + nom_a *= denom_b / c; + nom_b *= denom_a / c; // compute lowest return nom_a < nom_b ? a : b; } else { @@ -1347,24 +1423,24 @@ TASK_IMPL_2(MTBDD, mtbdd_op_max, MTBDD*, pa, MTBDD*, pb) uint64_t val_b = mtbddnode_getvalue(nb); if (mtbddnode_gettype(na) == 0 && mtbddnode_gettype(nb) == 0) { // both integer - int64_t va = *(int64_t*)(&val_a); - int64_t vb = *(int64_t*)(&val_b); + int64_t va = *(int64_t *) (&val_a); + int64_t vb = *(int64_t *) (&val_b); return va > vb ? a : b; } else if (mtbddnode_gettype(na) == 1 && mtbddnode_gettype(nb) == 1) { // both double - double vval_a = *(double*)&val_a; - double vval_b = *(double*)&val_b; + double vval_a = *(double *) &val_a; + double vval_b = *(double *) &val_b; return vval_a > vval_b ? a : b; } else if (mtbddnode_gettype(na) == 2 && mtbddnode_gettype(nb) == 2) { // both fraction - int64_t nom_a = (int32_t)(val_a>>32); - int64_t nom_b = (int32_t)(val_b>>32); - uint64_t denom_a = val_a&0xffffffff; - uint64_t denom_b = val_b&0xffffffff; + int64_t nom_a = (int32_t) (val_a >> 32); + int64_t nom_b = (int32_t) (val_b >> 32); + uint64_t denom_a = val_a & 0xffffffff; + uint64_t denom_b = val_b & 0xffffffff; // equalize denominators uint32_t c = gcd(denom_a, denom_b); - nom_a *= denom_b/c; - nom_b *= denom_a/c; + nom_a *= denom_b / c; + nom_b *= denom_a / c; // compute highest return nom_a > nom_b ? 
a : b; } else { @@ -1407,7 +1483,7 @@ TASK_IMPL_2(MTBDD, mtbdd_op_cmpl, MTBDD, a, size_t, k) } return mtbdd_invalid; - (void)k; // unused variable + (void) k; // unused variable } TASK_IMPL_2(MTBDD, mtbdd_op_negate, MTBDD, a, size_t, k) @@ -1427,14 +1503,14 @@ TASK_IMPL_2(MTBDD, mtbdd_op_negate, MTBDD, a, size_t, k) return mtbdd_double(-d); } else if (mtbddnode_gettype(na) == 2) { uint64_t v = mtbddnode_getvalue(na); - return mtbdd_fraction(-(int32_t)(v>>32), (uint32_t)v); + return mtbdd_fraction(-(int32_t) (v >> 32), (uint32_t) v); } else { assert(0); // failure } } return mtbdd_invalid; - (void)k; // unused variable + (void) k; // unused variable } /** @@ -1515,11 +1591,11 @@ TASK_IMPL_2(MTBDD, mtbdd_op_threshold_double, MTBDD, a, size_t, svalue) mtbddnode_t na = MTBDD_GETNODE(a); if (mtbddnode_isleaf(na)) { - double value = *(double*)&svalue; + double value = *(double *) &svalue; if (mtbddnode_gettype(na) == 1) { return mtbdd_getdouble(a) >= value ? mtbdd_true : mtbdd_false; } else if (mtbddnode_gettype(na) == 2) { - double d = (double)mtbdd_getnumer(a); + double d = (double) mtbdd_getnumer(a); d /= mtbdd_getdenom(a); return d >= value ? mtbdd_true : mtbdd_false; } else { @@ -1543,11 +1619,11 @@ TASK_IMPL_2(MTBDD, mtbdd_op_strict_threshold_double, MTBDD, a, size_t, svalue) mtbddnode_t na = MTBDD_GETNODE(a); if (mtbddnode_isleaf(na)) { - double value = *(double*)&svalue; + double value = *(double *) &svalue; if (mtbddnode_gettype(na) == 1) { return mtbdd_getdouble(a) > value ? mtbdd_true : mtbdd_false; } else if (mtbddnode_gettype(na) == 2) { - double d = (double)mtbdd_getnumer(a); + double d = (double) mtbdd_getnumer(a); d /= mtbdd_getdenom(a); return d > value ? 
mtbdd_true : mtbdd_false; } else { @@ -1560,12 +1636,12 @@ TASK_IMPL_2(MTBDD, mtbdd_op_strict_threshold_double, MTBDD, a, size_t, svalue) TASK_IMPL_2(MTBDD, mtbdd_threshold_double, MTBDD, dd, double, d) { - return mtbdd_uapply(dd, TASK(mtbdd_op_threshold_double), *(size_t*)&d); + return mtbdd_uapply(dd, TASK(mtbdd_op_threshold_double), *(size_t *) &d); } TASK_IMPL_2(MTBDD, mtbdd_strict_threshold_double, MTBDD, dd, double, d) { - return mtbdd_uapply(dd, TASK(mtbdd_op_strict_threshold_double), *(size_t*)&d); + return mtbdd_uapply(dd, TASK(mtbdd_op_strict_threshold_double), *(size_t *) &d); } /** @@ -1592,7 +1668,7 @@ TASK_4(MTBDD, mtbdd_equal_norm_d2, MTBDD, a, MTBDD, b, size_t, svalue, int*, sho double vb = mtbdd_getdouble(b); va -= vb; if (va < 0) va = -va; - return (va < *(double*)&svalue) ? mtbdd_true : mtbdd_false; + return (va < *(double *) &svalue) ? mtbdd_true : mtbdd_false; } if (b < a) { @@ -1621,9 +1697,9 @@ TASK_4(MTBDD, mtbdd_equal_norm_d2, MTBDD, a, MTBDD, b, size_t, svalue, int*, sho /* Get cofactors */ MTBDD alow, ahigh, blow, bhigh; - alow = va == var ? node_getlow(a, na) : a; + alow = va == var ? node_getlow(a, na) : a; ahigh = va == var ? node_gethigh(a, na) : a; - blow = vb == var ? node_getlow(b, nb) : b; + blow = vb == var ? node_getlow(b, nb) : b; bhigh = vb == var ? 
node_gethigh(b, nb) : b; SPAWN(mtbdd_equal_norm_d2, ahigh, bhigh, svalue, shortcircuit); @@ -1645,7 +1721,7 @@ TASK_IMPL_3(MTBDD, mtbdd_equal_norm_d, MTBDD, a, MTBDD, b, double, d) /* the implementation checks shortcircuit in every task and if the two MTBDDs are not equal module epsilon, then the computation tree quickly aborts */ int shortcircuit = 0; - return CALL(mtbdd_equal_norm_d2, a, b, *(size_t*)&d, &shortcircuit); + return CALL(mtbdd_equal_norm_d2, a, b, *(size_t *) &d, &shortcircuit); } /** @@ -1674,7 +1750,7 @@ TASK_4(MTBDD, mtbdd_equal_norm_rel_d2, MTBDD, a, MTBDD, b, size_t, svalue, int*, if (va == 0) return mtbdd_false; va = (va - vb) / va; if (va < 0) va = -va; - return (va < *(double*)&svalue) ? mtbdd_true : mtbdd_false; + return (va < *(double *) &svalue) ? mtbdd_true : mtbdd_false; } /* Maybe perform garbage collection */ @@ -1697,9 +1773,9 @@ TASK_4(MTBDD, mtbdd_equal_norm_rel_d2, MTBDD, a, MTBDD, b, size_t, svalue, int*, /* Get cofactors */ MTBDD alow, ahigh, blow, bhigh; - alow = va == var ? node_getlow(a, na) : a; + alow = va == var ? node_getlow(a, na) : a; ahigh = va == var ? node_gethigh(a, na) : a; - blow = vb == var ? node_getlow(b, nb) : b; + blow = vb == var ? node_getlow(b, nb) : b; bhigh = vb == var ? node_gethigh(b, nb) : b; SPAWN(mtbdd_equal_norm_rel_d2, ahigh, bhigh, svalue, shortcircuit); @@ -1721,7 +1797,7 @@ TASK_IMPL_3(MTBDD, mtbdd_equal_norm_rel_d, MTBDD, a, MTBDD, b, double, d) /* the implementation checks shortcircuit in every task and if the two MTBDDs are not equal module epsilon, then the computation tree quickly aborts */ int shortcircuit = 0; - return CALL(mtbdd_equal_norm_rel_d2, a, b, *(size_t*)&d, &shortcircuit); + return CALL(mtbdd_equal_norm_rel_d2, a, b, *(size_t *) &d, &shortcircuit); } /** @@ -1764,22 +1840,22 @@ TASK_3(MTBDD, mtbdd_leq_rec, MTBDD, a, MTBDD, b, int*, shortcircuit) if (mtbddnode_gettype(na) == 0 && mtbddnode_gettype(nb) == 0) { // type 0 = integer - result = *(int64_t*)(&va) <= *(int64_t*)(&vb) ? 
mtbdd_true : mtbdd_false; + result = *(int64_t *) (&va) <= *(int64_t *) (&vb) ? mtbdd_true : mtbdd_false; } else if (mtbddnode_gettype(na) == 1 && mtbddnode_gettype(nb) == 1) { // type 1 = double - double vva = *(double*)&va; - double vvb = *(double*)&vb; + double vva = *(double *) &va; + double vvb = *(double *) &vb; result = vva <= vvb ? mtbdd_true : mtbdd_false; } else if (mtbddnode_gettype(na) == 2 && mtbddnode_gettype(nb) == 2) { // type 2 = fraction - int64_t nom_a = (int32_t)(va>>32); - int64_t nom_b = (int32_t)(vb>>32); - uint64_t da = va&0xffffffff; - uint64_t db = vb&0xffffffff; + int64_t nom_a = (int32_t) (va >> 32); + int64_t nom_b = (int32_t) (vb >> 32); + uint64_t da = va & 0xffffffff; + uint64_t db = vb & 0xffffffff; // equalize denominators uint32_t c = gcd(da, db); - nom_a *= db/c; - nom_b *= da/c; + nom_a *= db / c; + nom_b *= da / c; result = nom_a <= nom_b ? mtbdd_true : mtbdd_false; } else { assert(0); // failure @@ -1792,9 +1868,9 @@ TASK_3(MTBDD, mtbdd_leq_rec, MTBDD, a, MTBDD, b, int*, shortcircuit) /* Get cofactors */ MTBDD alow, ahigh, blow, bhigh; - alow = va == var ? node_getlow(a, na) : a; + alow = va == var ? node_getlow(a, na) : a; ahigh = va == var ? node_gethigh(a, na) : a; - blow = vb == var ? node_getlow(b, nb) : b; + blow = vb == var ? node_getlow(b, nb) : b; bhigh = vb == var ? node_gethigh(b, nb) : b; SPAWN(mtbdd_leq_rec, ahigh, bhigh, shortcircuit); @@ -1860,22 +1936,22 @@ TASK_3(MTBDD, mtbdd_less_rec, MTBDD, a, MTBDD, b, int*, shortcircuit) if (mtbddnode_gettype(na) == 0 && mtbddnode_gettype(nb) == 0) { // type 0 = integer - result = *(int64_t*)(&va) < *(int64_t*)(&vb) ? mtbdd_true : mtbdd_false; + result = *(int64_t *) (&va) < *(int64_t *) (&vb) ? mtbdd_true : mtbdd_false; } else if (mtbddnode_gettype(na) == 1 && mtbddnode_gettype(nb) == 1) { // type 1 = double - double vva = *(double*)&va; - double vvb = *(double*)&vb; + double vva = *(double *) &va; + double vvb = *(double *) &vb; result = vva < vvb ? 
mtbdd_true : mtbdd_false; } else if (mtbddnode_gettype(na) == 2 && mtbddnode_gettype(nb) == 2) { // type 2 = fraction - int64_t nom_a = (int32_t)(va>>32); - int64_t nom_b = (int32_t)(vb>>32); - uint64_t da = va&0xffffffff; - uint64_t db = vb&0xffffffff; + int64_t nom_a = (int32_t) (va >> 32); + int64_t nom_b = (int32_t) (vb >> 32); + uint64_t da = va & 0xffffffff; + uint64_t db = vb & 0xffffffff; // equalize denominators uint32_t c = gcd(da, db); - nom_a *= db/c; - nom_b *= da/c; + nom_a *= db / c; + nom_b *= da / c; result = nom_a < nom_b ? mtbdd_true : mtbdd_false; } else { assert(0); // failure @@ -1888,9 +1964,9 @@ TASK_3(MTBDD, mtbdd_less_rec, MTBDD, a, MTBDD, b, int*, shortcircuit) /* Get cofactors */ MTBDD alow, ahigh, blow, bhigh; - alow = va == var ? node_getlow(a, na) : a; + alow = va == var ? node_getlow(a, na) : a; ahigh = va == var ? node_gethigh(a, na) : a; - blow = vb == var ? node_getlow(b, nb) : b; + blow = vb == var ? node_getlow(b, nb) : b; bhigh = vb == var ? node_gethigh(b, nb) : b; SPAWN(mtbdd_less_rec, ahigh, bhigh, shortcircuit); @@ -1956,22 +2032,22 @@ TASK_3(MTBDD, mtbdd_geq_rec, MTBDD, a, MTBDD, b, int*, shortcircuit) if (mtbddnode_gettype(na) == 0 && mtbddnode_gettype(nb) == 0) { // type 0 = integer - result = *(int64_t*)(&va) >= *(int64_t*)(&vb) ? mtbdd_true : mtbdd_false; + result = *(int64_t *) (&va) >= *(int64_t *) (&vb) ? mtbdd_true : mtbdd_false; } else if (mtbddnode_gettype(na) == 1 && mtbddnode_gettype(nb) == 1) { // type 1 = double - double vva = *(double*)&va; - double vvb = *(double*)&vb; + double vva = *(double *) &va; + double vvb = *(double *) &vb; result = vva >= vvb ? 
mtbdd_true : mtbdd_false; } else if (mtbddnode_gettype(na) == 2 && mtbddnode_gettype(nb) == 2) { // type 2 = fraction - int64_t nom_a = (int32_t)(va>>32); - int64_t nom_b = (int32_t)(vb>>32); - uint64_t da = va&0xffffffff; - uint64_t db = vb&0xffffffff; + int64_t nom_a = (int32_t) (va >> 32); + int64_t nom_b = (int32_t) (vb >> 32); + uint64_t da = va & 0xffffffff; + uint64_t db = vb & 0xffffffff; // equalize denominators uint32_t c = gcd(da, db); - nom_a *= db/c; - nom_b *= da/c; + nom_a *= db / c; + nom_b *= da / c; result = nom_a >= nom_b ? mtbdd_true : mtbdd_false; } else { assert(0); // failure @@ -1984,9 +2060,9 @@ TASK_3(MTBDD, mtbdd_geq_rec, MTBDD, a, MTBDD, b, int*, shortcircuit) /* Get cofactors */ MTBDD alow, ahigh, blow, bhigh; - alow = va == var ? node_getlow(a, na) : a; + alow = va == var ? node_getlow(a, na) : a; ahigh = va == var ? node_gethigh(a, na) : a; - blow = vb == var ? node_getlow(b, nb) : b; + blow = vb == var ? node_getlow(b, nb) : b; bhigh = vb == var ? node_gethigh(b, nb) : b; SPAWN(mtbdd_geq_rec, ahigh, bhigh, shortcircuit); @@ -2052,22 +2128,22 @@ TASK_3(MTBDD, mtbdd_greater_rec, MTBDD, a, MTBDD, b, int*, shortcircuit) if (mtbddnode_gettype(na) == 0 && mtbddnode_gettype(nb) == 0) { // type 0 = integer - result = *(int64_t*)(&va) > *(int64_t*)(&vb) ? mtbdd_true : mtbdd_false; + result = *(int64_t *) (&va) > *(int64_t *) (&vb) ? mtbdd_true : mtbdd_false; } else if (mtbddnode_gettype(na) == 1 && mtbddnode_gettype(nb) == 1) { // type 1 = double - double vva = *(double*)&va; - double vvb = *(double*)&vb; + double vva = *(double *) &va; + double vvb = *(double *) &vb; result = vva > vvb ? 
mtbdd_true : mtbdd_false; } else if (mtbddnode_gettype(na) == 2 && mtbddnode_gettype(nb) == 2) { // type 2 = fraction - int64_t nom_a = (int32_t)(va>>32); - int64_t nom_b = (int32_t)(vb>>32); - uint64_t da = va&0xffffffff; - uint64_t db = vb&0xffffffff; + int64_t nom_a = (int32_t) (va >> 32); + int64_t nom_b = (int32_t) (vb >> 32); + uint64_t da = va & 0xffffffff; + uint64_t db = vb & 0xffffffff; // equalize denominators uint32_t c = gcd(da, db); - nom_a *= db/c; - nom_b *= da/c; + nom_a *= db / c; + nom_b *= da / c; result = nom_a > nom_b ? mtbdd_true : mtbdd_false; } else { assert(0); // failure @@ -2080,9 +2156,9 @@ TASK_3(MTBDD, mtbdd_greater_rec, MTBDD, a, MTBDD, b, int*, shortcircuit) /* Get cofactors */ MTBDD alow, ahigh, blow, bhigh; - alow = va == var ? node_getlow(a, na) : a; + alow = va == var ? node_getlow(a, na) : a; ahigh = va == var ? node_gethigh(a, na) : a; - blow = vb == var ? node_getlow(b, nb) : b; + blow = vb == var ? node_getlow(b, nb) : b; bhigh = vb == var ? node_gethigh(b, nb) : b; SPAWN(mtbdd_greater_rec, ahigh, bhigh, shortcircuit); @@ -2159,9 +2235,9 @@ TASK_IMPL_3(MTBDD, mtbdd_and_abstract_plus, MTBDD, a, MTBDD, b, MTBDD, v) } else { /* Get cofactors */ MTBDD alow, ahigh, blow, bhigh; - alow = (!la && va == var) ? node_getlow(a, na) : a; + alow = (!la && va == var) ? node_getlow(a, na) : a; ahigh = (!la && va == var) ? node_gethigh(a, na) : a; - blow = (!lb && vb == var) ? node_getlow(b, nb) : b; + blow = (!lb && vb == var) ? node_getlow(b, nb) : b; bhigh = (!lb && vb == var) ? node_gethigh(b, nb) : b; if (vv == var) { @@ -2240,9 +2316,9 @@ TASK_IMPL_3(MTBDD, mtbdd_and_abstract_max, MTBDD, a, MTBDD, b, MTBDD, v) /* Get cofactors */ MTBDD alow, ahigh, blow, bhigh; - alow = (!la && va == var) ? node_getlow(a, na) : a; + alow = (!la && va == var) ? node_getlow(a, na) : a; ahigh = (!la && va == var) ? node_gethigh(a, na) : a; - blow = (!lb && vb == var) ? node_getlow(b, nb) : b; + blow = (!lb && vb == var) ? 
node_getlow(b, nb) : b; bhigh = (!lb && vb == var) ? node_gethigh(b, nb) : b; if (vv == var) { @@ -2404,8 +2480,8 @@ TASK_IMPL_1(MTBDD, mtbdd_minimum, MTBDD, a) uint64_t denom_h = mtbdd_getdenom(high); // equalize denominators uint32_t c = gcd(denom_l, denom_h); - nom_l *= denom_h/c; - nom_h *= denom_l/c; + nom_l *= denom_h / c; + nom_h *= denom_l / c; result = nom_l < nom_h ? low : high; } else { assert(0); // failure @@ -2463,8 +2539,8 @@ TASK_IMPL_1(MTBDD, mtbdd_maximum, MTBDD, a) uint64_t denom_h = mtbdd_getdenom(high); // equalize denominators uint32_t c = gcd(denom_l, denom_h); - nom_l *= denom_h/c; - nom_h *= denom_l/c; + nom_l *= denom_h / c; + nom_h *= denom_l / c; result = nom_l > nom_h ? low : high; } else { assert(0); // failure @@ -2500,7 +2576,8 @@ TASK_IMPL_2(double, mtbdd_satcount, MTBDD, dd, size_t, nvars) /* Perhaps execute garbage collection */ sylvan_gc_test(); - union { + union + { double d; uint64_t s; } hack; @@ -2511,8 +2588,8 @@ TASK_IMPL_2(double, mtbdd_satcount, MTBDD, dd, size_t, nvars) return hack.d; } - SPAWN(mtbdd_satcount, mtbdd_gethigh(dd), nvars-1); - double low = CALL(mtbdd_satcount, mtbdd_getlow(dd), nvars-1); + SPAWN(mtbdd_satcount, mtbdd_gethigh(dd), nvars - 1); + double low = CALL(mtbdd_satcount, mtbdd_getlow(dd), nvars - 1); hack.d = low + SYNC(mtbdd_satcount); if (cache_put3(CACHE_BDD_SATCOUNT, dd, 0, nvars, hack.s)) { @@ -2554,23 +2631,23 @@ mtbdd_enum_first(MTBDD dd, MTBDD variables, uint8_t *arr, mtbdd_enum_filter_cb f mtbddnode_t n = MTBDD_GETNODE(dd); if (mtbddnode_getvariable(n) != v) { *arr = 2; - return mtbdd_enum_first(dd, variables, arr+1, filter_cb); + return mtbdd_enum_first(dd, variables, arr + 1, filter_cb); } // first maybe follow low - MTBDD res = mtbdd_enum_first(node_getlow(dd, n), variables, arr+1, filter_cb); + MTBDD res = mtbdd_enum_first(node_getlow(dd, n), variables, arr + 1, filter_cb); if (res != mtbdd_false) { *arr = 0; return res; } // if not low, try following high - res = 
mtbdd_enum_first(node_gethigh(dd, n), variables, arr+1, filter_cb); + res = mtbdd_enum_first(node_gethigh(dd, n), variables, arr + 1, filter_cb); if (res != mtbdd_false) { *arr = 1; return res; } - + // we've tried low and high, return false return mtbdd_false; } @@ -2595,12 +2672,12 @@ mtbdd_enum_next(MTBDD dd, MTBDD variables, uint8_t *arr, mtbdd_enum_filter_cb fi if (*arr == 0) { // previous was low mtbddnode_t n = MTBDD_GETNODE(dd); - MTBDD res = mtbdd_enum_next(node_getlow(dd, n), variables, arr+1, filter_cb); + MTBDD res = mtbdd_enum_next(node_getlow(dd, n), variables, arr + 1, filter_cb); if (res != mtbdd_false) { return res; } else { // try to find new in high branch - res = mtbdd_enum_first(node_gethigh(dd, n), variables, arr+1, filter_cb); + res = mtbdd_enum_first(node_gethigh(dd, n), variables, arr + 1, filter_cb); if (res != mtbdd_false) { *arr = 1; return res; @@ -2611,10 +2688,10 @@ mtbdd_enum_next(MTBDD dd, MTBDD variables, uint8_t *arr, mtbdd_enum_filter_cb fi } else if (*arr == 1) { // previous was high mtbddnode_t n = MTBDD_GETNODE(dd); - return mtbdd_enum_next(node_gethigh(dd, n), variables, arr+1, filter_cb); + return mtbdd_enum_next(node_gethigh(dd, n), variables, arr + 1, filter_cb); } else { // previous was either - return mtbdd_enum_next(dd, variables, arr+1, filter_cb); + return mtbdd_enum_next(dd, variables, arr + 1, filter_cb); } } } @@ -2657,14 +2734,14 @@ mtbdd_enum_all_first(MTBDD dd, MTBDD variables, uint8_t *arr, mtbdd_enum_filter_ } // first maybe follow low - MTBDD res = mtbdd_enum_all_first(low, variables, arr+1, filter_cb); + MTBDD res = mtbdd_enum_all_first(low, variables, arr + 1, filter_cb); if (res != mtbdd_false) { *arr = 0; return res; } // if not low, try following high - res = mtbdd_enum_all_first(high, variables, arr+1, filter_cb); + res = mtbdd_enum_all_first(high, variables, arr + 1, filter_cb); if (res != mtbdd_false) { *arr = 1; return res; @@ -2711,10 +2788,10 @@ mtbdd_enum_all_next(MTBDD dd, MTBDD variables, 
uint8_t *arr, mtbdd_enum_filter_c // try recursive next first if (*arr == 0) { - MTBDD res = mtbdd_enum_all_next(low, variables, arr+1, filter_cb); + MTBDD res = mtbdd_enum_all_next(low, variables, arr + 1, filter_cb); if (res != mtbdd_false) return res; } else if (*arr == 1) { - return mtbdd_enum_all_next(high, variables, arr+1, filter_cb); + return mtbdd_enum_all_next(high, variables, arr + 1, filter_cb); // if *arr was 1 and _next returns False, return False } else { // the array is invalid... @@ -2723,7 +2800,7 @@ mtbdd_enum_all_next(MTBDD dd, MTBDD variables, uint8_t *arr, mtbdd_enum_filter_c } // previous was low, try following high - MTBDD res = mtbdd_enum_all_first(high, variables, arr+1, filter_cb); + MTBDD res = mtbdd_enum_all_first(high, variables, arr + 1, filter_cb); if (res == mtbdd_false) return mtbdd_false; // succesful, set arr @@ -2749,8 +2826,8 @@ VOID_TASK_4(mtbdd_enum_par_do, MTBDD, dd, mtbdd_enum_cb, cb, void*, context, mtb mtbddnode_t ndd = MTBDD_GETNODE(dd); uint32_t var = mtbddnode_getvariable(ndd); - struct mtbdd_enum_trace t0 = (struct mtbdd_enum_trace){trace, var, 0}; - struct mtbdd_enum_trace t1 = (struct mtbdd_enum_trace){trace, var, 1}; + struct mtbdd_enum_trace t0 = (struct mtbdd_enum_trace) {trace, var, 0}; + struct mtbdd_enum_trace t1 = (struct mtbdd_enum_trace) {trace, var, 1}; SPAWN(mtbdd_enum_par_do, node_getlow(dd, ndd), cb, context, &t0); CALL(mtbdd_enum_par_do, node_gethigh(dd, ndd), cb, context, &t1); SYNC(mtbdd_enum_par_do); @@ -2782,7 +2859,7 @@ TASK_IMPL_3(MTBDD, mtbdd_eval_compose, MTBDD, dd, MTBDD, vars, mtbdd_eval_compos /* Check cache */ MTBDD result; - if (cache_get3(CACHE_MTBDD_EVAL_COMPOSE, dd, vars, (size_t)cb, &result)) { + if (cache_get3(CACHE_MTBDD_EVAL_COMPOSE, dd, vars, (size_t) cb, &result)) { sylvan_stats_count(MTBDD_EVAL_COMPOSE_CACHED); return result; } @@ -2831,7 +2908,7 @@ TASK_IMPL_3(MTBDD, mtbdd_eval_compose, MTBDD, dd, MTBDD, vars, mtbdd_eval_compos } /* Store in cache */ - if 
(cache_put3(CACHE_MTBDD_EVAL_COMPOSE, dd, vars, (size_t)cb, result)) { + if (cache_put3(CACHE_MTBDD_EVAL_COMPOSE, dd, vars, (size_t) cb, result)) { sylvan_stats_count(MTBDD_EVAL_COMPOSE_CACHEDPUT); } @@ -2872,8 +2949,8 @@ size_t mtbdd_leafcount_more(const MTBDD *mtbdds, size_t count) { size_t result = 0, i; - for (i=0; ilevels, x) +#define sylvan_order_to_level(x) levels_order_to_level(&reorder_db->levels, x) +#define sylvan_levels_count() levels_get_count(&reorder_db->levels) + /** * Initialize MTBDD functionality. * This initializes internal and external referencing datastructures, @@ -155,6 +159,28 @@ static inline MTBDD mtbdd_makenode(uint32_t var, MTBDD low, MTBDD high) return low == high ? low : _mtbdd_makenode(var, low, high); } + /** + * Create an internal MTBDD node of Boolean variable , with low edge and high edge + * that doesn't trigger garbage collection. Instead, returns mtbdd_invalid if we can't create the node. + * is a 24-bit integer. + * Please note that this does NOT check variable ordering! + */ + MTBDD _mtbdd_varswap_makenode(uint32_t var, MTBDD low, MTBDD high, int* created); +static inline MTBDD mtbdd_varswap_makenode(BDDVAR var, MTBDD low, MTBDD high, int* created) +{ + *created = 0; + return low == high ? low : _mtbdd_varswap_makenode(var, low, high, created); +} + +/** + * Create an internal MTBDD map node of Boolean variable , with low edge and high edge . + * that doesn't trigger garbage collection. Instead, returns mtbdd_invalid if we can't create the node. + * is a 24-bit integer. + * Please note that this does NOT check variable ordering! + */ +MTBDD mtbdd_varswap_makemapnode(uint32_t var, MTBDD low, MTBDD high, int* created); + + /** * Return 1 if the MTBDD is a terminal, or 0 otherwise. 
*/ @@ -1018,6 +1044,12 @@ MTBDDMAP mtbdd_map_removeall(MTBDDMAP map, MTBDD variables); VOID_TASK_DECL_1(mtbdd_gc_mark_rec, MTBDD); #define mtbdd_gc_mark_rec(mtbdd) RUN(mtbdd_gc_mark_rec, mtbdd) +VOID_TASK_DECL_1(mtbdd_re_mark_external_refs, _Atomic(uint64_t)*); +#define mtbdd_re_mark_external_refs(bitmap) RUN(mtbdd_re_mark_external_refs, bitmap) + +VOID_TASK_DECL_1(mtbdd_re_mark_protected, _Atomic(uint64_t)*); +#define mtbdd_re_mark_protected(bitmap) RUN(mtbdd_re_mark_protected, bitmap) + /** * Infrastructure for external references using a hash table. * Two hash tables store external references: a pointers table and a values table. diff --git a/src/sylvan_mtbdd_int.h b/src/sylvan_mtbdd_int.h index 4fefc47e..56ea5dcc 100644 --- a/src/sylvan_mtbdd_int.h +++ b/src/sylvan_mtbdd_int.h @@ -133,6 +133,12 @@ mtbddnode_setmark(mtbddnode_t n, int mark) else n->a &= 0xdfffffffffffffff; } +static inline void __attribute__((unused)) +mtbddnode_setvariable(mtbddnode_t n, uint32_t newvar) +{ + n->b = ((uint64_t)newvar)<<40 | (n->b & 0x000000ffffffffff); +} + static inline void __attribute__((unused)) mtbddnode_makeleaf(mtbddnode_t n, uint32_t type, uint64_t value) { diff --git a/src/sylvan_obj.cpp b/src/sylvan_obj.cpp index 2acf0536..0a66d48e 100644 --- a/src/sylvan_obj.cpp +++ b/src/sylvan_obj.cpp @@ -975,3 +975,43 @@ Sylvan::quitPackage() { sylvan_quit(); } + +void Sylvan::initReorder() +{ + sylvan_init_reorder(); +} + +void Sylvan::setReorderThreshold(uint32_t threshold) +{ + sylvan_set_reorder_nodes_threshold(threshold); +} + +void Sylvan::setReorderMaxGrowth(float max_growth) +{ + sylvan_set_reorder_maxgrowth(max_growth); +} + +void Sylvan::setReorderMaxSwap(uint32_t max_swap) +{ + sylvan_set_reorder_maxswap(max_swap); +} + +void Sylvan::setReorderMaxVar(uint32_t max_var) +{ + sylvan_set_reorder_maxvar(max_var); +} + +void Sylvan::setReorderTimeLimit(double time_limit) +{ + sylvan_set_reorder_timelimit_ms(time_limit); +} + +void Sylvan::reduceHeap() +{ + return 
sylvan_reduce_heap(SYLVAN_REORDER_BOUNDED_SIFT); +} + +reorder_result_t Sylvan::reorderPerm(const std::vector &perm) +{ + return sylvan_reorder_perm(perm.data()); +} diff --git a/src/sylvan_obj.hpp b/src/sylvan_obj.hpp index 6ddd7d08..0a28751e 100644 --- a/src/sylvan_obj.hpp +++ b/src/sylvan_obj.hpp @@ -17,844 +17,1032 @@ #ifndef SYLVAN_OBJ_H #define SYLVAN_OBJ_H +#include #include #include #include #include -namespace sylvan { - -class BddSet; -class BddMap; - -class Bdd { - friend class Sylvan; - friend class BddSet; - friend class BddMap; - friend class Mtbdd; - -public: - Bdd() { bdd = sylvan_false; sylvan_protect(&bdd); } - Bdd(const BDD from) : bdd(from) { sylvan_protect(&bdd); } - Bdd(const Bdd &from) : bdd(from.bdd) { sylvan_protect(&bdd); } - Bdd(const uint32_t var) { bdd = sylvan_ithvar(var); sylvan_protect(&bdd); } - ~Bdd() { sylvan_unprotect(&bdd); } - - /** - * @brief Creates a Bdd representing just the variable index in its positive form - * The variable index must be a 0<=index<=2^23 (we use 24 bits internally) - */ - static Bdd bddVar(uint32_t index); - - /** - * @brief Returns the Bdd representing "True" - */ - static Bdd bddOne(); - - /** - * @brief Returns the Bdd representing "False" - */ - static Bdd bddZero(); - - /** - * @brief Returns the Bdd representing a cube of variables, according to the given values. - * @param variables the variables that will be in the cube in their positive or negative form - * @param values a character array describing how the variables will appear in the result - * The length of string must be equal to the number of variables in the cube. - * For every ith char in string, if it is 0, the corresponding variable will appear in its negative form, - * if it is 1, it will appear in its positive form, and if it is 2, it will appear as "any", thus it will - * be skipped. 
- */ - static Bdd bddCube(const BddSet &variables, unsigned char *values); - - /** - * @brief Returns the Bdd representing a cube of variables, according to the given values. - * @param variables the variables that will be in the cube in their positive or negative form - * @param string a character array describing how the variables will appear in the result - * The length of string must be equal to the number of variables in the cube. - * For every ith char in string, if it is 0, the corresponding variable will appear in its negative form, - * if it is 1, it will appear in its positive form, and if it is 2, it will appear as "any", thus it will - * be skipped. - */ - static Bdd bddCube(const BddSet &variables, std::vector values); - - bool operator==(const Bdd& other) const; - bool operator!=(const Bdd& other) const; - Bdd& operator=(const Bdd& right); - bool operator<=(const Bdd& other) const; - bool operator>=(const Bdd& other) const; - bool operator<(const Bdd& other) const; - bool operator>(const Bdd& other) const; - Bdd operator!() const; - Bdd operator~() const; - Bdd operator*(const Bdd& other) const; - Bdd& operator*=(const Bdd& other); - Bdd operator&(const Bdd& other) const; - Bdd& operator&=(const Bdd& other); - Bdd operator+(const Bdd& other) const; - Bdd& operator+=(const Bdd& other); - Bdd operator|(const Bdd& other) const; - Bdd& operator|=(const Bdd& other); - Bdd operator^(const Bdd& other) const; - Bdd& operator^=(const Bdd& other); - Bdd operator-(const Bdd& other) const; - Bdd& operator-=(const Bdd& other); - - /** - * @brief Returns non-zero if this Bdd is bddOne() or bddZero() - */ - bool isConstant() const; - - /** - * @brief Returns non-zero if this Bdd is bddOne() or bddZero() - */ - bool isTerminal() const; - - /** - * @brief Returns non-zero if this Bdd is bddOne() - */ - bool isOne() const; - - /** - * @brief Returns non-zero if this Bdd is bddZero() - */ - bool isZero() const; - - /** - * @brief Returns the top variable index of this 
Bdd (the variable in the root node) - */ - uint32_t TopVar() const; - - /** - * @brief Follows the high edge ("then") of the root node of this Bdd - */ - Bdd Then() const; - - /** - * @brief Follows the low edge ("else") of the root node of this Bdd - */ - Bdd Else() const; - - /** - * @brief Computes \exists cube: f \and g - */ - Bdd AndAbstract(const Bdd& g, const BddSet& cube) const; - - /** - * @brief Computes \exists cube: f - */ - Bdd ExistAbstract(const BddSet& cube) const; - - /** - * @brief Computes \forall cube: f - */ - Bdd UnivAbstract(const BddSet& cube) const; - - /** - * @brief Computes if f then g else h - */ - Bdd Ite(const Bdd& g, const Bdd& h) const; - - /** - * @brief Computes f \and g - */ - Bdd And(const Bdd& g) const; - - /** - * @brief Computes f \or g - */ - Bdd Or(const Bdd& g) const; - - /** - * @brief Computes \not (f \and g) - */ - Bdd Nand(const Bdd& g) const; - - /** - * @brief Computes \not (f \or g) - */ - Bdd Nor(const Bdd& g) const; - - /** - * @brief Computes f \xor g - */ - Bdd Xor(const Bdd& g) const; - - /** - * @brief Computes \not (f \xor g), i.e. f \equiv g - */ - Bdd Xnor(const Bdd& g) const; - - /** - * @brief Returns whether all elements in f are also in g - */ - bool Leq(const Bdd& g) const; - - /** - * @brief Computes the reverse application of a transition relation to this set. - * @param relation the transition relation to apply - * @param cube the variables that are in the transition relation - * This function assumes that s,t are interleaved with s even and t odd (s+1). - * Other variables in the relation are ignored (by existential quantification) - * Set cube to "false" (illegal cube) to assume all encountered variables are in s,t - * - * Use this function to concatenate two relations --> --> - * or to take the 'previous' of a set --> S - */ - Bdd RelPrev(const Bdd& relation, const BddSet& cube) const; - - /** - * @brief Computes the application of a transition relation to this set. 
- * @param relation the transition relation to apply - * @param cube the variables that are in the transition relation - * This function assumes that s,t are interleaved with s even and t odd (s+1). - * Other variables in the relation are ignored (by existential quantification) - * Set cube to "false" (illegal cube) to assume all encountered variables are in s,t - * - * Use this function to take the 'next' of a set S --> - */ - Bdd RelNext(const Bdd& relation, const BddSet& cube) const; - - /** - * @brief Computes the transitive closure by traversing the BDD recursively. - * See Y. Matsunaga, P. C. McGeer, R. K. Brayton - * On Computing the Transitive Closre of a State Transition Relation - * 30th ACM Design Automation Conference, 1993. - */ - Bdd Closure() const; - - /** - * @brief Computes the constrain f @ c - */ - Bdd Constrain(const Bdd &c) const; - - /** - * @brief Computes the BDD restrict according to Coudert and Madre's algorithm (ICCAD90). - */ - Bdd Restrict(const Bdd &c) const; - - /** - * @brief Functional composition. Whenever a variable v in the map m is found in the BDD, - * it is substituted by the associated function. - * You can also use this function to implement variable reordering. - */ - Bdd Compose(const BddMap &m) const; - - /** - * @brief Substitute all variables in the array from by the corresponding variables in to. - */ - Bdd Permute(const std::vector& from, const std::vector& to) const; - - /** - * @brief Computes the support of a Bdd. - */ - Bdd Support() const; - - /** - * @brief Gets the BDD of this Bdd (for C functions) - */ - BDD GetBDD() const; - - /** - * @brief Writes .dot file of this Bdd. Not thread-safe! - */ - void PrintDot(FILE *out) const; - - /** - * @brief Gets a SHA2 hash that describes the structure of this Bdd. - * @param string a character array of at least 65 characters (includes zero-termination) - * This hash is 64 characters long and is independent of the memory locations of BDD nodes. 
- */ - void GetShaHash(char *string) const; - - std::string GetShaHash() const; - - /** - * @brief Computes the number of satisfying variable assignments, using variables in cube. - */ - double SatCount(const BddSet &cube) const; - - /** - * @brief Compute the number of satisfying variable assignments, using the given number of variables. - */ - double SatCount(const size_t nvars) const; - - /** - * @brief Gets one satisfying assignment according to the variables. - * @param variables The set of variables to be assigned, must include the support of the Bdd. - */ - void PickOneCube(const BddSet &variables, uint8_t *string) const; - - /** - * @brief Gets one satisfying assignment according to the variables. - * @param variables The set of variables to be assigned, must include the support of the Bdd. - * Returns an empty vector when either this Bdd equals bddZero() or the cube is empty. - */ - std::vector PickOneCube(const BddSet &variables) const; - - /** - * @brief Gets a cube that satisfies this Bdd. - */ - Bdd PickOneCube() const; - - /** - * @brief Faster version of: *this + Sylvan::bddCube(variables, values); - */ - Bdd UnionCube(const BddSet &variables, uint8_t *values) const; - - /** - * @brief Faster version of: *this + Sylvan::bddCube(variables, values); - */ - Bdd UnionCube(const BddSet &variables, std::vector values) const; - - /** - * @brief Generate a cube representing a set of variables - */ - static Bdd VectorCube(const std::vector variables); - - /** - * @brief Generate a cube representing a set of variables - * @param variables An sorted set of variable indices - */ - static Bdd VariablesCube(const std::vector variables); - - /** - * @brief Gets the number of nodes in this Bdd. Not thread-safe! - */ - size_t NodeCount() const; - -private: - BDD bdd; -}; - -class BddSet +namespace sylvan { - friend class Bdd; - friend class Mtbdd; - Bdd set; - -public: - /** - * @brief Create a new empty set. 
- */ - BddSet() : set(Bdd::bddOne()) {} - - /** - * @brief Wrap the BDD cube in a set. - */ - BddSet(const Bdd &other) : set(other) {} - - /** - * @brief Create a copy of the set . - */ - BddSet(const BddSet &other) : set(other.set) {} - - /** - * @brief Add the variable to this set. - */ - void add(uint32_t variable) { - set *= Bdd::bddVar(variable); - } - - /** - * @brief Add all variables in the set to this set. - */ - void add(BddSet &other) { - set *= other.set; - } - - /** - * @brief Remove the variable from this set. - */ - void remove(uint32_t variable) { - set = set.ExistAbstract(Bdd::bddVar(variable)); - } - - /** - * @brief Remove all variables in the set from this set. - */ - void remove(BddSet &other) { - set = set.ExistAbstract(other.set); - } - - /** - * @brief Retrieve the head of the set. (The first variable.) - */ - uint32_t TopVar() const { - return set.TopVar(); - } - - /** - * @brief Retrieve the tail of the set. (The set containing all but the first variables.) - */ - BddSet Next() const { - Bdd then = set.Then(); - return BddSet(then); - } - - /** - * @brief Return true if this set is empty, or false otherwise. - */ - bool isEmpty() const { - return set.isOne(); - } - - /** - * @brief Return true if this set contains the variable , or false otherwise. - */ - bool contains(uint32_t variable) const { - if (isEmpty()) return false; - else if (TopVar() == variable) return true; - else return Next().contains(variable); - } - - /** - * @brief Return the number of variables in this set. - */ - size_t size() const { - if (isEmpty()) return 0; - else return 1 + Next().size(); - } - - /** - * @brief Create a set containing the variables in . - * It is advised to have the variables in in ascending order. 
- */ - static BddSet fromArray(BDDVAR *arr, size_t length) { - BddSet set; - for (size_t i = 0; i < length; i++) { - set.add(arr[length-i-1]); + + class BddSet; + + class BddMap; + + class Bdd + { + friend class Sylvan; + + friend class BddSet; + + friend class BddMap; + + friend class Mtbdd; + + public: + Bdd() + { + bdd = sylvan_false; + sylvan_protect(&bdd); + } + + Bdd(const BDD from) : bdd(from) + { sylvan_protect(&bdd); } + + Bdd(const Bdd &from) : bdd(from.bdd) + { sylvan_protect(&bdd); } + + Bdd(const uint32_t var) + { + bdd = sylvan_ithvar(var); + sylvan_protect(&bdd); } - return set; - } - - /** - * @brief Create a set containing the variables in . - * It is advised to have the variables in in ascending order. - */ - static BddSet fromVector(const std::vector variables) { - BddSet set; - for (int i=variables.size()-1; i>=0; i--) { - set.set *= variables[i]; + + ~Bdd() + { sylvan_unprotect(&bdd); } + + /** + * @brief Creates a Bdd representing just the variable index in its positive form + * The variable index must be a 0<=index<=2^23 (we use 24 bits internally) + */ + static Bdd bddVar(uint32_t index); + + /** + * @brief Get a MTBDD representing just the level index in its positive form + */ + static Bdd bddLevel(uint32_t index); + + /** + * @brief Get the level of the given variable label + */ + static uint32_t bddVarToLevel(uint32_t index); + + /** + * @brief Returns the Bdd representing "True" + */ + static Bdd bddOne(); + + /** + * @brief Returns the Bdd representing "False" + */ + static Bdd bddZero(); + + /** + * @brief Returns the Bdd representing a cube of variables, according to the given values. + * @param variables the variables that will be in the cube in their positive or negative form + * @param values a character array describing how the variables will appear in the result + * The length of string must be equal to the number of variables in the cube. 
+ * For every ith char in string, if it is 0, the corresponding variable will appear in its negative form, + * if it is 1, it will appear in its positive form, and if it is 2, it will appear as "any", thus it will + * be skipped. + */ + static Bdd bddCube(const BddSet &variables, unsigned char *values); + + /** + * @brief Returns the Bdd representing a cube of variables, according to the given values. + * @param variables the variables that will be in the cube in their positive or negative form + * @param string a character array describing how the variables will appear in the result + * The length of string must be equal to the number of variables in the cube. + * For every ith char in string, if it is 0, the corresponding variable will appear in its negative form, + * if it is 1, it will appear in its positive form, and if it is 2, it will appear as "any", thus it will + * be skipped. + */ + static Bdd bddCube(const BddSet &variables, std::vector values); + + bool operator==(const Bdd &other) const; + + bool operator!=(const Bdd &other) const; + + Bdd &operator=(const Bdd &right); + + bool operator<=(const Bdd &other) const; + + bool operator>=(const Bdd &other) const; + + bool operator<(const Bdd &other) const; + + bool operator>(const Bdd &other) const; + + Bdd operator!() const; + + Bdd operator~() const; + + Bdd operator*(const Bdd &other) const; + + Bdd &operator*=(const Bdd &other); + + Bdd operator&(const Bdd &other) const; + + Bdd &operator&=(const Bdd &other); + + Bdd operator+(const Bdd &other) const; + + Bdd &operator+=(const Bdd &other); + + Bdd operator|(const Bdd &other) const; + + Bdd &operator|=(const Bdd &other); + + Bdd operator^(const Bdd &other) const; + + Bdd &operator^=(const Bdd &other); + + Bdd operator-(const Bdd &other) const; + + Bdd &operator-=(const Bdd &other); + + /** + * @brief Returns non-zero if this Bdd is bddOne() or bddZero() + */ + bool isConstant() const; + + /** + * @brief Returns non-zero if this Bdd is bddOne() or 
bddZero() + */ + bool isTerminal() const; + + /** + * @brief Returns non-zero if this Bdd is bddOne() + */ + bool isOne() const; + + /** + * @brief Returns non-zero if this Bdd is bddZero() + */ + bool isZero() const; + + /** + * @brief Returns the top variable index of this Bdd (the variable in the root node) + */ + uint32_t TopVar() const; + + /** + * @brief Follows the high edge ("then") of the root node of this Bdd + */ + Bdd Then() const; + + /** + * @brief Follows the low edge ("else") of the root node of this Bdd + */ + Bdd Else() const; + + /** + * @brief Computes \exists cube: f \and g + */ + Bdd AndAbstract(const Bdd &g, const BddSet &cube) const; + + /** + * @brief Computes \exists cube: f + */ + Bdd ExistAbstract(const BddSet &cube) const; + + /** + * @brief Computes \forall cube: f + */ + Bdd UnivAbstract(const BddSet &cube) const; + + /** + * @brief Computes if f then g else h + */ + Bdd Ite(const Bdd &g, const Bdd &h) const; + + /** + * @brief Computes f \and g + */ + Bdd And(const Bdd &g) const; + + /** + * @brief Computes f \or g + */ + Bdd Or(const Bdd &g) const; + + /** + * @brief Computes \not (f \and g) + */ + Bdd Nand(const Bdd &g) const; + + /** + * @brief Computes \not (f \or g) + */ + Bdd Nor(const Bdd &g) const; + + /** + * @brief Computes f \xor g + */ + Bdd Xor(const Bdd &g) const; + + /** + * @brief Computes \not (f \xor g), i.e. f \equiv g + */ + Bdd Xnor(const Bdd &g) const; + + /** + * @brief Returns whether all elements in f are also in g + */ + bool Leq(const Bdd &g) const; + + /** + * @brief Computes the reverse application of a transition relation to this set. + * @param relation the transition relation to apply + * @param cube the variables that are in the transition relation + * This function assumes that s,t are interleaved with s even and t odd (s+1). 
+ * Other variables in the relation are ignored (by existential quantification) + * Set cube to "false" (illegal cube) to assume all encountered variables are in s,t + * + * Use this function to concatenate two relations --> --> + * or to take the 'previous' of a set --> S + */ + Bdd RelPrev(const Bdd &relation, const BddSet &cube) const; + + /** + * @brief Computes the application of a transition relation to this set. + * @param relation the transition relation to apply + * @param cube the variables that are in the transition relation + * This function assumes that s,t are interleaved with s even and t odd (s+1). + * Other variables in the relation are ignored (by existential quantification) + * Set cube to "false" (illegal cube) to assume all encountered variables are in s,t + * + * Use this function to take the 'next' of a set S --> + */ + Bdd RelNext(const Bdd &relation, const BddSet &cube) const; + + /** + * @brief Computes the transitive closure by traversing the BDD recursively. + * See Y. Matsunaga, P. C. McGeer, R. K. Brayton + * On Computing the Transitive Closre of a State Transition Relation + * 30th ACM Design Automation Conference, 1993. + */ + Bdd Closure() const; + + /** + * @brief Computes the constrain f @ c + */ + Bdd Constrain(const Bdd &c) const; + + /** + * @brief Computes the BDD restrict according to Coudert and Madre's algorithm (ICCAD90). + */ + Bdd Restrict(const Bdd &c) const; + + /** + * @brief Functional composition. Whenever a variable v in the map m is found in the BDD, + * it is substituted by the associated function. + * You can also use this function to implement variable reordering. + */ + Bdd Compose(const BddMap &m) const; + + /** + * @brief Substitute all variables in the array from by the corresponding variables in to. + */ + Bdd Permute(const std::vector &from, const std::vector &to) const; + + /** + * @brief Computes the support of a Bdd. 
+ */ + Bdd Support() const; + + /** + * @brief Gets the BDD of this Bdd (for C functions) + */ + BDD GetBDD() const; + + /** + * @brief Writes .dot file of this Bdd. Not thread-safe! + */ + void PrintDot(FILE *out) const; + + /** + * @brief Gets a SHA2 hash that describes the structure of this Bdd. + * @param string a character array of at least 65 characters (includes zero-termination) + * This hash is 64 characters long and is independent of the memory locations of BDD nodes. + */ + void GetShaHash(char *string) const; + + std::string GetShaHash() const; + + /** + * @brief Computes the number of satisfying variable assignments, using variables in cube. + */ + double SatCount(const BddSet &cube) const; + + /** + * @brief Compute the number of satisfying variable assignments, using the given number of variables. + */ + double SatCount(const size_t nvars) const; + + /** + * @brief Gets one satisfying assignment according to the variables. + * @param variables The set of variables to be assigned, must include the support of the Bdd. + */ + void PickOneCube(const BddSet &variables, uint8_t *string) const; + + /** + * @brief Gets one satisfying assignment according to the variables. + * @param variables The set of variables to be assigned, must include the support of the Bdd. + * Returns an empty vector when either this Bdd equals bddZero() or the cube is empty. + */ + std::vector PickOneCube(const BddSet &variables) const; + + /** + * @brief Gets a cube that satisfies this Bdd. 
+ */ + Bdd PickOneCube() const; + + /** + * @brief Faster version of: *this + Sylvan::bddCube(variables, values); + */ + Bdd UnionCube(const BddSet &variables, uint8_t *values) const; + + /** + * @brief Faster version of: *this + Sylvan::bddCube(variables, values); + */ + Bdd UnionCube(const BddSet &variables, std::vector values) const; + + /** + * @brief Generate a cube representing a set of variables + */ + static Bdd VectorCube(const std::vector variables); + + /** + * @brief Generate a cube representing a set of variables + * @param variables An sorted set of variable indices + */ + static Bdd VariablesCube(const std::vector variables); + + /** + * @brief Gets the number of nodes in this Bdd. Not thread-safe! + */ + size_t NodeCount() const; + + private: + BDD bdd; + }; + + class BddSet + { + friend class Bdd; + + friend class Mtbdd; + + Bdd set; + + public: + /** + * @brief Create a new empty set. + */ + BddSet() : set(Bdd::bddOne()) + {} + + /** + * @brief Wrap the BDD cube in a set. + */ + BddSet(const Bdd &other) : set(other) + {} + + /** + * @brief Create a copy of the set . + */ + BddSet(const BddSet &other) : set(other.set) + {} + + /** + * @brief Add the variable to this set. + */ + void add(uint32_t variable) + { + set *= Bdd::bddVar(variable); } - return set; - } - - /** - * @brief Create a set containing the variables in . - * It is advised to have the variables in in ascending order. - */ - static BddSet fromVector(const std::vector variables) { - BddSet set; - for (int i=variables.size()-1; i>=0; i--) { - set.add(variables[i]); + + /** + * @brief Add all variables in the set to this set. + */ + void add(BddSet &other) + { + set *= other.set; } - return set; - } - - /** - * @brief Write all variables in this set to . - * @param arr An array of at least size this.size(). - */ - void toArray(BDDVAR *arr) const { - if (!isEmpty()) { - *arr = TopVar(); - Next().toArray(arr+1); + + /** + * @brief Remove the variable from this set. 
+ */ + void remove(uint32_t variable) + { + set = set.ExistAbstract(Bdd::bddVar(variable)); } - } - - /** - * @brief Return the vector of all variables in this set. - */ - std::vector toVector() const { - std::vector result; - Bdd x = set; - while (!x.isOne()) { - result.push_back(x.TopVar()); - x = x.Then(); + + /** + * @brief Remove all variables in the set from this set. + */ + void remove(BddSet &other) + { + set = set.ExistAbstract(other.set); } - return result; - } -}; -class BddMap -{ - friend class Bdd; - BDD bdd; - BddMap(const BDD from) : bdd(from) { sylvan_protect(&bdd); } - BddMap(const Bdd &from) : bdd(from.bdd) { sylvan_protect(&bdd); } -public: - BddMap(const BddMap& from) : bdd(from.bdd) { sylvan_protect(&bdd); } - BddMap() : bdd(sylvan_map_empty()) { sylvan_protect(&bdd); } - ~BddMap() { sylvan_unprotect(&bdd); } - - BddMap(uint32_t key_variable, const Bdd value); - - BddMap operator+(const Bdd& other) const; - BddMap& operator+=(const Bdd& other); - BddMap operator-(const Bdd& other) const; - BddMap& operator-=(const Bdd& other); - - /** - * @brief Adds a key-value pair to the map - */ - void put(uint32_t key, Bdd value); - - /** - * @brief Removes a key-value pair from the map - */ - void removeKey(uint32_t key); - - /** - * @brief Returns the number of key-value pairs in this map - */ - size_t size() const; - - /** - * @brief Returns non-zero when this map is empty - */ - bool isEmpty() const; -}; - -class MtbddMap; - -class Mtbdd { - friend class Sylvan; - friend class MtbddMap; - -public: - Mtbdd() { mtbdd = sylvan_false; mtbdd_protect(&mtbdd); } - Mtbdd(const MTBDD from) : mtbdd(from) { mtbdd_protect(&mtbdd); } - Mtbdd(const Mtbdd &from) : mtbdd(from.mtbdd) { mtbdd_protect(&mtbdd); } - Mtbdd(const Bdd &from) : mtbdd(from.bdd) { mtbdd_protect(&mtbdd); } - ~Mtbdd() { mtbdd_unprotect(&mtbdd); } - - /** - * @brief Creates a Mtbdd leaf representing the int64 value - */ - static Mtbdd int64Terminal(int64_t value); - - /** - * @brief Creates a Mtbdd 
leaf representing the floating-point value - */ - static Mtbdd doubleTerminal(double value); - - /** - * @brief Creates a Mtbdd leaf representing the fraction value / - * Internally, Sylvan uses 32-bit values and reports overflows to stderr. - */ - static Mtbdd fractionTerminal(int64_t nominator, uint64_t denominator); - - /** - * @brief Creates a Mtbdd leaf of type holding value - * This is useful for custom Mtbdd types. - */ - static Mtbdd terminal(uint32_t type, uint64_t value); - - /** - * @brief Creates a Boolean Mtbdd representing jsut the variable index in its positive form - * The variable index must be 0<=index<=2^23 (Sylvan uses 24 bits internally) - */ - static Mtbdd mtbddVar(uint32_t variable); - - /** - * @brief Returns the Boolean Mtbdd representing "True" - */ - static Mtbdd mtbddOne(); - - /** - * @brief Returns the Boolean Mtbdd representing "False" - */ - static Mtbdd mtbddZero(); - - /** - * @brief Returns the Mtbdd representing a cube of variables, according to the given values. - * @param variables the variables that will be in the cube in their positive or negative form - * @param values a character array describing how the variables will appear in the result - * @param terminal the leaf of the cube - * The length of string must be equal to the number of variables in the cube. - * For every ith char in string, if it is 0, the corresponding variable will appear in its negative form, - * if it is 1, it will appear in its positive form, and if it is 2, it will appear as "any", thus it will - * be skipped. - */ - static Mtbdd mtbddCube(const BddSet &variables, unsigned char *values, const Mtbdd &terminal); - - /** - * @brief Returns the Mtbdd representing a cube of variables, according to the given values. 
- * @param variables the variables that will be in the cube in their positive or negative form - * @param values a character array describing how the variables will appear in the result - * @param terminal the leaf of the cube - * The length of string must be equal to the number of variables in the cube. - * For every ith char in string, if it is 0, the corresponding variable will appear in its negative form, - * if it is 1, it will appear in its positive form, and if it is 2, it will appear as "any", thus it will - * be skipped. - */ - static Mtbdd mtbddCube(const BddSet &variables, std::vector values, const Mtbdd &terminal); - - bool operator==(const Mtbdd& other) const; - bool operator!=(const Mtbdd& other) const; - Mtbdd& operator=(const Mtbdd& right); - Mtbdd operator!() const; - Mtbdd operator~() const; - Mtbdd operator*(const Mtbdd& other) const; - Mtbdd& operator*=(const Mtbdd& other); - Mtbdd operator+(const Mtbdd& other) const; - Mtbdd& operator+=(const Mtbdd& other); - Mtbdd operator-(const Mtbdd& other) const; - Mtbdd& operator-=(const Mtbdd& other); - - // not implemented (compared to Bdd): <=, >=, <, >, &, &=, |, |=, ^, ^= - - /** - * @brief Returns non-zero if this Mtbdd is a leaf - */ - bool isTerminal() const; - - /** - * @brief Returns non-zero if this Mtbdd is a leaf - */ - bool isLeaf() const; - - /** - * @brief Returns non-zero if this Mtbdd is mtbddOne() - */ - bool isOne() const; - - /** - * @brief Returns non-zero if this Mtbdd is mtbddZero() - */ - bool isZero() const; - - /** - * @brief Returns the top variable index of this Mtbdd (the variable in the root node) - */ - uint32_t TopVar() const; - - /** - * @brief Follows the high edge ("then") of the root node of this Mtbdd - */ - Mtbdd Then() const; - - /** - * @brief Follows the low edge ("else") of the root node of this Mtbdd - */ - Mtbdd Else() const; - - /** - * @brief Returns the negation of the MTBDD (every terminal negative) - * Do not use this for Boolean MTBDDs, only for 
Integer/Double/Fraction MTBDDs. - */ - Mtbdd Negate() const; - - /** - * @brief Applies the binary operation - */ - Mtbdd Apply(const Mtbdd &other, mtbdd_apply_op op) const; - - /** - * @brief Applies the unary operation with parameter - */ - Mtbdd UApply(mtbdd_uapply_op op, size_t param) const; - - /** - * @brief Computers the abstraction on variables using operator . - * See also: AbstractPlus, AbstractTimes, AbstractMin, AbstractMax - */ - Mtbdd Abstract(const BddSet &variables, mtbdd_abstract_op op) const; - - /** - * @brief Computes if f then g else h - * This Mtbdd must be a Boolean Mtbdd - */ - Mtbdd Ite(const Mtbdd &g, const Mtbdd &h) const; - - /** - * @brief Computes f + g - */ - Mtbdd Plus(const Mtbdd &other) const; - - /** - * @brief Computes f * g - */ - Mtbdd Times(const Mtbdd &other) const; - - /** - * @brief Computes min(f, g) - */ - Mtbdd Min(const Mtbdd &other) const; - - /** - * @brief Computes max(f, g) - */ - Mtbdd Max(const Mtbdd &other) const; - - /** - * @brief Computes abstraction by summation (existential quantification) - */ - Mtbdd AbstractPlus(const BddSet &variables) const; - - /** - * @brief Computes abstraction by multiplication (universal quantification) - */ - Mtbdd AbstractTimes(const BddSet &variables) const; - - /** - * @brief Computes abstraction by minimum - */ - Mtbdd AbstractMin(const BddSet &variables) const; - - /** - * @brief Computes abstraction by maximum - */ - Mtbdd AbstractMax(const BddSet &variables) const; - - /** - * @brief Computes abstraction by summation of f \times g - */ - Mtbdd AndExists(const Mtbdd &other, const BddSet &variables) const; - - /** - * @brief Convert floating-point/fraction Mtbdd to a Boolean Mtbdd, leaf >= value ? true : false - */ - Mtbdd MtbddThreshold(double value) const; - - /** - * @brief Convert floating-point/fraction Mtbdd to a Boolean Mtbdd, leaf > value ? 
true : false - */ - Mtbdd MtbddStrictThreshold(double value) const; - - /** - * @brief Convert floating-point/fraction Mtbdd to a Boolean Mtbdd, leaf >= value ? true : false - * Same as MtbddThreshold (Bdd = Boolean Mtbdd) - */ - Bdd BddThreshold(double value) const; - - /** - * @brief Convert floating-point/fraction Mtbdd to a Boolean Mtbdd, leaf > value ? true : false - * Same as MtbddStrictThreshold (Bdd = Boolean Mtbdd) - */ - Bdd BddStrictThreshold(double value) const; - - /** - * @brief Computes the support of a Mtbdd. - */ - Mtbdd Support() const; - - /** - * @brief Gets the MTBDD of this Mtbdd (for C functions) - */ - MTBDD GetMTBDD() const; - - /** - * @brief Functional composition. Whenever a variable v in the map m is found in the MTBDD, - * it is substituted by the associated function (which should be a Boolean MTBDD) - * You can also use this function to implement variable reordering. - */ - Mtbdd Compose(MtbddMap &m) const; - - /** - * @brief Substitute all variables in the array from by the corresponding variables in to. - */ - Mtbdd Permute(const std::vector& from, const std::vector& to) const; - - /** - * @brief Compute the number of satisfying variable assignments, using variables in cube. - */ - double SatCount(const BddSet &variables) const; - - /** - * @brief Compute the number of satisfying variable assignments, using the given number of variables. - */ - double SatCount(const size_t nvars) const; - - /** - * @brief Gets the number of nodes in this Bdd. Not thread-safe! 
- */ - size_t NodeCount() const; - -private: - MTBDD mtbdd; -}; - -class MtbddMap -{ - friend class Mtbdd; - MTBDD mtbdd; - MtbddMap(MTBDD from) : mtbdd(from) { mtbdd_protect(&mtbdd); } - MtbddMap(Mtbdd &from) : mtbdd(from.mtbdd) { mtbdd_protect(&mtbdd); } -public: - MtbddMap() : mtbdd(mtbdd_map_empty()) { mtbdd_protect(&mtbdd); } - ~MtbddMap() { mtbdd_unprotect(&mtbdd); } - - MtbddMap(uint32_t key_variable, Mtbdd value); - - MtbddMap operator+(const Mtbdd& other) const; - MtbddMap& operator+=(const Mtbdd& other); - MtbddMap operator-(const Mtbdd& other) const; - MtbddMap& operator-=(const Mtbdd& other); - - /** - * @brief Adds a key-value pair to the map - */ - void put(uint32_t key, Mtbdd value); - - /** - * @brief Removes a key-value pair from the map - */ - void removeKey(uint32_t key); - - /** - * @brief Returns the number of key-value pairs in this map - */ - size_t size(); - - /** - * @brief Returns non-zero when this map is empty - */ - bool isEmpty(); -}; - -class Sylvan { -public: - /** - * @brief Initializes the Sylvan framework, call this only once in your program. - * @param initialTableSize The initial size of the nodes table. Must be a power of two. - * @param maxTableSize The maximum size of the nodes table. Must be a power of two. - * @param initialCacheSize The initial size of the operation cache. Must be a power of two. - * @param maxCacheSize The maximum size of the operation cache. Must be a power of two. - */ - static void initPackage(size_t initialTableSize, size_t maxTableSize, size_t initialCacheSize, size_t maxCacheSize); - - /** - * @brief Set the granularity for the BDD operations. - * @param granularity determins operation cache behavior; for higher values (2+) it will use the operation cache less often. - * Values of 3-7 may result in better performance, since occasionally not using the operation cache is fine in practice. - * A granularity of 1 means that every BDD operation will be cached at every variable level. 
- */ - static void setGranularity(int granularity); - - /** - * @brief Retrieve the granularity for the BDD operations. - */ - static int getGranularity(); - - /** - * @brief Initializes the BDD module of the Sylvan framework. - */ - static void initBdd(); - - /** - * @brief Initializes the MTBDD module of the Sylvan framework. - */ - static void initMtbdd(); - - /** - * @brief Frees all memory in use by Sylvan. - * Warning: if you have any Bdd objects which are not bddZero() or bddOne() after this, your program may crash! - */ - static void quitPackage(); -}; + /** + * @brief Retrieve the head of the set. (The first variable.) + */ + uint32_t TopVar() const + { + return set.TopVar(); + } + + /** + * @brief Retrieve the tail of the set. (The set containing all but the first variables.) + */ + BddSet Next() const + { + Bdd then = set.Then(); + return BddSet(then); + } + + /** + * @brief Return true if this set is empty, or false otherwise. + */ + bool isEmpty() const + { + return set.isOne(); + } + + /** + * @brief Return true if this set contains the variable , or false otherwise. + */ + bool contains(uint32_t variable) const + { + if (isEmpty()) return false; + else if (TopVar() == variable) return true; + else return Next().contains(variable); + } + + /** + * @brief Return the number of variables in this set. + */ + size_t size() const + { + if (isEmpty()) return 0; + else return 1 + Next().size(); + } + + /** + * @brief Create a set containing the variables in . + * It is advised to have the variables in in ascending order. + */ + static BddSet fromArray(BDDVAR *arr, size_t length) + { + BddSet set; + for (size_t i = 0; i < length; i++) { + set.add(arr[length - i - 1]); + } + return set; + } + + /** + * @brief Create a set containing the variables in . + * It is advised to have the variables in in ascending order. 
+ */ + static BddSet fromVector(const std::vector variables) + { + BddSet set; + for (int i = variables.size() - 1; i >= 0; i--) { + set.set *= variables[i]; + } + return set; + } + + /** + * @brief Create a set containing the variables in . + * It is advised to have the variables in in ascending order. + */ + static BddSet fromVector(const std::vector variables) + { + BddSet set; + for (int i = variables.size() - 1; i >= 0; i--) { + set.add(variables[i]); + } + return set; + } + + /** + * @brief Write all variables in this set to . + * @param arr An array of at least size this.size(). + */ + void toArray(BDDVAR *arr) const + { + if (!isEmpty()) { + *arr = TopVar(); + Next().toArray(arr + 1); + } + } + + /** + * @brief Return the vector of all variables in this set. + */ + std::vector toVector() const + { + std::vector result; + Bdd x = set; + while (!x.isOne()) { + result.push_back(x.TopVar()); + x = x.Then(); + } + return result; + } + }; + + class BddMap + { + friend class Bdd; + + BDD bdd; + + BddMap(const BDD from) : bdd(from) + { sylvan_protect(&bdd); } + + BddMap(const Bdd &from) : bdd(from.bdd) + { sylvan_protect(&bdd); } + + public: + BddMap(const BddMap &from) : bdd(from.bdd) + { sylvan_protect(&bdd); } + + BddMap() : bdd(sylvan_map_empty()) + { sylvan_protect(&bdd); } + + ~BddMap() + { sylvan_unprotect(&bdd); } + + BddMap(uint32_t key_variable, const Bdd value); + + BddMap operator+(const Bdd &other) const; + + BddMap &operator+=(const Bdd &other); + + BddMap operator-(const Bdd &other) const; + + BddMap &operator-=(const Bdd &other); + + /** + * @brief Adds a key-value pair to the map + */ + void put(uint32_t key, Bdd value); + + /** + * @brief Removes a key-value pair from the map + */ + void removeKey(uint32_t key); + + /** + * @brief Returns the number of key-value pairs in this map + */ + size_t size() const; + + /** + * @brief Returns non-zero when this map is empty + */ + bool isEmpty() const; + }; + + class MtbddMap; + + class Mtbdd + { + friend 
class Sylvan; + + friend class MtbddMap; + + public: + Mtbdd() + { + mtbdd = sylvan_false; + mtbdd_protect(&mtbdd); + } + + Mtbdd(const MTBDD from) : mtbdd(from) + { mtbdd_protect(&mtbdd); } + + Mtbdd(const Mtbdd &from) : mtbdd(from.mtbdd) + { mtbdd_protect(&mtbdd); } + + Mtbdd(const Bdd &from) : mtbdd(from.bdd) + { mtbdd_protect(&mtbdd); } + + ~Mtbdd() + { mtbdd_unprotect(&mtbdd); } + + /** + * @brief Creates a Mtbdd leaf representing the int64 value + */ + static Mtbdd int64Terminal(int64_t value); + + /** + * @brief Creates a Mtbdd leaf representing the floating-point value + */ + static Mtbdd doubleTerminal(double value); + + /** + * @brief Creates a Mtbdd leaf representing the fraction value / + * Internally, Sylvan uses 32-bit values and reports overflows to stderr. + */ + static Mtbdd fractionTerminal(int64_t nominator, uint64_t denominator); + + /** + * @brief Creates a Mtbdd leaf of type holding value + * This is useful for custom Mtbdd types. + */ + static Mtbdd terminal(uint32_t type, uint64_t value); + + /** + * @brief Creates a Boolean Mtbdd representing jsut the variable index in its positive form + * The variable index must be 0<=index<=2^23 (Sylvan uses 24 bits internally) + */ + static Mtbdd mtbddVar(uint32_t variable); + + /** + * @brief Returns the Boolean Mtbdd representing "True" + */ + static Mtbdd mtbddOne(); + + /** + * @brief Returns the Boolean Mtbdd representing "False" + */ + static Mtbdd mtbddZero(); + + /** + * @brief Returns the Mtbdd representing a cube of variables, according to the given values. + * @param variables the variables that will be in the cube in their positive or negative form + * @param values a character array describing how the variables will appear in the result + * @param terminal the leaf of the cube + * The length of string must be equal to the number of variables in the cube. 
+ * For every ith char in string, if it is 0, the corresponding variable will appear in its negative form, + * if it is 1, it will appear in its positive form, and if it is 2, it will appear as "any", thus it will + * be skipped. + */ + static Mtbdd mtbddCube(const BddSet &variables, unsigned char *values, const Mtbdd &terminal); + + /** + * @brief Returns the Mtbdd representing a cube of variables, according to the given values. + * @param variables the variables that will be in the cube in their positive or negative form + * @param values a character array describing how the variables will appear in the result + * @param terminal the leaf of the cube + * The length of string must be equal to the number of variables in the cube. + * For every ith char in string, if it is 0, the corresponding variable will appear in its negative form, + * if it is 1, it will appear in its positive form, and if it is 2, it will appear as "any", thus it will + * be skipped. + */ + static Mtbdd mtbddCube(const BddSet &variables, std::vector values, const Mtbdd &terminal); + + bool operator==(const Mtbdd &other) const; + + bool operator!=(const Mtbdd &other) const; + + Mtbdd &operator=(const Mtbdd &right); + + Mtbdd operator!() const; + + Mtbdd operator~() const; + + Mtbdd operator*(const Mtbdd &other) const; + + Mtbdd &operator*=(const Mtbdd &other); + + Mtbdd operator+(const Mtbdd &other) const; + + Mtbdd &operator+=(const Mtbdd &other); + + Mtbdd operator-(const Mtbdd &other) const; + + Mtbdd &operator-=(const Mtbdd &other); + + // not implemented (compared to Bdd): <=, >=, <, >, &, &=, |, |=, ^, ^= + + /** + * @brief Returns non-zero if this Mtbdd is a leaf + */ + bool isTerminal() const; + + /** + * @brief Returns non-zero if this Mtbdd is a leaf + */ + bool isLeaf() const; + + /** + * @brief Returns non-zero if this Mtbdd is mtbddOne() + */ + bool isOne() const; + + /** + * @brief Returns non-zero if this Mtbdd is mtbddZero() + */ + bool isZero() const; + + /** + * @brief Returns 
the top variable index of this Mtbdd (the variable in the root node) + */ + uint32_t TopVar() const; + + /** + * @brief Follows the high edge ("then") of the root node of this Mtbdd + */ + Mtbdd Then() const; + + /** + * @brief Follows the low edge ("else") of the root node of this Mtbdd + */ + Mtbdd Else() const; + + /** + * @brief Returns the negation of the MTBDD (every terminal negative) + * Do not use this for Boolean MTBDDs, only for Integer/Double/Fraction MTBDDs. + */ + Mtbdd Negate() const; + + /** + * @brief Applies the binary operation + */ + Mtbdd Apply(const Mtbdd &other, mtbdd_apply_op op) const; + + /** + * @brief Applies the unary operation with parameter + */ + Mtbdd UApply(mtbdd_uapply_op op, size_t param) const; + + /** + * @brief Computers the abstraction on variables using operator . + * See also: AbstractPlus, AbstractTimes, AbstractMin, AbstractMax + */ + Mtbdd Abstract(const BddSet &variables, mtbdd_abstract_op op) const; + + /** + * @brief Computes if f then g else h + * This Mtbdd must be a Boolean Mtbdd + */ + Mtbdd Ite(const Mtbdd &g, const Mtbdd &h) const; + + /** + * @brief Computes f + g + */ + Mtbdd Plus(const Mtbdd &other) const; + + /** + * @brief Computes f * g + */ + Mtbdd Times(const Mtbdd &other) const; + + /** + * @brief Computes min(f, g) + */ + Mtbdd Min(const Mtbdd &other) const; + + /** + * @brief Computes max(f, g) + */ + Mtbdd Max(const Mtbdd &other) const; + + /** + * @brief Computes abstraction by summation (existential quantification) + */ + Mtbdd AbstractPlus(const BddSet &variables) const; + + /** + * @brief Computes abstraction by multiplication (universal quantification) + */ + Mtbdd AbstractTimes(const BddSet &variables) const; + + /** + * @brief Computes abstraction by minimum + */ + Mtbdd AbstractMin(const BddSet &variables) const; + + /** + * @brief Computes abstraction by maximum + */ + Mtbdd AbstractMax(const BddSet &variables) const; + + /** + * @brief Computes abstraction by summation of f \times g + */ + 
Mtbdd AndExists(const Mtbdd &other, const BddSet &variables) const; + + /** + * @brief Convert floating-point/fraction Mtbdd to a Boolean Mtbdd, leaf >= value ? true : false + */ + Mtbdd MtbddThreshold(double value) const; + + /** + * @brief Convert floating-point/fraction Mtbdd to a Boolean Mtbdd, leaf > value ? true : false + */ + Mtbdd MtbddStrictThreshold(double value) const; + + /** + * @brief Convert floating-point/fraction Mtbdd to a Boolean Mtbdd, leaf >= value ? true : false + * Same as MtbddThreshold (Bdd = Boolean Mtbdd) + */ + Bdd BddThreshold(double value) const; + + /** + * @brief Convert floating-point/fraction Mtbdd to a Boolean Mtbdd, leaf > value ? true : false + * Same as MtbddStrictThreshold (Bdd = Boolean Mtbdd) + */ + Bdd BddStrictThreshold(double value) const; + + /** + * @brief Computes the support of a Mtbdd. + */ + Mtbdd Support() const; + + /** + * @brief Gets the MTBDD of this Mtbdd (for C functions) + */ + MTBDD GetMTBDD() const; + + /** + * @brief Functional composition. Whenever a variable v in the map m is found in the MTBDD, + * it is substituted by the associated function (which should be a Boolean MTBDD) + * You can also use this function to implement variable reordering. + */ + Mtbdd Compose(MtbddMap &m) const; + + /** + * @brief Substitute all variables in the array from by the corresponding variables in to. + */ + Mtbdd Permute(const std::vector &from, const std::vector &to) const; + + /** + * @brief Compute the number of satisfying variable assignments, using variables in cube. + */ + double SatCount(const BddSet &variables) const; + + /** + * @brief Compute the number of satisfying variable assignments, using the given number of variables. + */ + double SatCount(const size_t nvars) const; + + /** + * @brief Gets the number of nodes in this Bdd. Not thread-safe! 
+ */ + size_t NodeCount() const; + + private: + MTBDD mtbdd; + }; + + class MtbddMap + { + friend class Mtbdd; + + MTBDD mtbdd; + + MtbddMap(MTBDD from) : mtbdd(from) + { mtbdd_protect(&mtbdd); } + + MtbddMap(Mtbdd &from) : mtbdd(from.mtbdd) + { mtbdd_protect(&mtbdd); } + + public: + MtbddMap() : mtbdd(mtbdd_map_empty()) + { mtbdd_protect(&mtbdd); } + + ~MtbddMap() + { mtbdd_unprotect(&mtbdd); } + + MtbddMap(uint32_t key_variable, Mtbdd value); + + MtbddMap operator+(const Mtbdd &other) const; + + MtbddMap &operator+=(const Mtbdd &other); + + MtbddMap operator-(const Mtbdd &other) const; + + MtbddMap &operator-=(const Mtbdd &other); + + /** + * @brief Adds a key-value pair to the map + */ + void put(uint32_t key, Mtbdd value); + + /** + * @brief Removes a key-value pair from the map + */ + void removeKey(uint32_t key); + + /** + * @brief Returns the number of key-value pairs in this map + */ + size_t size(); + + /** + * @brief Returns non-zero when this map is empty + */ + bool isEmpty(); + }; + + class Sylvan + { + public: + /** + * @brief Initializes the Sylvan framework, call this only once in your program. + * @param initialTableSize The initial size of the nodes table. Must be a power of two. + * @param maxTableSize The maximum size of the nodes table. Must be a power of two. + * @param initialCacheSize The initial size of the operation cache. Must be a power of two. + * @param maxCacheSize The maximum size of the operation cache. Must be a power of two. + */ + static void + initPackage(size_t initialTableSize, size_t maxTableSize, size_t initialCacheSize, size_t maxCacheSize); + + /** + * @brief Set the granularity for the BDD operations. + * @param granularity determins operation cache behavior; for higher values (2+) it will use the operation cache less often. + * Values of 3-7 may result in better performance, since occasionally not using the operation cache is fine in practice. 
+ * A granularity of 1 means that every BDD operation will be cached at every variable level. + */ + static void setGranularity(int granularity); + + /** + * @brief Retrieve the granularity for the BDD operations. + */ + static int getGranularity(); + + /** + * @brief Initializes the BDD module of the Sylvan framework. + */ + static void initBdd(); + + /** + * @brief Initializes the MTBDD module of the Sylvan framework. + */ + static void initMtbdd(); + + /** + * @brief Initializes the Dynamic variable reordering module of the Sylvan framework. + */ + static void initReorder(); + + /** + * @brief Set threshold for the number of nodes per level to consider during the reordering. + * @details If the number of nodes per level is less than the threshold, the level is skipped during the reordering. + * The default value is 32. + * @param threshold The threshold for the number of nodes per level. + */ + static void setReorderThreshold(uint32_t threshold); + + /** + * @brief Set the maximum growth coefficient. + * @details The maximum growth coefficient is used to calculate the maximum growth of the number of nodes during the reordering. + * If the number of nodes grows more than the maximum growth coefficient , sift up/down is terminated. + */ + static void setReorderMaxGrowth(float max_growth); + + /** + * @brief Set the maximum number of swaps per sifting. + */ + static void setReorderMaxSwap(uint32_t max_swap); + + /** + * @brief Set the maximum number of vars swapped per sifting. + */ + static void setReorderMaxVar(uint32_t max_var); + + /** + * @brief Set the time limit for the reordering. + */ + static void setReorderTimeLimit(double time_limit); + + /** + @brief Reduce the heap size in the entire forest. + + @details Implementation of Rudell's sifting algorithm. +
    +
  1. Order all the variables according to the number of entries + in each unique table. +
  2. Sift the variable up and down, remembering each time the + total size of the BDD. +
  3. Select the best permutation. +
  4. Repeat 2 and 3 for all variables in the given range. +
+ */ + static void reduceHeap(); + + /** + @brief Reorder the variables in the BDDs according to the given permutation. + + @details The permutation is an array of BDD labels, where the i-th element is the label + of the variable that should be moved to position i. The size + of the array should be equal to or greater than the number of variables + currently in use and should be less than or equal to the number of levels. + */ + static reorder_result_t reorderPerm(const std::vector &perm); + + /** + * @brief Get the number of created levels + */ + static uint32_t getLevelsCount(); + + /** + * @brief Frees all memory in use by Sylvan. + * Warning: if you have any Bdd objects which are not bddZero() or bddOne() after this, your program may crash! + */ + static void quitPackage(); + }; } diff --git a/src/sylvan_reorder.c b/src/sylvan_reorder.c new file mode 100644 index 00000000..4ff90502 --- /dev/null +++ b/src/sylvan_reorder.c @@ -0,0 +1,460 @@ +/* + * Copyright 2016 Tom van Dijk, Johannes Kepler University Linz + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include "sylvan_align.h" + +#define STATS 0 // useful information w.r.t. dynamic reordering for debugging +#define INFO 1 // useful information w.r.t. 
dynamic reordering + +VOID_TASK_DECL_1(sylvan_reorder_stop_world, reordering_type_t) + +#define sylvan_reorder_stop_world(type) RUN(sylvan_reorder_stop_world, type) + +TASK_DECL_2(reorder_result_t, sylvan_sift, uint32_t, uint32_t) + +#define sylvan_sift(v, limit) CALL(sylvan_sift, v, limit) + +TASK_DECL_2(reorder_result_t, sylvan_bounded_sift, uint32_t, uint32_t) + +#define sylvan_bounded_sift(v, limit) CALL(sylvan_bounded_sift, v, limit) + +void sylvan_init_reorder() +{ + if(reorder_db != NULL && reorder_db->is_initialised) return; + reorder_db = reorder_db_init(); +} + +void sylvan_quit_reorder() +{ + if(!reorder_db->is_initialised) return; + reorder_db_deinit(reorder_db); +} + +void sylvan_set_reorder_nodes_threshold(uint32_t threshold) +{ + if(!reorder_db->is_initialised) return; + assert(threshold > 0); + reorder_db->config.threshold = threshold; +} + +void sylvan_set_reorder_maxgrowth(float max_growth) +{ + if(!reorder_db->is_initialised) return; + assert(max_growth > 1.0f); + reorder_db->config.max_growth = max_growth; +} + +void sylvan_set_reorder_maxswap(uint32_t max_swap) +{ + if(!reorder_db->is_initialised) return; + assert(max_swap > 1); + reorder_db->config.max_swap = max_swap; +} + +void sylvan_set_reorder_maxvar(uint32_t max_var) +{ + if(!reorder_db->is_initialised) return; + assert(max_var > 1); + reorder_db->config.max_var = max_var; +} + +void sylvan_set_reorder_timelimit_min(double time_limit) +{ + if(!reorder_db->is_initialised) return; + assert(time_limit > 0); + sylvan_set_reorder_timelimit_sec(time_limit * 60); +} + +void sylvan_set_reorder_timelimit_sec(double time_limit) +{ + if(!reorder_db->is_initialised) return; + assert(time_limit > 0); + sylvan_set_reorder_timelimit_ms(time_limit * 1000); +} + +void sylvan_set_reorder_timelimit_ms(double time_limit) +{ + if(!reorder_db->is_initialised) return; + assert(time_limit > 0); + reorder_db->config.time_limit_ms = time_limit; +} + +void sylvan_set_reorder_verbose(int is_verbose) +{ + 
if(!reorder_db->is_initialised) return; + assert(is_verbose >= 0); + reorder_db->config.print_stat = is_verbose; +} + +void sylvan_set_reorder_type(reordering_type_t type) +{ + if(!reorder_db->is_initialised) return; + reorder_db->config.type = type; +} + +void sylvan_set_reorder_print(bool is_on) +{ + reorder_db->config.print_stat = is_on; +} + +/** + * Reorder the variables so that permutation[level] ends up at position level. + * Returns SYLVAN_REORDER_NOT_INITIALISED when reordering was never set up, + * SYLVAN_REORDER_SUCCESS when the requested order is reached (or already in place), + * otherwise the result of the first failing sylvan_varswap. + */ +TASK_IMPL_1(reorder_result_t, sylvan_reorder_perm, const uint32_t*, permutation) +{ + // bail out before touching any reordering state if the module was never initialised + if (reorder_db == NULL || !reorder_db->is_initialised) return SYLVAN_REORDER_NOT_INITIALISED; + reorder_result_t res = SYLVAN_REORDER_SUCCESS; + int is_identity = 1; + + // check if permutation is identity + for (size_t level = 0; level < reorder_db->levels.count; level++) { + if (permutation[level] != reorder_db->levels.level_to_order[level]) { + is_identity = 0; + break; + } + } + // nothing to do: return before sylvan_pre_reorder so that no post-reorder cleanup is owed + if (is_identity) return res; + + sylvan_pre_reorder(SYLVAN_REORDER_SIFT); + + for (size_t level = 0; level < reorder_db->levels.count; ++level) { + uint32_t var = permutation[level]; + uint32_t pos = levels_order_to_level(&reorder_db->levels, var); + for (; pos < level; pos++) { + res = sylvan_varswap(pos); + if (!sylvan_reorder_issuccess(res)) break; + } + for (; pos > level; pos--) { + res = sylvan_varswap(pos - 1); + if (!sylvan_reorder_issuccess(res)) break; + } + if (!sylvan_reorder_issuccess(res)) break; + } + + // every sylvan_pre_reorder is paired with sylvan_post_reorder, also when a swap failed + sylvan_post_reorder(); + return res; +} + +void sylvan_test_reduce_heap() +{ + if (reorder_db == NULL || reorder_db->is_initialised == false) return; + if (llmsset_count_marked(nodes) >= reorder_db->config.size_threshold && reorder_db->call_count < SYLVAN_REORDER_LIMIT) { + sylvan_reorder_stop_world(reorder_db->config.type); + } +} + +void sylvan_reduce_heap(reordering_type_t type) +{ + if (reorder_db == NULL || reorder_db->is_initialised == false) return; + sylvan_reorder_stop_world(type); +} + +/** + * This variable is used for a cas flag so only + * one reordering runs at one time + */ +static _Atomic (int) re; + +
+VOID_TASK_IMPL_1(sylvan_reorder_stop_world, reordering_type_t, type) +{ + reorder_result_t result = SYLVAN_REORDER_SUCCESS; + if (!reorder_db->is_initialised) result = SYLVAN_REORDER_NOT_INITIALISED; + if (reorder_db->levels.count < 1) result = SYLVAN_REORDER_NO_REGISTERED_VARS; + if (sylvan_reorder_issuccess(result) == 0) { + sylvan_print_reorder_res(result); + return; + } + int zero = 0; + if (atomic_compare_exchange_strong(&re, &zero, 1)) { + sylvan_pre_reorder(type); + switch (type) { + case SYLVAN_REORDER_SIFT: + result = NEWFRAME(sylvan_sift, 0, 0); + break; + case SYLVAN_REORDER_BOUNDED_SIFT: + result = NEWFRAME(sylvan_bounded_sift, 0, 0); + break; + } + re = 0; + sylvan_post_reorder(); + if (sylvan_reorder_issuccess(result) == 0) { + sylvan_print_reorder_res(result); + } + } else { + /* wait for new frame to appear */ + while (atomic_load_explicit(&lace_newframe.t, memory_order_relaxed) == 0) {} + lace_yield(__lace_worker, __lace_dq_head); + } +} + +TASK_IMPL_2(reorder_result_t, sylvan_sift, uint32_t, low, uint32_t, high) +{ + // if high == 0, then we sift all variables + if (high == 0) high = reorder_db->levels.count - 1; + + // count all variable levels (parallel...) 
+ size_t level_counts[reorder_db->levels.count]; + for (size_t i = 0; i < reorder_db->levels.count; i++) { + level_counts[i] = mrc_var_nnodes_get(&reorder_db->mrc, reorder_db->levels.level_to_order[i]); + } + // mark and sort variable levels based on the threshold + int ordered_levels[reorder_db->levels.count]; + levels_mark_threshold(&reorder_db->levels, ordered_levels, level_counts, reorder_db->config.threshold); + levels_gnome_sort(&reorder_db->levels, ordered_levels, level_counts); + + reorder_result_t res = SYLVAN_REORDER_SUCCESS; + + size_t cursize = get_nodes_count(); + + for (int i = 0; i < (int) reorder_db->levels.count; i++) { + int lvl = ordered_levels[i]; + if (lvl < 0) break; // done + size_t pos = reorder_db->levels.level_to_order[lvl]; + + size_t bestpos = pos; + size_t bestsize = cursize; + + if (pos < low || pos > high) continue; + + reorder_db->config.varswap_count = 0; + + if ((pos - low) > (high - pos)) { + // we are in the lower half of the levels, so sift down first and then up + // sifting down + for (; pos < high; pos++) { + res = sylvan_varswap(pos); + if (sylvan_reorder_issuccess(res) == 0) break; + cursize = get_nodes_count(); + reorder_db->config.varswap_count++; + if (should_terminate_sifting(&reorder_db->config)) break; + if ((double) cursize > (double) bestsize * reorder_db->config.max_growth) { + pos++; + break; + } + if (cursize < bestsize) { + bestsize = cursize; + bestpos = pos; + } + } + if (sylvan_reorder_issuccess(res)) { + // sifting up + for (; pos > low; pos--) { + res = sylvan_varswap(pos - 1); + if (sylvan_reorder_issuccess(res) == 0) break; + cursize = get_nodes_count(); + reorder_db->config.varswap_count++; + if (should_terminate_sifting(&reorder_db->config)) break; + if ((double) cursize > (double) bestsize * reorder_db->config.max_growth) { + pos--; + break; + } + if (cursize < bestsize) { + bestsize = cursize; + bestpos = pos; + } + } + } + } else { + // we are in the upper half of the levels, so sift up first and 
then down + // sifting up + for (; pos > low; pos--) { + res = sylvan_varswap(pos - 1); + if (sylvan_reorder_issuccess(res) == 0) break; + cursize = get_nodes_count(); + reorder_db->config.varswap_count++; + if (should_terminate_sifting(&reorder_db->config)) break; + if ((double) cursize > (double) bestsize * reorder_db->config.max_growth) { + pos--; + break; + } + if (cursize < bestsize) { + bestsize = cursize; + bestpos = pos; + } + + } + if (sylvan_reorder_issuccess(res)) { + // sifting down + for (; pos < high; pos++) { + res = sylvan_varswap(pos); + if (sylvan_reorder_issuccess(res) == 0) break; + cursize = get_nodes_count(); + reorder_db->config.varswap_count++; + if (should_terminate_sifting(&reorder_db->config)) break; + if ((double) cursize > (double) bestsize * reorder_db->config.max_growth) { + pos++; + break; + } + if (cursize < bestsize) { + bestsize = cursize; + bestpos = pos; + } + } + } + } + reorder_result_t old_res = res; + + // optimum variable position restoration + for (; pos < bestpos; pos++) { + res = sylvan_varswap(pos); + if (sylvan_reorder_issuccess(res) == 0) break; + reorder_db->config.varswap_count++; + } + for (; pos > bestpos; pos--) { + res = sylvan_varswap(pos - 1); + if (sylvan_reorder_issuccess(res) == 0) break; + reorder_db->config.varswap_count++; + } + + cursize = get_nodes_count(); + + if (!sylvan_reorder_issuccess(res) || !sylvan_reorder_issuccess(old_res)) break; + reorder_db->config.total_num_var++; + + // if we managed to reduce size call progress hooks + if (bestsize < cursize) { + reorder_db_call_progress_hooks(); + } + + if (should_terminate_reordering(&reorder_db->config)) break; + } + + return res; +} + +TASK_IMPL_2(reorder_result_t, sylvan_bounded_sift, uint32_t, low, uint32_t, high) +{ + // if high == 0, then we sift all variables + if (high == 0) high = reorder_db->levels.count - 1; + + // count all variable levels + size_t level_counts[reorder_db->levels.count]; + for (size_t i = 0; i < reorder_db->levels.count; 
i++) { + level_counts[i] = mrc_var_nnodes_get(&reorder_db->mrc, reorder_db->levels.level_to_order[i]); + } + // mark and sort variable levels based on the threshold + int ordered_levels[reorder_db->levels.count]; + levels_mark_threshold(&reorder_db->levels, ordered_levels, level_counts, reorder_db->config.threshold); + levels_gnome_sort(&reorder_db->levels, ordered_levels, level_counts); + + // remember the order of the levels, since it will change during the sifting + uint32_t level_to_order[reorder_db->levels.count]; + for (size_t i = 0; i < reorder_db->levels.count; i++) { + level_to_order[i] = reorder_db->levels.level_to_order[i]; + } + + reorder_result_t res = SYLVAN_REORDER_SUCCESS; + sifting_state_t s_state; + + s_state.pos = 0; + s_state.best_pos = 0; + s_state.size = (int) get_nodes_count(); + s_state.best_size = s_state.size; + s_state.low = low; + s_state.high = high; + +#if STATS + printf("\n"); + interact_print(&reorder_db->matrix); + + for (size_t i = 0; i < levels_count_get(&reorder_db->levels); i++) { + int lvl = ordered_levels[i]; + printf("level %d \t has %zu nodes\n", lvl, level_counts[lvl]); + } + printf("\n"); +#endif + + for (int i = 0; i < (int) reorder_db->levels.count; i++) { + int lvl = ordered_levels[i]; + if (lvl == -1) break; + s_state.pos = reorder_db->levels.order_to_level[level_to_order[lvl]]; + if (s_state.pos < s_state.low || s_state.pos > s_state.high) continue; + + reorder_db->config.varswap_count = 0; + + s_state.best_pos = s_state.pos; + s_state.best_size = s_state.size; +#if STATS + printf("sifting level %d with pos %d\n", s_state.pos, lvl); +#endif + if (s_state.pos == s_state.low) { + res = sylvan_siftdown(&s_state); + if (!sylvan_reorder_issuccess(res)) goto siftingFailed; + // at this point pos --> high unless bounding occurred. + // move backward and stop at best position. 
+ res = sylvan_siftback(&s_state); + if (!sylvan_reorder_issuccess(res)) goto siftingFailed; + } else if (s_state.pos == s_state.high) { + res = sylvan_siftup(&s_state); + if (!sylvan_reorder_issuccess(res)) goto siftingFailed; + // at this point pos --> low unless bounding occurred. + // move backward and stop at best position. + res = sylvan_siftback(&s_state); + if (!sylvan_reorder_issuccess(res)) goto siftingFailed; + } else if ((s_state.pos - s_state.low) > (s_state.high - s_state.pos)) { + // we are in the lower half, so sift down first and then up + res = sylvan_siftdown(&s_state); + if (!sylvan_reorder_issuccess(res)) goto siftingFailed; + res = sylvan_siftup(&s_state); + if (!sylvan_reorder_issuccess(res)) goto siftingFailed; + res = sylvan_siftback(&s_state); + if (!sylvan_reorder_issuccess(res)) goto siftingFailed; + } else { + // we are in the upper half, so sift up first and then down + res = sylvan_siftup(&s_state); + if (!sylvan_reorder_issuccess(res)) goto siftingFailed; + res = sylvan_siftdown(&s_state); + if (!sylvan_reorder_issuccess(res)) goto siftingFailed; + res = sylvan_siftback(&s_state); + if (!sylvan_reorder_issuccess(res)) goto siftingFailed; + } + + if (should_terminate_reordering(&reorder_db->config)) break; + + // if we managed to reduce size call progress hooks + if (s_state.best_size < s_state.size) { + reorder_db_call_progress_hooks(); + } + + reorder_db->config.total_num_var++; + +#if STATS + if (i > 1) exit(1); +#endif + roaring_bitmap_run_optimize(reorder_db->mrc.node_ids); + + continue; + + siftingFailed: +#if INFO + sylvan_print_reorder_res(res); +#endif + if (res == SYLVAN_REORDER_P2_CREATE_FAIL || res == SYLVAN_REORDER_P3_CLEAR_FAIL || + res == SYLVAN_REORDER_NOT_ENOUGH_MEMORY) { + + sylvan_post_reorder(); + sylvan_gc(); + sylvan_pre_reorder(SYLVAN_REORDER_BOUNDED_SIFT); + + return sylvan_bounded_sift(low, high); + } else { + return res; + } + } + + return res; +} \ No newline at end of file diff --git a/src/sylvan_reorder.h 
b/src/sylvan_reorder.h new file mode 100644 index 00000000..5770c599 --- /dev/null +++ b/src/sylvan_reorder.h @@ -0,0 +1,196 @@ +#ifndef SYLVAN_VAR_REORDER_H +#define SYLVAN_VAR_REORDER_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** + * @brief Callback type + */ +LACE_TYPEDEF_CB(void, re_hook_cb); + +/** + @brief Type of reordering algorithm. +*/ +typedef enum { + SYLVAN_REORDER_SIFT, + SYLVAN_REORDER_BOUNDED_SIFT, +} reordering_type_t; + +typedef int (*re_term_cb)(); + +typedef enum reorder_result { + /// the operation was aborted and rolled back + SYLVAN_REORDER_ROLLBACK = 1, + /// success + SYLVAN_REORDER_SUCCESS = 0, + //// cannot clear in phase 0, no marked nodes remaining + SYLVAN_REORDER_P0_CLEAR_FAIL = -1, + //// cannot rehash in phase 1, no marked nodes remaining + SYLVAN_REORDER_P1_REHASH_FAIL = -2, + /// cannot rehash in phase 1, and marked nodes remaining + SYLVAN_REORDER_P1_REHASH_FAIL_MARKED = -3, + /// cannot rehash in phase 2, no marked nodes remaining + SYLVAN_REORDER_P2_REHASH_FAIL = -4, + /// cannot create node in phase 2 (ergo marked nodes remaining) + SYLVAN_REORDER_P2_CREATE_FAIL = -5, + /// cannot create mapnode in phase 2 (ergo marked nodes remaining) + SYLVAN_REORDER_P2_MAPNODE_CREATE_FAIL = -6, + /// cannot rehash and cannot create node in phase 2 + SYLVAN_REORDER_P2_REHASH_AND_CREATE_FAIL = -7, + //// cannot rehash in phase 3, maybe there are marked nodes remaining + SYLVAN_REORDER_P3_REHASH_FAIL = -8, + //// cannot clear in phase 3, maybe there are marked nodes remaining + SYLVAN_REORDER_P3_CLEAR_FAIL = -9, + /// the operation failed fast because there are no registered variables + SYLVAN_REORDER_NO_REGISTERED_VARS = -10, + /// the operation failed fast because the varswap was not initialised + SYLVAN_REORDER_NOT_INITIALISED = -11, + /// the operation failed fast because the varswap was already running + SYLVAN_REORDER_ALREADY_RUNNING = -12, + /// the operation did not even start because there was not 
enough memory + SYLVAN_REORDER_NOT_ENOUGH_MEMORY = -13, +} reorder_result_t; + +/** + * @brief Provide description for given result. + * + * @details Requires buffer with length at least equal to 100 + * + * @param tag + * @param result based on which the description is determined + * @param buf buffer into which the description will be copied + * @param buf_len + */ +void sylvan_reorder_resdescription(reorder_result_t result, char *buf, size_t buf_len); + +/** + * @brief Add a hook that is called before dynamic variable reordering begins. + */ +void sylvan_re_hook_prere(re_hook_cb callback); + +/** + * @brief Add a hook that is called after dynamic variable reordering is finished. + */ +void sylvan_re_hook_postre(re_hook_cb callback); + +/** + * @brief Add a hook that is called after dynamic variable reordering managed to reduce number of nodes. + */ +void sylvan_re_hook_progre(re_hook_cb callback); + +/** + * @brief Add a hook that is called regularly to see whether sifting should terminate. + */ +void sylvan_re_hook_termre(re_term_cb callback); + +/** + * @brief Initialize the dynamic variable reordering. + */ +void sylvan_init_reorder(void); + +/** + * @brief Quit the dynamic variable reordering. + */ +void sylvan_quit_reorder(void); + + +/** + * @brief Set threshold for the number of nodes per level to consider during the reordering. + * @details If the number of nodes per level is less than the threshold, the level is skipped during the reordering. + * @param threshold The threshold for the number of nodes per level. +*/ +void sylvan_set_reorder_nodes_threshold(uint32_t threshold); + +/** + * @brief Set the maximum growth coefficient. + * @details The maximum growth coefficient is used to calculate the maximum growth of the number of nodes during the reordering. + * If the number of nodes grows more than the maximum growth coefficient , sift up/down is terminated. + * @param max_growth The maximum growth coefficient. 
+*/ +void sylvan_set_reorder_maxgrowth(float max_growth); + +/** + * @brief Set the maximum number of swaps per sifting. + * @param max_swap The maximum number of swaps per sifting. +*/ +void sylvan_set_reorder_maxswap(uint32_t max_swap); + +/** + * @brief Set the maximum number of vars swapped per sifting. + * @param max_var The maximum number of vars swapped per sifting. + */ +void sylvan_set_reorder_maxvar(uint32_t max_var); + +/** + * @brief Set the time limit in minutes for the reordering. + * @param time_limit The time limit for the reordering. + */ +void sylvan_set_reorder_timelimit_min(double time_limit); + +/** + * @brief Set the time limit in seconds for the reordering. + * @param time_limit The time limit for the reordering. + */ +void sylvan_set_reorder_timelimit_sec(double time_limit); + +/** + * @brief Set the time limit in milliseconds for the reordering. + * @param time_limit The time limit for the reordering. + */ +void sylvan_set_reorder_timelimit_ms(double time_limit); + +/** + * @brief Set the flag to print the progress of the reordering. + * @param is_verbose The flag to print the progress of the reordering. + */ +void sylvan_set_reorder_verbose(int is_verbose); + +/** + * @brief Set the reordering algorithm used by the dynamic variable reordering. + * @param type The reordering algorithm (SYLVAN_REORDER_SIFT or SYLVAN_REORDER_BOUNDED_SIFT). + */ +void sylvan_set_reorder_type(reordering_type_t type); + +void sylvan_set_reorder_print(bool is_on); + +/** + * @brief Reduce the heap size in the entire forest. + * + * @details Implementation of Rudell's sifting algorithm. + * This function performs stop-the-world operation similar to garbage collection. + * It proceeds as follows: + * 1. Order all the variables according to the number of entries in each unique table. + * 2. Sift the variable up and down, remembering each time the total size of the BDD. + * 3. Select the best permutation. + * 4. Repeat 2 and 3 for all variables in given range. 
+ * + * @sideeffect order of variables is changed, mappings level -> order and order -> level are updated + */ +void sylvan_reduce_heap(reordering_type_t type); + +/** + * @brief Maybe reduce the heap size in the entire forest. + */ +void sylvan_test_reduce_heap(); + +TASK_DECL_1(reorder_result_t, sylvan_reorder_perm, const uint32_t*); +/** + @brief Reorder the variables in the BDDs according to the given permutation. + + @details The permutation is an array of BDD labels, where the i-th element is the label + of the variable that should be moved to position i. The size + of the array should be equal or greater to the number of variables + currently in use. + */ +#define sylvan_reorder_perm(permutation) RUN(sylvan_reorder_perm, permutation) + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif //SYLVAN_VAR_REORDER_H + diff --git a/src/sylvan_reorder_int.c b/src/sylvan_reorder_int.c new file mode 100644 index 00000000..17b94294 --- /dev/null +++ b/src/sylvan_reorder_int.c @@ -0,0 +1,484 @@ +#include + +#include +#include + +#define STATS 0 // useful information w.r.t. dynamic reordering for debugging +#define INFO 0 // useful information w.r.t. 
dynamic reordering + +static inline int is_db_available() +{ + if (reorder_db == NULL) return 0; + return 1; +} + +static double wctime() +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (tv.tv_sec + 1E-6 * tv.tv_usec); +} + +static inline double wctime_sec_elapsed(double t_start) +{ + return wctime() - t_start; +} + +static inline double wctime_ms_elapsed(double start) +{ + return wctime_sec_elapsed(start) * 1000; +} + +reorder_db_t reorder_db_init() +{ + if (reorder_db != NULL && reorder_db->is_initialised) return reorder_db; + + reorder_db_t db = (reorder_db_t) malloc(sizeof(struct reorder_db_s)); + if (db == NULL) { + fprintf(stderr, "reorder_db_init: Unable to allocate memory: %s!\n", strerror(errno)); + exit(1); + } + + db->mrc = (mrc_t) { + .node_ids = roaring_bitmap_create(), + .nnodes = 0, + .ref_nodes = (atomic_counters32_t) { + .container = NULL, + .size = 0, + }, + .var_nnodes = (atomic_counters32_t) { + .container = NULL, + .size = 0, + }, + .ext_ref_nodes = (atomic_bitmap_t) { + .container = NULL, + .size = 0, + }, + }; + + db->call_count = 0; + + db->matrix = (interact_t) { + .container = NULL, + .size = 0, + }; + + db->levels = (levels_t) { + .table = NULL, + .count = 0, + .level_to_order = NULL, + .order_to_level = NULL, + }; + + + db->is_initialised = 1; + db->config = (reorder_config_t) {}; + db->is_reordering = false; + + reorder_set_default_config(&db->config); + + sylvan_register_quit(&sylvan_quit_reorder); + levels_gc_add_mark_managed_refs(); + + return db; +} + +/** + * Tear down the reorder database: clears the initialised flag, releases the + * reference counters and the interaction matrix, resets the levels and frees the database. + * Does nothing when self is not marked as initialised. + */ +void reorder_db_deinit(reorder_db_t self) +{ + if (!self->is_initialised) return; + self->is_initialised = 0; + if (is_db_available() == 0) return; + mrc_deinit(&self->mrc); + interact_deinit(&self->matrix); + // reset the levels while the database is still alive; it used to be read after free() + levels_reset(&self->levels); + // NOTE(review): this frees the global reorder_db (not self) and leaves the global pointer dangling - confirm intent + free(reorder_db); +} + +static re_hook_entry_t prere_list; +static re_hook_entry_t postre_list; +static re_hook_entry_t progre_list; +static re_term_entry_t termre_list; + +void sylvan_re_hook_prere(re_hook_cb callback) +{ +
re_hook_entry_t e = (re_hook_entry_t) malloc(sizeof(struct re_hook_entry)); + e->cb = callback; + e->next = prere_list; + prere_list = e; +} + +void sylvan_re_hook_postre(re_hook_cb callback) +{ + re_hook_entry_t e = (re_hook_entry_t) malloc(sizeof(struct re_hook_entry)); + e->cb = callback; + e->next = postre_list; + postre_list = e; +} + +void sylvan_re_hook_progre(re_hook_cb callback) +{ + re_hook_entry_t e = (re_hook_entry_t) malloc(sizeof(struct re_hook_entry)); + e->cb = callback; + e->next = progre_list; + progre_list = e; +} + +void sylvan_re_hook_termre(re_term_cb callback) +{ + re_term_entry_t e = (re_term_entry_t) malloc(sizeof(struct re_term_entry)); + e->cb = callback; + e->next = termre_list; + termre_list = e; +} + +VOID_TASK_IMPL_0(reorder_db_call_progress_hooks) +{ + for (re_hook_entry_t e = progre_list; e != NULL; e = e->next) { + WRAP(e->cb); + } +} + +inline uint64_t get_nodes_count() +{ + return mrc_nnodes_get(&reorder_db->mrc) + 2; +} + +TASK_IMPL_1(reorder_result_t, sylvan_siftdown, sifting_state_t *, s_state) +{ + if (!reorder_db->is_initialised) return SYLVAN_REORDER_NOT_INITIALISED; + reorder_result_t res; + int R; // upper bound on node decrease + int limitSize; + BDDVAR xIndex; + BDDVAR yIndex; + BDDVAR x; + BDDVAR y; + + s_state->size = (int) get_nodes_count(); + xIndex = reorder_db->levels.level_to_order[s_state->pos]; + + limitSize = s_state->size; + R = 0; + // Initialize the upper bound + for (y = s_state->high; y > s_state->pos; y--) { + yIndex = reorder_db->levels.level_to_order[y]; + if (interact_test(&reorder_db->matrix, xIndex, yIndex)) { + R += (int) mrc_var_nnodes_get(&reorder_db->mrc, y); + } + } + + for (; s_state->pos < s_state->high && s_state->size - R < limitSize; ++s_state->pos) { + x = s_state->pos; + y = s_state->pos + 1; + // Update the upper bound on node decrease + yIndex = reorder_db->levels.level_to_order[y]; + if (interact_test(&reorder_db->matrix, xIndex, yIndex)) { + R -= (int) 
mrc_var_nnodes_get(&reorder_db->mrc, y); + } + res = sylvan_varswap(x); + s_state->size = (int) get_nodes_count(); + if (!sylvan_reorder_issuccess(res)) return res; + reorder_db->config.varswap_count++; + + // check the max allowed size growth + if ((double) (s_state->size) > (double) s_state->best_size * reorder_db->config.max_growth) { + ++s_state->pos; + break; + } + + // update best position + if (s_state->size <= s_state->best_size) { + s_state->best_size = s_state->size; + s_state->best_pos = s_state->pos; + } + + if (s_state->size < limitSize) limitSize = s_state->size; + if (should_terminate_sifting(&reorder_db->config)) break; + } + + if (s_state->size <= s_state->best_size) { + s_state->best_size = s_state->size; + s_state->best_pos = s_state->pos; + } + + return SYLVAN_REORDER_SUCCESS; +} + +TASK_IMPL_1(reorder_result_t, sylvan_siftup, sifting_state_t *, s_state) +{ + if (!reorder_db->is_initialised) return SYLVAN_REORDER_NOT_INITIALISED; + reorder_result_t res; + int L; // lower bound on DD size + int limitSize; + BDDVAR xIndex; + BDDVAR yIndex; + BDDVAR x; + BDDVAR y; + + s_state->size = (int) get_nodes_count(); + yIndex = reorder_db->levels.level_to_order[s_state->pos]; + + limitSize = L = s_state->size; + for (x = s_state->low + 1; x < s_state->pos; x++) { + xIndex = reorder_db->levels.level_to_order[x]; + if (interact_test(&reorder_db->matrix, xIndex, yIndex)) { + L -= (int) mrc_var_nnodes_get(&reorder_db->mrc, x); + } + } + y = s_state->pos; + L -= (int) mrc_var_nnodes_get(&reorder_db->mrc, y); + + for (; s_state->pos > s_state->low && L <= limitSize; --s_state->pos) { + x = s_state->pos - 1; + y = s_state->pos; + xIndex = reorder_db->levels.level_to_order[x]; + res = sylvan_varswap(x); + if (!sylvan_reorder_issuccess(res)) return res; + s_state->size = (int) get_nodes_count(); + reorder_db->config.varswap_count++; + // check the max allowed size growth + if ((double) (s_state->size) > (double) s_state->best_size * reorder_db->config.max_growth) { 
+ --s_state->pos; + break; + } + // update the best position + if (s_state->size <= s_state->best_size) { + s_state->best_size = s_state->size; + s_state->best_pos = s_state->pos; + } + // Update the lower bound on DD size + if (interact_test(&reorder_db->matrix, xIndex, yIndex)) { + L += (int) mrc_var_nnodes_get(&reorder_db->mrc, y); + } + if ((int) s_state->size < limitSize) limitSize = (int) s_state->size; + if (should_terminate_sifting(&reorder_db->config)) break; + } + if (s_state->size <= s_state->best_size) { + s_state->best_size = s_state->size; + s_state->best_pos = s_state->pos; + } + return SYLVAN_REORDER_SUCCESS; +} + +TASK_IMPL_1(reorder_result_t, sylvan_siftback, sifting_state_t *, s_state) +{ + reorder_result_t res = SYLVAN_REORDER_SUCCESS; + if (!reorder_db->is_initialised) return SYLVAN_REORDER_NOT_INITIALISED; + if (s_state->pos == s_state->best_pos) return res; + for (; s_state->pos <= s_state->best_pos; s_state->pos++) { + if (s_state->size == s_state->best_size) return res; + if (s_state->pos == UINT32_MAX) return res; + res = sylvan_varswap(s_state->pos); + s_state->size = (int) get_nodes_count(); + if (!sylvan_reorder_issuccess(res)) return res; + reorder_db->config.varswap_count++; + } + for (; s_state->pos >= s_state->best_pos; s_state->pos--) { + if (s_state->pos == 0) break; + if (s_state->size == s_state->best_size) return res; + res = sylvan_varswap(s_state->pos - 1); + s_state->size = (int) get_nodes_count(); + if (!sylvan_reorder_issuccess(res)) return res; + reorder_db->config.varswap_count++; + } + return res; +} + +VOID_TASK_IMPL_1(sylvan_pre_reorder, reordering_type_t, type) +{ + reorder_db->is_reordering = true; + reorder_db->config.t_start_sifting = wctime(); + reorder_db->config.total_num_var = 0; + + sylvan_clear_cache(); + mrc_collect_node_ids(&reorder_db->mrc, nodes); + mrc_init(&reorder_db->mrc, reorder_db->levels.count, nodes->table_size); + interact_init(&reorder_db->matrix, &reorder_db->levels, &reorder_db->mrc, 
reorder_db->levels.count, nodes->table_size); + + if (reorder_db->config.print_stat == true) { + char buff[100]; + sylvan_reorder_type_description(type, buff, 100); + printf("BDD reordering with %s: from %zu to ... ", buff, llmsset_count_marked(nodes)); + } + + reorder_db->call_count++; + + sylvan_stats_count(SYLVAN_RE_COUNT); + sylvan_timer_start(SYLVAN_RE); + + for (re_hook_entry_t e = prere_list; e != NULL; e = e->next) { + WRAP(e->cb); + } +} + +VOID_TASK_IMPL_0(sylvan_post_reorder) +{ + size_t after_size = get_nodes_count() - 2; + + // new size threshold for next reordering is double the size of non-terminal nodes + the terminal nodes + size_t new_size_threshold = (after_size + 1) * SYLVAN_REORDER_SIZE_RATIO; + + if (reorder_db->call_count < SYLVAN_REORDER_LIMIT || new_size_threshold > reorder_db->config.size_threshold) { + reorder_db->config.size_threshold = new_size_threshold; + } else { + reorder_db->config.size_threshold += SYLVAN_REORDER_LIMIT; + } + + mrc_deinit(&reorder_db->mrc); + interact_deinit(&reorder_db->matrix); + + if (reorder_db->config.print_stat == true) { + double end = wctime() - reorder_db->config.t_start_sifting; + printf("%zu nodes in %f sec\n", after_size, end); + } + + for (re_hook_entry_t e = postre_list; e != NULL; e = e->next) { + WRAP(e->cb); + } + + sylvan_timer_stop(SYLVAN_RE); + + reorder_db->is_reordering = false; +} + +void sylvan_reorder_resdescription(reorder_result_t result, char *buf, size_t buf_len) +{ + (void) buf_len; + assert(buf_len >= 100); + switch (result) { + case SYLVAN_REORDER_ROLLBACK: + sprintf(buf, "SYLVAN_REORDER: the operation was aborted and rolled back (%d)", result); + break; + case SYLVAN_REORDER_SUCCESS: + sprintf(buf, "SYLVAN_REORDER: success (%d)", result); + break; + case SYLVAN_REORDER_P0_CLEAR_FAIL: + sprintf(buf, "SYLVAN_REORDER: cannot rehash in phase 0, no marked nodes remaining (%d)", result); + break; + case SYLVAN_REORDER_P1_REHASH_FAIL: + sprintf(buf, "SYLVAN_REORDER: cannot rehash in phase 
1, no marked nodes remaining (%d)", result); + break; + case SYLVAN_REORDER_P1_REHASH_FAIL_MARKED: + sprintf(buf, "SYLVAN_REORDER: cannot rehash in phase 1, marked nodes remaining (%d)", result); + break; + case SYLVAN_REORDER_P2_REHASH_FAIL: + sprintf(buf, "SYLVAN_REORDER: cannot rehash in phase 2, no marked nodes remaining (%d)", result); + break; + case SYLVAN_REORDER_P2_CREATE_FAIL: + sprintf(buf, "SYLVAN_REORDER: cannot create node in phase 2, marked nodes remaining (%d)", result); + break; + case SYLVAN_REORDER_P2_MAPNODE_CREATE_FAIL: + sprintf(buf, "SYLVAN_REORDER: cannot create mapnode in phase 2, marked nodes remaining (%d)", result); + break; + case SYLVAN_REORDER_P2_REHASH_AND_CREATE_FAIL: + sprintf(buf, "SYLVAN_REORDER: cannot rehash and cannot create node in phase 2 (%d)", result); + break; + case SYLVAN_REORDER_P3_REHASH_FAIL: + sprintf(buf, "SYLVAN_REORDER: cannot rehash in phase 3, maybe there are marked nodes remaining (%d)", + result); + break; + case SYLVAN_REORDER_P3_CLEAR_FAIL: + sprintf(buf, "SYLVAN_REORDER: cannot clear in phase 3, maybe there are marked nodes remaining (%d)", + result); + break; + case SYLVAN_REORDER_NO_REGISTERED_VARS: + sprintf(buf, "SYLVAN_REORDER: the operation failed fast because there are no registered variables (%d)", + result); + break; + case SYLVAN_REORDER_NOT_INITIALISED: + sprintf(buf, "SYLVAN_REORDER: please make sure you first initialize reordering (%d)", result); + break; + case SYLVAN_REORDER_ALREADY_RUNNING: + sprintf(buf, "SYLVAN_REORDER: cannot start reordering when it is already running (%d)", result); + break; + case SYLVAN_REORDER_NOT_ENOUGH_MEMORY: + sprintf(buf, "SYLVAN_REORDER: not enough memory (%d)", result); + break; + default: + sprintf(buf, "SYLVAN_REORDER: UNKNOWN ERROR (%d)", result); + break; + } +} + +void sylvan_print_reorder_res(reorder_result_t result) +{ + char buff[100]; + sylvan_reorder_resdescription(result, buff, 100); + if (!sylvan_reorder_issuccess(result)) fprintf(stderr, "%s\n", 
buff);
+    else fprintf(stdout, "%s\n", buff);
+}
+
+/**
+ * @brief Write a short human-readable name of a reordering algorithm into buf.
+ *
+ * SYLVAN_REORDER_BOUNDED_SIFT is described as "sifting" and SYLVAN_REORDER_SIFT as
+ * "unbounded sifting". Any other value is described as "unknown", so buf always holds
+ * a terminated string on return instead of being left untouched.
+ *
+ * @param type    reordering algorithm to describe
+ * @param buf     destination buffer; the call is a no-op when it is NULL
+ * @param buf_len capacity of buf in bytes; the text is truncated to fit
+ */
+void sylvan_reorder_type_description(reordering_type_t type, char *buf, size_t buf_len)
+{
+    if (buf == NULL || buf_len == 0) return;
+
+    const char *name;
+    switch (type) {
+        case SYLVAN_REORDER_BOUNDED_SIFT:
+            name = "sifting";
+            break;
+        case SYLVAN_REORDER_SIFT:
+            name = "unbounded sifting";
+            break;
+        default:
+            name = "unknown";
+            break;
+    }
+    // bounded write: honours buf_len instead of assuming a buffer of at least 100 bytes
+    snprintf(buf, buf_len, "%s", name);
+}
+
+/**
+ * @brief Decide whether the current sifting pass has to stop.
+ *
+ * Checks, in this order:
+ *  1. every callback registered in termre_list; the first one returning non-zero stops the pass,
+ *  2. the swap budget: varswap_count > max_swap,
+ *  3. the time budget: the value of wctime_ms_elapsed(t_start_sifting) > time_limit_ms,
+ *     which is only honoured when t_start_sifting is non-zero.
+ *
+ * @param reorder_config configuration and counters of the running reordering
+ * @return 1 when sifting must terminate, 0 otherwise
+ */
+int should_terminate_sifting(const struct reorder_config *reorder_config)
+{
+    for (re_term_entry_t e = termre_list; e != NULL; e = e->next) {
+        if (e->cb()) {
+#if INFO
+            printf("sifting exit: termination_cb\n");
+#endif
+            return 1;
+        }
+    }
+    if (reorder_config->varswap_count > reorder_config->max_swap) {
+#if INFO
+        printf("sifting exit: reached %u from the total_num_swap %u\n",
+               reorder_config->varswap_count,
+               reorder_config->max_swap);
+#endif
+        return 1;
+    }
+
+    // presumably elapsed wall-clock milliseconds since t_start_sifting — confirm against wctime_ms_elapsed
+    double t_elapsed = wctime_ms_elapsed(reorder_config->t_start_sifting);
+    if (t_elapsed > reorder_config->time_limit_ms && reorder_config->t_start_sifting != 0) {
+#if INFO
+        printf("sifting exit: reached %fms from the time_limit %.2fms\n",
+               t_elapsed,
+               reorder_config->time_limit_ms);
+#endif
+        return 1;
+    }
+    return 0;
+}
+
+/**
+ * @brief Decide whether the whole reordering run has to stop.
+ *
+ * Same structure as should_terminate_sifting, but the counter that is checked is the
+ * number of sifted variables (total_num_var > max_var) instead of the number of swaps.
+ * The time budget is measured from t_start_sifting as well.
+ *
+ * @param reorder_config configuration and counters of the running reordering
+ * @return 1 when reordering must terminate, 0 otherwise
+ */
+int should_terminate_reordering(const struct reorder_config *reorder_config)
+{
+    for (re_term_entry_t e = termre_list; e != NULL; e = e->next) {
+        if (e->cb()) {
+#if INFO
+            printf("reordering exit: termination_cb\n");
+#endif
+            return 1;
+        }
+    }
+
+    if (reorder_config->total_num_var > reorder_config->max_var) {
+#if INFO
+        printf("reordering exit: reached %u from the total_num_var %u\n",
+               reorder_config->total_num_var,
+               reorder_config->max_var);
+#endif
+        return 1;
+    }
+    double t_elapsed = wctime_ms_elapsed(reorder_config->t_start_sifting);
+    if (t_elapsed > reorder_config->time_limit_ms && reorder_config->t_start_sifting != 0) {
+#if INFO
+        printf("reordering exit: reached %fms from the time_limit %.2zums\n",
+               t_elapsed,
+               (size_t) reorder_config->time_limit_ms);
+#endif
+        return 1;
+    }
+    return 0;
+}
\ No newline at end of
file diff --git a/src/sylvan_reorder_int.h b/src/sylvan_reorder_int.h new file mode 100644 index 00000000..8c46ebad --- /dev/null +++ b/src/sylvan_reorder_int.h @@ -0,0 +1,129 @@ +#ifndef SYLVAN_VAR_REORDER_DB_H +#define SYLVAN_VAR_REORDER_DB_H + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +typedef struct re_term_entry +{ + struct re_term_entry *next; + re_term_cb cb; +} *re_term_entry_t; + +typedef struct re_hook_entry +{ + struct re_hook_entry *next; + re_hook_cb cb; +} *re_hook_entry_t; + +typedef struct sifting_state +{ + uint32_t pos; + int size; + uint32_t best_pos; + int best_size; + uint32_t low; + uint32_t high; +} sifting_state_t; + +typedef struct reorder_config +{ + double t_start_sifting; // start time of the sifting + uint32_t threshold; // threshold for number of nodes per level + double max_growth; // coefficient used to calculate maximum growth + uint32_t max_swap; // maximum number of swaps per sifting + uint32_t varswap_count; // number of swaps completed + uint32_t max_var; // maximum number of vars swapped per sifting + uint32_t total_num_var; // number of vars sifted + double time_limit_ms; // time limit in milliseconds + reordering_type_t type; // type of reordering algorithm + bool print_stat; // flag to print the progress of the reordering + size_t size_threshold; // reorder if this size is reached +} reorder_config_t; + +typedef struct reorder_db_s +{ + mrc_t mrc; // reference counters used for the unique table nodes + interact_t matrix; // bitmap used for storing the square variable interaction matrix (use sylvan_interact with it) + levels_t levels; // levels of the unique table nodes + reorder_config_t config; // configuration for the sifting + size_t call_count; // number of reordering calls + bool is_initialised; // is dynamic reordering initialised + bool is_reordering; // is dynamic reordering in progress +} *reorder_db_t; + +reorder_db_t reorder_db_init(); + +void reorder_db_deinit(reorder_db_t self); + +static 
inline void reorder_set_default_config(reorder_config_t *configs)
+{
+    // Fill the tunable fields of *configs from the compile-time SYLVAN_REORDER_* defaults.
+    // NOTE(review): t_start_sifting, varswap_count and total_num_var are not assigned here;
+    // presumably the caller zero-initialises the struct — confirm.
+    configs->threshold = SYLVAN_REORDER_NODES_THRESHOLD;
+    configs->max_growth = SYLVAN_REORDER_GROWTH;
+    configs->max_swap = SYLVAN_REORDER_MAX_SWAPS;
+    configs->max_var = SYLVAN_REORDER_MAX_VAR;
+    configs->time_limit_ms = SYLVAN_REORDER_TIME_LIMIT_MS;
+    configs->type = SYLVAN_REORDER_TYPE_DEFAULT;
+    configs->print_stat = SYLVAN_REORDER_PRINT_STAT;
+    configs->size_threshold = SYLVAN_REORDER_SIZE_THRESHOLD;
+}
+
+/**
+ * @brief Tell whether a reordering result code counts as "not a failure".
+ * @return non-zero for SYLVAN_REORDER_SUCCESS, SYLVAN_REORDER_NOT_INITIALISED and
+ *         SYLVAN_REORDER_ROLLBACK; 0 for every other code.
+ */
+static inline int sylvan_reorder_issuccess(reorder_result_t result)
+{
+    return result == SYLVAN_REORDER_SUCCESS ||
+           result == SYLVAN_REORDER_NOT_INITIALISED ||
+           result == SYLVAN_REORDER_ROLLBACK;
+}
+
+uint64_t get_nodes_count();
+
+
+/**
+ * @brief Sift given variable up from its current level to the target level.
+ * @sideeffect order of variables is changed
+ *
+ * NOTE(review): this brief says "up" but the task declared below is sylvan_siftdown
+ * (and the next brief says "down" for sylvan_siftup); the two descriptions look
+ * swapped — confirm against the implementations.
+ */
+TASK_DECL_1(reorder_result_t, sylvan_siftdown, sifting_state_t*);
+#define sylvan_siftdown(state) CALL(sylvan_siftdown, state)
+
+/**
+ * @brief Sift given variable down from its current level to the target level.
+ * @sideeffect order of variables is changed
+ */
+TASK_DECL_1(reorder_result_t, sylvan_siftup, sifting_state_t*);
+#define sylvan_siftup(state) CALL(sylvan_siftup, state)
+
+/**
+ * @brief Sift a variable to its best level.
+ * @param pos - variable to sift
+ * @param target_pos - target position (w.r.t.
dynamic variable reordering) + */ +TASK_DECL_1(reorder_result_t, sylvan_siftback, sifting_state_t*); +#define sylvan_siftback(state) CALL(sylvan_siftback, state) + +#define sylvan_pre_reorder(type) RUN(sylvan_pre_reorder, type) +VOID_TASK_DECL_1(sylvan_pre_reorder, reordering_type_t) + +#define sylvan_post_reorder() RUN(sylvan_post_reorder) +VOID_TASK_DECL_0(sylvan_post_reorder) + +void sylvan_reorder_resdescription(reorder_result_t result, char *buf, size_t buf_len); + +void sylvan_print_reorder_res(reorder_result_t result); + +void sylvan_reorder_type_description(reordering_type_t type, char *buf, size_t buf_len); + +int should_terminate_sifting(const struct reorder_config *reorder_config); + +int should_terminate_reordering(const struct reorder_config *reorder_config); + +VOID_TASK_DECL_0(reorder_db_call_progress_hooks) + +#define reorder_db_call_progress_hooks() CALL(reorder_db_call_progress_hooks) + + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif //SYLVAN_VAR_REORDER_DB_H \ No newline at end of file diff --git a/src/sylvan_stats.c b/src/sylvan_stats.c index 31e7eb2b..3e74ad70 100644 --- a/src/sylvan_stats.c +++ b/src/sylvan_stats.c @@ -119,6 +119,11 @@ struct {1, SYLVAN_GC_COUNT, "GC executions"}, {3, SYLVAN_GC, "Total time spent"}, + {0, 0, "Variable reordering"}, + {1, SYLVAN_RE_COUNT, "RE executions"}, + {1, SYLVAN_RE_SWAP_COUNT, "RE swaps"}, + {3, SYLVAN_RE, "Total time spent"}, + {-1, -1, NULL}, }; diff --git a/src/sylvan_stats.h b/src/sylvan_stats.h index 1715cde9..44fb8160 100644 --- a/src/sylvan_stats.h +++ b/src/sylvan_stats.h @@ -106,6 +106,8 @@ typedef enum { /* Other counters */ SYLVAN_GC_COUNT, + SYLVAN_RE_COUNT, + SYLVAN_RE_SWAP_COUNT, LLMSSET_LOOKUP, SYLVAN_COUNTER_COUNTER @@ -116,6 +118,7 @@ typedef enum { typedef enum { SYLVAN_GC, + SYLVAN_RE, SYLVAN_TIMER_COUNTER } Sylvan_Timers; diff --git a/src/sylvan_table.h b/src/sylvan_table.h index aef2e2a2..7425f377 100644 --- a/src/sylvan_table.h +++ b/src/sylvan_table.h @@ -123,8 +123,10 
@@ llmsset_set_size(llmsset_t dbs, size_t size) /* Warning: if size is not a power of two, you will get interesting behavior */ dbs->mask = dbs->table_size - 1; #endif +#if SYLVAN_USE_LINEAR_PROBING /* Set threshold: number of cache lines to probe before giving up on node insertion */ dbs->threshold = 192 - 2 * __builtin_clzll(dbs->table_size); +#endif } } @@ -182,6 +184,21 @@ TASK_DECL_1(int, llmsset_rehash, llmsset_t); */ int llmsset_rehash_bucket(const llmsset_t dbs, uint64_t d_idx); +#if !SYLVAN_USE_LINEAR_PROBING +VOID_TASK_DECL_0(llmsset_reset_all_regions) +#define llmsset_reset_all_regions() RUN(llmsset_reset_all_regions) + +/** + * Clear a single bucket (hash part). + */ +int llmsset_clear_one_hash(llmsset_t dbs, uint64_t index); + +/** + * Clear a single bucket (data part). + */ +void llmsset_clear_one_data(llmsset_t dbs, uint64_t index); +#endif + /** * Retrieve number of marked buckets. */ diff --git a/src/sylvan_table_chaining.c b/src/sylvan_table_chaining.c new file mode 100644 index 00000000..7689cd6e --- /dev/null +++ b/src/sylvan_table_chaining.c @@ -0,0 +1,583 @@ +/* + * Copyright 2011-2016 Formal Methods and Tools, University of Twente + * Copyright 2016-2017 Tom van Dijk, Johannes Kepler University Linz + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include
+#include
+
+#include // for errno
+#include // memset
+#include // for mmap
+
+#define MASK_HASH  ((uint64_t)0xffffff0000000000) // 24 bits for the hash
+#define MASK_INDEX ((uint64_t)0x000000ffffffffff) // 40 bits for the index
+
+// Per-worker index of the region this worker currently claims data buckets from;
+// (uint64_t) -1 means "no region".
+DECLARE_THREAD_LOCAL(my_region, uint64_t);
+
+/**
+ * Reset the calling worker's my_region to (uint64_t) -1, i.e. "no region".
+ */
+VOID_TASK_0(llmsset_reset_region)
+{
+    LOCALIZE_THREAD_LOCAL(my_region, uint64_t);
+    my_region = (uint64_t) -1; // no region
+    SET_THREAD_LOCAL(my_region, my_region);
+}
+
+/**
+ * Clear the region-ownership bitmap (bitmap1) of the global `nodes` table and run
+ * llmsset_reset_region through TOGETHER (Lace; presumably once on every worker — confirm).
+ */
+VOID_TASK_IMPL_0(llmsset_reset_all_regions)
+{
+    clear_aligned(nodes->bitmap1, nodes->max_size / (512 * 8));
+    TOGETHER(llmsset_reset_region);
+}
+
+/**
+ * Claim a free data bucket for the calling worker.
+ *
+ * First looks for a zero bit in the 8 bitmap2 words of the worker's current region and
+ * sets it. When the worker has no region, or its region has no free bit, the next
+ * unclaimed region is claimed in bitmap1 with a CAS and the search is repeated.
+ *
+ * @return index of the claimed bucket, or (uint64_t) -1 when all regions were visited
+ *         without finding one that could be claimed
+ */
+static uint64_t
+claim_data_bucket(const llmsset_t dbs)
+{
+    // get my region, based on which worker are we
+    // every worker has a region; a region spans 8 bitmap2 words, i.e. 8 * 64 = 512 buckets
+    LOCALIZE_THREAD_LOCAL(my_region, uint64_t);
+
+    for (;;) {
+        if (my_region != (uint64_t) -1) {
+            // find empty bucket in region
+            _Atomic (uint64_t) *ptr = dbs->bitmap2 + (my_region * 8);
+            int i = 0;
+            // With 64 bytes per cacheline, there are 8 64-bit values per cacheline.
+            for (; i < 8;) {
+                uint64_t v = atomic_load_explicit(ptr, memory_order_relaxed);
+                if (v != 0xffffffffffffffffLL) {
+                    // j = position (counted from the most significant bit) of the first zero bit
+                    int j = __builtin_clzll(~v);
+                    // mark the bucket as used
+                    *ptr |= (0x8000000000000000LL >> j);
+                    // word number within the table * 64 + bit number = bucket index
+                    size_t index = (8 * my_region + i) * 64 + j;
+                    return index;
+                }
+                i++;
+                ptr++;
+            }
+        } else {
+            // special case on startup or after garbage collection
+            // several lines below, my_region will be increased by 1 and the entry claimed.
+            my_region = -1 + (lace_get_worker()->worker * (dbs->table_size / (64 * 8))) / lace_workers();
+        }
+
+        const uint64_t n_regions = (dbs->table_size / (64 * 8));
+        uint64_t count = n_regions;
+        for (;;) {
+            // check if table maybe full
+            if (count-- == 0) {
+                // at this point, we claim the table is full but in reality it just means
+                // that there are no regions left to claim
+                return (uint64_t) -1;
+            }
+            // advance to the next region, wrapping around at the end of the table
+            my_region += 1;
+            if (my_region >= n_regions) my_region = 0;
+            // try to claim the region
+            _Atomic (uint64_t) *ptr = dbs->bitmap1 + (my_region / 64);
+            uint64_t mask = 0x8000000000000000LL >> (my_region & 63);
+            uint64_t v;
+            restart:
+            v = atomic_load_explicit(ptr, memory_order_relaxed);
+            if (v & mask) {
+                continue; // is already taken by some other worker
+            }
+            if (atomic_compare_exchange_weak(ptr, &v, v | mask)) {
+                break; // success! is now claimed by me
+            } else {
+                // the weak CAS failed (word changed or spurious failure): re-read and retry this region
+                goto restart;
+            }
+        }
+        // assign thread local variable to me
+        SET_THREAD_LOCAL(my_region, my_region);
+    }
+}
+
+/**
+ * Mark data bucket `index` as free again by atomically clearing its bit in bitmap2.
+ */
+static void
+release_data_bucket(const llmsset_t dbs, uint64_t index)
+{
+    _Atomic (uint64_t) *ptr = dbs->bitmap2 + (index / 64);
+    uint64_t mask = 0x8000000000000000LL >> (index & 63);
+    atomic_fetch_and(ptr, ~mask);
+}
+
+/**
+ * Set (on != 0) or clear (on == 0) the "custom" bit of bucket `index` in bitmapc.
+ * This is a plain, non-atomic read-modify-write of the 64-bit word that holds the bit.
+ */
+static void
+set_custom_bucket(const llmsset_t dbs, uint64_t index, int on)
+{
+    uint64_t * ptr = dbs->bitmapc + (index / 64);
+    uint64_t mask = 0x8000000000000000LL >> (index & 63);
+    if (on) *ptr |= mask;
+    else *ptr &= ~mask;
+}
+
+/**
+ * Test the "custom" bit of bucket `index` in bitmapc.
+ */
+static int
+is_custom_bucket(const llmsset_t dbs, uint64_t index)
+{
+    uint64_t * ptr = dbs->bitmapc + (index / 64);
+    uint64_t mask = 0x8000000000000000LL >> (index & 63);
+    return (*ptr & mask) ?
1 : 0; +} + +static inline uint64_t +compute_hash(const llmsset_t dbs, uint64_t a, uint64_t b, const int custom) +{ + uint64_t hash = 14695981039346656037LLU; + if (custom) hash = dbs->hash_cb(a, b, hash); + else hash = sylvan_tabhash16(a, b, hash); + return hash; +} + +static inline uint64_t +llmsset_lookup2(const llmsset_t dbs, uint64_t a, uint64_t b, int *created, const int custom) +{ + uint64_t hash = compute_hash(dbs, a, b, custom); + uint64_t masked_hash = hash & MASK_HASH; + + // reminder + // table.a = index of first entry + // table.b = hash of entry + index of next entry + +#if LLMSSET_MASK + _Atomic (uint64_t) *first_ptr = &dbs->table[2 * (hash & dbs->mask)]; +#else + _Atomic(uint64_t)* first_ptr = &dbs->table[2*(hash % dbs->table_size)]; +#endif + + uint64_t first_idx = atomic_load_explicit(first_ptr, memory_order_relaxed); + uint64_t end = 0; + uint64_t bucket_idx = first_idx; + uint64_t claimed_idx = 0; // stores where the new data [will be] stored + + // stop when we encounter + for (;;) { + if (bucket_idx == end) { + // we did not find existing node in our table, and reached end of chain + // or the chain was empty + // thus, try to insert new node + if (claimed_idx == 0) { + // claim data bucket + claimed_idx = claim_data_bucket(dbs); + if (claimed_idx == (uint64_t) -1) return 0; // failed to claim a data bucket + // call custom create callback (might update a and b, but should hash to the same!) 
+ if (custom) dbs->create_cb(&a, &b); + // write the data + uint64_t * dataptr = ((uint64_t *) dbs->data) + 2 * claimed_idx; + dataptr[0] = a; + dataptr[1] = b; + } + // Set item index in the chain + atomic_store_explicit(dbs->table + 2 * claimed_idx + 1, masked_hash | first_idx, memory_order_relaxed); + + // preserve the original first_idx (new "end" in case someone add something) + end = first_idx; + // now update the chain start (first_ptr) with claimed_idx + if (atomic_compare_exchange_strong(first_ptr, &first_idx, claimed_idx)) { + if (custom) set_custom_bucket(dbs, claimed_idx, custom); + *created = 1; + return claimed_idx; + } else { + // first_idx now holds the new value! + bucket_idx = first_idx; // continue with the new first index + } + } + + uint64_t * dataptr = ((uint64_t *) dbs->data) + 2 * bucket_idx; + uint64_t hash_chain = atomic_load_explicit(dbs->table + 2 * bucket_idx + 1, memory_order_relaxed); + + if (masked_hash == (hash_chain & MASK_HASH)) { + // check if we already have this node in the table + // if so, release the ownership if it was owned before, and return the bucket index + if (custom) { + // we do not use custom nodes for this test + if (dbs->equals_cb(a, b, dataptr[0], dataptr[1])) { + if (claimed_idx != 0) { + dbs->destroy_cb(a, b); + release_data_bucket(dbs, claimed_idx); + } + *created = 0; + return bucket_idx; + } + } else { + if (dataptr[0] == a && dataptr[1] == b) { + if (claimed_idx != 0) { + release_data_bucket(dbs, claimed_idx); + } + *created = 0; + return bucket_idx; + } + } + } + + bucket_idx = hash_chain & MASK_INDEX; // next item index in the chain + sylvan_stats_count(LLMSSET_LOOKUP); + } +} + +uint64_t +llmsset_lookup(const llmsset_t dbs, const uint64_t a, const uint64_t b, int *created) +{ + return llmsset_lookup2(dbs, a, b, created, 0); +} + +uint64_t +llmsset_lookupc(const llmsset_t dbs, const uint64_t a, const uint64_t b, int *created) +{ + return llmsset_lookup2(dbs, a, b, created, 1); +} + +int 
+llmsset_rehash_bucket(const llmsset_t dbs, uint64_t d_idx) +{ + // This is like lookup, except we assume that + // - the data index is not duplicate + // - the data is not yet in the table + uint64_t * dataptr = ((uint64_t *) dbs->data) + 2 * d_idx; + uint64_t hash = compute_hash(dbs, dataptr[0], dataptr[1], is_custom_bucket(dbs, d_idx)); + uint64_t masked_hash = hash & MASK_HASH; + +#if LLMSSET_MASK + _Atomic (uint64_t) *first_ptr = &dbs->table[2 * (hash & dbs->mask)]; +#else + _Atomic(uint64_t)* first_ptr = &dbs->table[2*(hash % dbs->table_size)]; +#endif + + uint64_t first_idx = atomic_load_explicit(first_ptr, memory_order_relaxed); + for (;;) { + atomic_store_explicit(dbs->table + 2 * d_idx + 1, masked_hash | first_idx, memory_order_relaxed); + if (atomic_compare_exchange_strong(first_ptr, &first_idx, d_idx)) return 1; + } +} + + +/** + * Clear a single bucket hash. + * (do not run parallel with lookup!!!) + * (for dynamic variable reordering) + * (lock-free, but not wait-free) + */ +int +llmsset_clear_one_hash(const llmsset_t dbs, uint64_t d_idx) +{ + uint64_t * dataptr = ((uint64_t *) dbs->data) + 2 * d_idx; + // compute the hash to find the ``head'' of the chain + uint64_t hash = compute_hash(dbs, dataptr[0], dataptr[1], is_custom_bucket(dbs, d_idx)); + +#if LLMSSET_MASK + _Atomic (uint64_t) *first_ptr = dbs->table + 2 * (hash & dbs->mask); +#else + _Atomic(uint64_t)* first_ptr = dbs->table + 2 * (hash % dbs->table_size); +#endif + + // lock the head of the chain using CAS to -1 (-1 as magical value meaning LOCKED) + uint64_t first_idx = atomic_load_explicit(first_ptr, memory_order_relaxed); + for (;;) { + while (first_idx == (uint64_t) -1) { + // already locked, spin-wait until unlocked + first_idx = atomic_load_explicit(first_ptr, memory_order_relaxed); + } + // not locked; check if not 0 (that would mean data is not in the hash table) + if (first_idx == 0) return 0; + // OK, use CAS to lock the chain + if (atomic_compare_exchange_strong(first_ptr, 
&first_idx, (uint64_t) -1)) { + break; + } + } + + // set next_idx to the next bucket in the chain + uint64_t next_idx = atomic_load_explicit(dbs->table + 2 * d_idx + 1, memory_order_relaxed); + next_idx &= MASK_INDEX; + + if (first_idx == d_idx) { + // simple case: the head is d_idx + atomic_store_explicit(first_ptr, next_idx, memory_order_seq_cst); + return 1; + } else { + // the head is not d_idx, so follow the chain... + uint64_t idx = first_idx; + for (;;) { + if (idx == 0) { + // if idx equals 0, then the item was not in the hash table. return 0. + // for example it was never created, or already removed... + atomic_store_explicit(first_ptr, first_idx, memory_order_seq_cst); + return 0; + } + + _Atomic (uint64_t) *chain_ptr = dbs->table + 2 * idx + 1; + uint64_t chain = atomic_load_explicit(chain_ptr, memory_order_relaxed); + idx = chain & MASK_INDEX; + + if (idx == d_idx) { // found our predecessor + // update the chain + atomic_store_explicit(chain_ptr, (chain & MASK_HASH) | next_idx, memory_order_seq_cst); + // unlock + atomic_store_explicit(first_ptr, first_idx, memory_order_seq_cst); + return 1; + } + } + } +} + +/** + * Clear a single bucket data. 
+ *
+ * For a custom bucket the destroy callback is run on the stored pair and the custom
+ * bit is cleared; only then is the bucket released in bitmap2. Releasing last keeps the
+ * payload from being handed out again while it is still being read, and clearing the
+ * custom bit keeps llmsset_destroy_unmarked from destroying the same pair a second time
+ * (and keeps a later non-custom node in this bucket from being hashed with hash_cb).
+ *
+ * @param dbs   the table
+ * @param index index of the data bucket to free
+ */
+void llmsset_clear_one_data(llmsset_t dbs, uint64_t index)
+{
+    if (is_custom_bucket(dbs, index)) {
+        uint64_t * d_ptr = ((uint64_t *) dbs->data) + 2 * index;
+        dbs->destroy_cb(d_ptr[0], d_ptr[1]);
+        // Clear the custom bit atomically: the word is shared by 64 buckets.
+        // NOTE(review): presumably several workers may free neighbouring buckets at the
+        // same time during reordering gc — confirm against the callers.
+        uint64_t * c_ptr = dbs->bitmapc + (index / 64);
+        uint64_t c_mask = 0x8000000000000000LL >> (index & 63);
+        __atomic_fetch_and(c_ptr, ~c_mask, __ATOMIC_RELAXED);
+    }
+    release_data_bucket(dbs, index);
+}
+
+/**
+ * Allocate and initialise a table.
+ *
+ * Any invalid argument or failed allocation prints a message to stderr and calls exit(1).
+ *
+ * @param initial_size initial number of buckets; at least 512, at most max_size, and a
+ *                     power of two when LLMSSET_MASK is enabled
+ * @param max_size     maximum number of buckets; a power of two when LLMSSET_MASK is enabled
+ * @return the new table
+ */
+llmsset_t
+llmsset_create(size_t initial_size, size_t max_size)
+{
+    llmsset_t dbs = (llmsset_t) alloc_aligned(sizeof(struct llmsset));
+    if (dbs == 0) {
+        fprintf(stderr, "llmsset_create: Unable to allocate memory!\n");
+        exit(1);
+    }
+
+#if LLMSSET_MASK
+    /* Check if initial_size and max_size are powers of 2 */
+    if (__builtin_popcountll(initial_size) != 1) {
+        fprintf(stderr, "llmsset_create: initial_size is not a power of 2!\n");
+        exit(1);
+    }
+
+    if (__builtin_popcountll(max_size) != 1) {
+        fprintf(stderr, "llmsset_create: max_size is not a power of 2!\n");
+        exit(1);
+    }
+#endif
+
+    if (initial_size > max_size) {
+        fprintf(stderr, "llmsset_create: initial_size > max_size!\n");
+        exit(1);
+    }
+
+    // minimum size is now 512 buckets (region size, but of course, n_workers * 512 is suggested as minimum)
+
+    if (initial_size < 512) {
+        fprintf(stderr, "llmsset_create: initial_size too small!\n");
+        exit(1);
+    }
+
+    dbs->max_size = max_size;
+    llmsset_set_size(dbs, initial_size);
+
+    /* This implementation of "resizable hash table" allocates the max_size table in virtual memory,
+       but only uses the "actual size" part in real memory */
+
+    dbs->table = (_Atomic (uint64_t) *) alloc_aligned(dbs->max_size * 16);
+    dbs->data = (uint8_t *) alloc_aligned(dbs->max_size * 16);
+
+    /* Also allocate bitmaps. Each region is 64*8 = 512 buckets.
+       Overhead of bitmap1: 1 bit per region of 512 buckets.
+       Overhead of bitmap2: 1 bit per bucket.
+       Overhead of bitmapc: 1 bit per bucket.
*/
+
+    dbs->bitmap1 = (_Atomic (uint64_t) *) alloc_aligned(dbs->max_size / (512 * 8));
+    dbs->bitmap2 = (_Atomic (uint64_t) *) alloc_aligned(dbs->max_size / 8);
+    dbs->bitmapc = (uint64_t *) alloc_aligned(dbs->max_size / 8);
+
+    if (dbs->table == 0 || dbs->data == 0 || dbs->bitmap1 == 0 || dbs->bitmap2 == 0 || dbs->bitmapc == 0) {
+        fprintf(stderr, "llmsset_create: Unable to allocate memory: %s!\n", strerror(errno));
+        exit(1);
+    }
+
+    // NOTE(review): madvise is normally a function rather than a macro, so defined(madvise)
+    // is presumably false and this hint is compiled out — confirm.
+#if defined(madvise) && defined(MADV_RANDOM)
+    madvise(dbs->table, dbs->max_size * 16, MADV_RANDOM);
+#endif
+
+    // forbid first two positions (index 0 and 1)
+    dbs->bitmap2[0] = 0xc000000000000000LL;
+
+    dbs->hash_cb = NULL;
+    dbs->equals_cb = NULL;
+    dbs->create_cb = NULL;
+    dbs->destroy_cb = NULL;
+
+    // yes, ugly. for now, we use a global thread-local value.
+    // that is a problem with multiple tables.
+    // so, for now, do NOT use multiple tables!!
+
+    INIT_THREAD_LOCAL(my_region);
+    TOGETHER(llmsset_reset_region);
+
+    // initialize hashtab
+    sylvan_init_hash();
+
+    return dbs;
+}
+
+/**
+ * Release every array owned by the table and then the table structure itself.
+ * The sizes passed to free_aligned mirror the sizes used by llmsset_create.
+ */
+void
+llmsset_free(llmsset_t dbs)
+{
+    free_aligned(dbs->table, dbs->max_size * 16);
+    free_aligned(dbs->data, dbs->max_size * 16);
+    free_aligned(dbs->bitmap1, dbs->max_size / (512 * 8));
+    free_aligned(dbs->bitmap2, dbs->max_size / 8);
+    free_aligned(dbs->bitmapc, dbs->max_size / 8);
+    free_aligned(dbs, sizeof(struct llmsset));
+}
+
+/**
+ * Clear the table: first the data bitmaps, then the hash array.
+ */
+VOID_TASK_IMPL_1(llmsset_clear, llmsset_t, dbs)
+{
+    CALL(llmsset_clear_data, dbs);
+    CALL(llmsset_clear_hashes, dbs);
+}
+
+/**
+ * Mark all data buckets as free: clears bitmap1 and bitmap2, re-reserves indices 0 and 1
+ * and resets the workers' regions. bitmapc (the custom bits) is not touched here.
+ */
+VOID_TASK_IMPL_1(llmsset_clear_data, llmsset_t, dbs)
+{
+    clear_aligned(dbs->bitmap1, dbs->max_size / (512 * 8));
+    clear_aligned(dbs->bitmap2, dbs->max_size / 8);
+
+    // forbid first two positions (index 0 and 1)
+    dbs->bitmap2[0] = 0xc000000000000000LL;
+
+    TOGETHER(llmsset_reset_region);
+}
+
+/**
+ * Clear the whole hash array (dbs->table, max_size * 16 bytes).
+ */
+VOID_TASK_IMPL_1(llmsset_clear_hashes, llmsset_t, dbs)
+{
+    clear_aligned(dbs->table, dbs->max_size * 16);
+}
+
+/**
+ * Test whether bucket `index` is marked (its bit in bitmap2 is set).
+ * @return 1 when marked, 0 otherwise
+ */
+int
+llmsset_is_marked(const llmsset_t dbs, uint64_t index)
+{
+    _Atomic (uint64_t) *ptr = dbs->bitmap2 + (index / 64);
+    uint64_t mask = 0x8000000000000000LL >> (index & 63);
+    return (atomic_load_explicit(ptr, memory_order_relaxed) & mask) ? 1 : 0;
+}
+
+/**
+ * Mark bucket `index` by setting its bit in bitmap2 with a CAS loop.
+ * @return 1 when this call set the bit, 0 when the bit was already set
+ */
+int
+llmsset_mark(const llmsset_t dbs, uint64_t index)
+{
+    _Atomic (uint64_t) *ptr = dbs->bitmap2 + (index / 64);
+    uint64_t mask = 0x8000000000000000LL >> (index & 63);
+    for (;;) {
+        uint64_t v = *ptr;
+        if (v & mask) return 0;
+        if (atomic_compare_exchange_weak(ptr, &v, v | mask)) return 1;
+    }
+}
+
+/**
+ * Rehash every marked bucket in [first, first + count).
+ * Ranges larger than 512 buckets are split in two halves that run as separate tasks.
+ * @return number of buckets for which llmsset_rehash_bucket returned 0
+ */
+TASK_3(int, llmsset_rehash_par, llmsset_t, dbs, size_t, first, size_t, count)
+{
+    if (count > 512) {
+        SPAWN(llmsset_rehash_par, dbs, first, count / 2);
+        int bad = CALL(llmsset_rehash_par, dbs, first + count / 2, count - count / 2);
+        return bad + SYNC(llmsset_rehash_par);
+    } else {
+        int bad = 0;
+        _Atomic (uint64_t) *ptr = dbs->bitmap2 + (first / 64);
+        uint64_t mask = 0x8000000000000000LL >> (first & 63);
+        for (size_t k = 0; k < count; k++) {
+            if (atomic_load_explicit(ptr, memory_order_relaxed) & mask) {
+                if (llmsset_rehash_bucket(dbs, first + k) == 0) bad++;
+            }
+            // move to the next bit; step to the next word once the mask has shifted out
+            mask >>= 1;
+            if (mask == 0) {
+                ptr++;
+                mask = 0x8000000000000000LL;
+            }
+        }
+        return bad;
+    }
+}
+
+/**
+ * Rehash all marked buckets of the table (indices 0 .. table_size - 1).
+ * @return number of buckets that could not be rehashed
+ */
+TASK_IMPL_1(int, llmsset_rehash, llmsset_t, dbs)
+{
+    return CALL(llmsset_rehash_par, dbs, 0, dbs->table_size);
+}
+
+/**
+ * Count the marked buckets in [first, first + count).
+ * Ranges larger than 512 buckets are split in two halves that run as separate tasks.
+ */
+TASK_3(size_t, llmsset_count_marked_par, llmsset_t, dbs, size_t, first, size_t, count)
+{
+    if (count > 512) {
+        size_t split = count / 2;
+        SPAWN(llmsset_count_marked_par, dbs, first, split);
+        size_t right = CALL(llmsset_count_marked_par, dbs, first + split, count - split);
+        size_t left = SYNC(llmsset_count_marked_par);
+        return left + right;
+    } else {
+        size_t result = 0;
+        _Atomic (uint64_t) *ptr = dbs->bitmap2 + (first / 64);
+        if (count == 512) {
+            // fast path: popcount of 8 whole words
+            // (presumably `first` is a multiple of 64 here — confirm)
+            result += __builtin_popcountll(atomic_load_explicit(ptr + 0, memory_order_relaxed));
+            result += __builtin_popcountll(atomic_load_explicit(ptr + 1, memory_order_relaxed));
+            result += __builtin_popcountll(atomic_load_explicit(ptr + 2, memory_order_relaxed));
+            result += __builtin_popcountll(atomic_load_explicit(ptr + 3, memory_order_relaxed));
+            result += __builtin_popcountll(atomic_load_explicit(ptr + 4, memory_order_relaxed));
+            result += __builtin_popcountll(atomic_load_explicit(ptr + 5, memory_order_relaxed));
+            result += __builtin_popcountll(atomic_load_explicit(ptr + 6, memory_order_relaxed));
+            result += __builtin_popcountll(atomic_load_explicit(ptr + 7, memory_order_relaxed));
+        } else {
+            // general path: test the bits one by one
+            uint64_t mask = 0x8000000000000000LL >> (first & 63);
+            for (size_t k = 0; k < count; k++) {
+                if (atomic_load_explicit(ptr, memory_order_relaxed) & mask) result += 1;
+                mask >>= 1;
+                if (mask == 0) {
+                    ptr++;
+                    mask = 0x8000000000000000LL;
+                }
+            }
+        }
+        return result;
+    }
+}
+
+/**
+ * Count all marked buckets of the table (indices 0 .. table_size - 1).
+ */
+TASK_IMPL_1(size_t, llmsset_count_marked, llmsset_t, dbs)
+{
+    return CALL(llmsset_count_marked_par, dbs, 0, dbs->table_size);
+}
+
+/**
+ * Run the destroy callback for every bucket in [first, first + count) that is not
+ * marked in bitmap2 but still has its custom bit set, and clear that custom bit.
+ * Ranges larger than 1024 buckets are split in two halves that run as separate tasks.
+ */
+VOID_TASK_3(llmsset_destroy_par, llmsset_t, dbs, size_t, first, size_t, count)
+{
+    if (count > 1024) {
+        size_t split = count / 2;
+        SPAWN(llmsset_destroy_par, dbs, first, split);
+        CALL(llmsset_destroy_par, dbs, first + split, count - split);
+        SYNC(llmsset_destroy_par);
+    } else {
+        for (size_t k = first; k < first + count; k++) {
+            _Atomic (uint64_t) *ptr2 = dbs->bitmap2 + (k / 64);
+            uint64_t * ptrc = dbs->bitmapc + (k / 64);
+            uint64_t mask = 0x8000000000000000LL >> (k & 63);
+
+            // if not marked but is custom
+            if ((*ptr2 & mask) == 0 && (*ptrc & mask)) {
+                uint64_t * d_ptr = ((uint64_t *) dbs->data) + 2 * k;
+                dbs->destroy_cb(d_ptr[0], d_ptr[1]);
+                // plain (non-atomic) clear of the custom bit
+                *ptrc &= ~mask;
+            }
+        }
+    }
+}
+
+/**
+ * Destroy all unmarked custom buckets of the table; does nothing when no destroy
+ * callback is installed.
+ */
+VOID_TASK_IMPL_1(llmsset_destroy_unmarked, llmsset_t, dbs)
+{
+    if (dbs->destroy_cb == NULL) return; // no custom function
+    CALL(llmsset_destroy_par, dbs, 0, dbs->table_size);
+}
+
+/**
+ * Set custom functions
+ */
+void llmsset_set_custom(const llmsset_t dbs, llmsset_hash_cb hash_cb, llmsset_equals_cb equals_cb,
+                        llmsset_create_cb create_cb, llmsset_destroy_cb destroy_cb)
+{
+    dbs->hash_cb = hash_cb;
+    dbs->equals_cb = equals_cb;
+
dbs->create_cb = create_cb; + dbs->destroy_cb = destroy_cb; +} \ No newline at end of file diff --git a/src/sylvan_varswap.c b/src/sylvan_varswap.c new file mode 100644 index 00000000..1d4761ee --- /dev/null +++ b/src/sylvan_varswap.c @@ -0,0 +1,468 @@ +#include +#include + +#define TASK_SIZE 1024 +/** + * @brief Check if a node is dependent on node with label or +1 + */ +static inline int is_node_dependent_on(mtbddnode_t node, BDDVAR var) +{ + MTBDD f0 = mtbddnode_getlow(node); + if (!mtbdd_isleaf(f0)) { + uint32_t vf0 = mtbdd_getvar(f0); + if (vf0 == var || vf0 == var + 1) return 1; + } + MTBDD f1 = mtbddnode_gethigh(node); + if (!mtbdd_isleaf(f1)) { + uint32_t vf1 = mtbdd_getvar(f1); + if (vf1 == var || vf1 == var + 1) return 1; + } + return 0; +} + +#if !SYLVAN_USE_LINEAR_PROBING +/*! + \brief Adjacent variable swap phase 0 (Chaining compatible) + \details Clear hashes of nodes with var and var+1, Removes exactly the nodes + that will be changed from the hash table. +*/ +VOID_TASK_DECL_6(sylvan_varswap_p0, uint32_t, size_t, size_t, _Atomic (reorder_result_t) *, roaring_bitmap_t*, + roaring_bitmap_t*) + +#define sylvan_varswap_p0(pos, result, ids, p1) CALL(sylvan_varswap_p0, pos, 0, nodes->table_size, result, ids, p1) +#endif + +/*! + @brief Adjacent variable swap phase 2 + @details Handle all trivial cases where no node is created, mark cases that are not trivial. + @return number of nodes that were marked +*/ +VOID_TASK_DECL_6(sylvan_varswap_p1, uint32_t, size_t, size_t, _Atomic (reorder_result_t) *, roaring_bitmap_t*, + roaring_bitmap_t*) + +#define sylvan_varswap_p1(pos, result, p1, p2) CALL(sylvan_varswap_p1, pos, 0, nodes->table_size, result, p1, p2) + +/*! + @brief Adjacent variable swap phase 2 + @details Handle the not so trivial cases. 
(creates new nodes) +*/ +VOID_TASK_DECL_5(sylvan_varswap_p2, size_t, size_t, _Atomic (reorder_result_t) *, roaring_bitmap_t*, roaring_bitmap_t*) + +#define sylvan_varswap_p2(result, ids, p2) CALL(sylvan_varswap_p2, 0, nodes->table_size, result, ids, p2) + +/*! + @brief Adjacent variable swap phase 3 + @details Recovery phase, restore the nodes that were marked in phase 1. +*/ +VOID_TASK_DECL_3(sylvan_varswap_recovery, uint32_t, _Atomic (reorder_result_t) *, roaring_bitmap_t*) + +#define sylvan_varswap_recovery(pos, result, node_ids) CALL(sylvan_varswap_recovery, pos, result, node_ids) + + +TASK_IMPL_1(reorder_result_t, sylvan_varswap, uint32_t, pos) +{ + if (pos == sylvan_invalid) return SYLVAN_REORDER_NO_REGISTERED_VARS; + + if ((double) get_nodes_count() > (double) llmsset_get_size(nodes) * SYLVAN_REORDER_MIN_MEM_REQ) { + return SYLVAN_REORDER_NOT_ENOUGH_MEMORY; + } + + _Atomic (reorder_result_t) result = SYLVAN_REORDER_SUCCESS; + sylvan_stats_count(SYLVAN_RE_SWAP_COUNT); + + roaring_bitmap_t p2_ids; + roaring_bitmap_init_cleared(&p2_ids); + + /// Phase 0: clear hashes of nodes with and or all nodes if linear probing is used +#if SYLVAN_USE_LINEAR_PROBING + llmsset_clear_hashes(nodes); + /// Phase 1: handle all trivial cases where no node is created, add cases that are not trivial to + sylvan_varswap_p1(pos, &result, reorder_db->mrc.node_ids, &p2_ids); +#else + roaring_bitmap_t p1_ids; + roaring_bitmap_init_cleared(&p1_ids); + sylvan_varswap_p0(pos, &result, reorder_db->mrc.node_ids, &p1_ids); + if (sylvan_reorder_issuccess(result) == 0) return result; // fail fast + /// Phase 1: handle all trivial cases where no node is created, add cases that are not trivial to + sylvan_varswap_p1(pos, &result, reorder_db->mrc.node_ids, &p2_ids); +#endif + + if (sylvan_reorder_issuccess(result) == 0) return result; // fail fast + + if (roaring_bitmap_get_cardinality(&p2_ids) > 0) { + /// Phase 2: handle the not so trivial cases (creates new nodes) + sylvan_varswap_p2(&result, 
&p2_ids, reorder_db->mrc.node_ids); + if (sylvan_reorder_issuccess(result) == 0) { + /// Phase 3: recovery + sylvan_varswap_recovery(pos, &result, reorder_db->mrc.node_ids); + } + } + +#if SYLVAN_USE_LINEAR_PROBING + // collect garbage (dead nodes) + mrc_gc(&reorder_db->mrc, reorder_db->mrc.node_ids); +#else + // collect garbage (dead nodes) + mrc_gc(&reorder_db->mrc, &p1_ids); +#endif + + levels_swap(&reorder_db->levels, pos, pos + 1); + + return result; +} + +#if !SYLVAN_USE_LINEAR_PROBING +/** + * Implementation of the zero phase of variable swapping. + * For all nodes, make and rehash. + * + * Removes exactly the nodes that will be changed from the hash table. + */ +VOID_TASK_IMPL_6(sylvan_varswap_p0, + uint32_t, var, + size_t, first, + size_t, count, + _Atomic (reorder_result_t)*, result, + roaring_bitmap_t*, node_ids, + roaring_bitmap_t*, p1_ids) +{ + if (count > TASK_SIZE) { + // standard reduction pattern with local roaring bitmaps collecting new node indices + size_t split = count / 2; + roaring_bitmap_t a; + roaring_bitmap_init_cleared(&a); + SPAWN(sylvan_varswap_p0, var, first, split, result, node_ids, &a); + roaring_bitmap_t b; + roaring_bitmap_init_cleared(&b); + CALL(sylvan_varswap_p0, var, first + split, count - split, result, node_ids, &b); + roaring_bitmap_or_inplace(p1_ids, &b); + roaring_bitmap_clear(&b); + SYNC(sylvan_varswap_p0); + roaring_bitmap_or_inplace(p1_ids, &a); + roaring_bitmap_clear(&a); + return; + } + roaring_uint32_iterator_t it; + roaring_init_iterator(node_ids, &it); + roaring_move_uint32_iterator_equalorlarger(&it, first); + + const size_t end = first + count; + while (it.has_value && it.current_value < end) { + if (atomic_load_explicit(result, memory_order_relaxed) != SYLVAN_REORDER_SUCCESS) return; // fail fast + size_t index = it.current_value; + roaring_advance_uint32_iterator(&it); + mtbddnode_t node = MTBDD_GETNODE(index); + if (mtbddnode_isleaf(node)) continue; // a leaf + uint32_t nvar = mtbddnode_getvariable(node); + if 
(nvar == var || nvar == (var + 1)) { + roaring_bitmap_add(p1_ids, index); + if (llmsset_clear_one_hash(nodes, index) < 0) { + atomic_store(result, SYLVAN_REORDER_P0_CLEAR_FAIL); + return; + } + } + } +} + +#endif + +/** + * Implementation of the first phase of variable swapping. + * For all nodes, set variable label to and rehash. + * For all nodes not depending on , set variable label to and rehash. + * For all nodes depending on , stay and mark. (no rehash) + * Returns number of marked nodes left. + * + * This algorithm is also used for the recovery phase 1. This is an identical + * phase, except marked nodes are unmarked. If the recovery flag is set, then only + * nodes are rehashed. + */ +VOID_TASK_IMPL_6(sylvan_varswap_p1, + uint32_t, var, + size_t, first, + size_t, count, + _Atomic (reorder_result_t)*, result, + roaring_bitmap_t*, p1_ids, + roaring_bitmap_t*, p2_ids) +{ + if (count > TASK_SIZE) { + size_t split = count / 2; + roaring_bitmap_t a; + roaring_bitmap_init_cleared(&a); + SPAWN(sylvan_varswap_p1, var, first, split, result, p1_ids, &a); + roaring_bitmap_t b; + roaring_bitmap_init_cleared(&b); + CALL(sylvan_varswap_p1, var, first + split, count - split, result, p1_ids, &b); + roaring_bitmap_or_inplace(p2_ids, &b); + roaring_bitmap_clear(&b); + SYNC(sylvan_varswap_p1); + roaring_bitmap_or_inplace(p2_ids, &a); + roaring_bitmap_clear(&a); + return; + } + + // initialize the iterator on stack to speed it up and bind lifetime to this scope + roaring_uint32_iterator_t it; + roaring_init_iterator(p1_ids, &it); + if (!roaring_move_uint32_iterator_equalorlarger(&it, first)) return; + + // standard reduction pattern with local variables to avoid hotspots + int var_diff = 0; + int var_plus_one_diff = 0; + + const size_t end = first + count; + while (it.has_value && it.current_value < end) { + if (atomic_load_explicit(result, memory_order_relaxed) != SYLVAN_REORDER_SUCCESS) { + return; // fail fast + } + + size_t index = it.current_value; + 
roaring_advance_uint32_iterator(&it); + + mtbddnode_t node = MTBDD_GETNODE(index); + uint32_t nvar = mtbddnode_getvariable(node); + + if (nvar == (var + 1)) { + // if , then replace with and rehash + var_diff++; + var_plus_one_diff--; + mtbddnode_setvariable(node, var); + if (llmsset_rehash_bucket(nodes, index) != 1) { + atomic_store(result, SYLVAN_REORDER_P1_REHASH_FAIL); + return; + } + continue; + } else if (nvar != var) { + continue; // not or + } + + if (mtbddnode_ismapnode(node)) { + MTBDD f0 = mtbddnode_getlow(node); + if (f0 == mtbdd_false) { + // we are at the end of a chain + var_plus_one_diff++; + var_diff--; + mtbddnode_setvariable(node, var + 1); + if (llmsset_rehash_bucket(nodes, index) != 1) { + atomic_store(result, SYLVAN_REORDER_P1_REHASH_FAIL); + return; + } + } else { + // not the end of a chain, so f0 is the next in chain + uint32_t vf0 = mtbdd_getvar(f0); + if (vf0 > var + 1) { + // next in chain wasn't ... + var_plus_one_diff++; + var_diff--; + mtbddnode_setvariable(node, var + 1); + if (llmsset_rehash_bucket(nodes, index) != 1) { + atomic_store(result, SYLVAN_REORDER_P1_REHASH_FAIL); + return; + } + } else { + // add for phase 2 + roaring_bitmap_add(p2_ids, index); + } + } + } else { + if (is_node_dependent_on(node, var)) { + // add for phase 2 + roaring_bitmap_add(p2_ids, index); + } else { + var_plus_one_diff++; + var_diff--; + mtbddnode_setvariable(node, var + 1); + if (llmsset_rehash_bucket(nodes, index) != 1) { + atomic_store(result, SYLVAN_REORDER_P1_REHASH_FAIL); + return; + } + } + } + } + + if (var_diff != 0) mrc_var_nnodes_add(&reorder_db->mrc, var, var_diff); + if (var_plus_one_diff != 0) mrc_var_nnodes_add(&reorder_db->mrc, var + 1, var_plus_one_diff); +} + +#define index(x) ((x) & SYLVAN_TABLE_MASK_INDEX) +/** + * Implementation of second phase of variable swapping. 
+ * For all nodes marked in the first phase:
+ * - determine F00, F01, F10, F11
+ * - obtain nodes F0 [var+1,F00,F10] and F1 [var+1, F01, F11]
+ *   (and F0<>F1, trivial proof)
+ * - in-place substitute outgoing edges with new F0 and F1
+ * - and rehash into hash table
+ * Nothing is returned: a failed node creation is stored in <result>, and the
+ * indices of newly created nodes are added to <node_ids>.
+ */
+VOID_TASK_IMPL_5(sylvan_varswap_p2,
+                 size_t, first,
+                 size_t, count,
+                 _Atomic (reorder_result_t)*, result,
+                 roaring_bitmap_t*, p2_ids,
+                 roaring_bitmap_t*, node_ids)
+{
+    if (count > TASK_SIZE) {
+        size_t split = count / 2;
+        // standard reduction pattern with local roaring bitmaps collecting new node indices
+        roaring_bitmap_t a;
+        roaring_bitmap_init_cleared(&a);
+        SPAWN(sylvan_varswap_p2, first, split, result, p2_ids, &a);
+        roaring_bitmap_t b;
+        roaring_bitmap_init_cleared(&b);
+        CALL(sylvan_varswap_p2, first + split, count - split, result, p2_ids, &b);
+        roaring_bitmap_or_inplace(node_ids, &b);
+        roaring_bitmap_clear(&b);
+        SYNC(sylvan_varswap_p2);
+        roaring_bitmap_or_inplace(node_ids, &a);
+        roaring_bitmap_clear(&a);
+        return;
+    }
+
+    roaring_uint32_iterator_t it;
+    roaring_init_iterator(p2_ids, &it);
+    if (!roaring_move_uint32_iterator_equalorlarger(&it, first)) return;
+
+    int new_nnodes = 0;
+    unsigned short var_new_nnodes[reorder_db->levels.count];
+    memset(&var_new_nnodes, 0x00, sizeof(unsigned short) * reorder_db->levels.count);
+
+    const size_t end = first + count;
+    while (it.has_value && it.current_value < end) {
+        if (atomic_load_explicit(result, memory_order_relaxed) != SYLVAN_REORDER_SUCCESS) {
+            return; // fail fast
+        }
+        size_t index = it.current_value;
+        roaring_advance_uint32_iterator(&it);
+        mtbddnode_t node = MTBDD_GETNODE(index);
+
+        BDDVAR var = mtbddnode_getvariable(node);
+        if (mtbddnode_ismapnode(node)) {
+            MTBDD newf, f1, f0, f01, f00;
+            int created = 0;
+
+            // it is a map node, swap places with next in chain
+            f0 = mtbddnode_getlow(node);
+            f1 = mtbddnode_gethigh(node);
+            mtbddnode_t n0 = MTBDD_GETNODE(f0);
+            f00 = node_getlow(f0, n0);
+            f01 = node_gethigh(f0, n0);
+
+            newf = mtbdd_varswap_makemapnode(var + 1, f00, f1, &created);
+            if (newf == mtbdd_invalid) {
+                atomic_store(result, SYLVAN_REORDER_P2_MAPNODE_CREATE_FAIL);
+                return;
+            }
+            mtbddnode_makemapnode(node, var, newf, f01); // link the node built above; the ref counts below move from f0 to newf
+            llmsset_rehash_bucket(nodes, index);
+
+            mrc_ref_nodes_add(&reorder_db->mrc, index(f0), -1);
+            mrc_ref_nodes_add(&reorder_db->mrc, index(newf), 1);
+
+            if (created) {
+                new_nnodes++;
+                var_new_nnodes[var + 1]++;
+                mrc_ref_nodes_add(&reorder_db->mrc, index(f00), 1);
+                mrc_ref_nodes_add(&reorder_db->mrc, index(f1), 1);
+                roaring_bitmap_add(node_ids, index(newf));
+            }
+        } else {
+            MTBDD newf1, newf0, f1, f0, f11, f10, f01, f00;
+            int created0 = 0, created1 = 0; // both flags start cleared before being passed as out-parameters
+
+            // obtain cofactors
+            f0 = mtbddnode_getlow(node);
+            f1 = mtbddnode_gethigh(node);
+
+            f01 = f00 = f0;
+            if (!mtbdd_isleaf(f0) && mtbdd_getvar(f0) == var) {
+                f00 = mtbdd_getlow(f0);
+                f01 = mtbdd_gethigh(f0);
+            }
+
+            f11 = f10 = f1;
+            if (!mtbdd_isleaf(f1) && mtbdd_getvar(f1) == var) {
+                f10 = mtbdd_getlow(f1);
+                f11 = mtbdd_gethigh(f1);
+            }
+
+            // The new nodes required at level i (i.e., (xi, F01, F11) and (xi, F00, F10)) may be
+            // degenerate nodes (e.g., in the case that F11 = F01 or F10 == F00),
+            // or may already exist in the DAG as required to implement other functions.
+
+            newf1 = mtbdd_varswap_makenode(var + 1, f01, f11, &created1);
+            if (newf1 == mtbdd_invalid) {
+                atomic_store(result, SYLVAN_REORDER_P2_CREATE_FAIL);
+                return;
+            }
+
+            newf0 = mtbdd_varswap_makenode(var + 1, f00, f10, &created0);
+            if (newf0 == mtbdd_invalid) {
+                atomic_store(result, SYLVAN_REORDER_P2_CREATE_FAIL);
+                return;
+            }
+
+            // update node, which also removes the mark
+            mtbddnode_makenode(node, var, newf0, newf1);
+            llmsset_rehash_bucket(nodes, index);
+
+            mrc_ref_nodes_add(&reorder_db->mrc, index(f1), -1);
+            mrc_ref_nodes_add(&reorder_db->mrc, index(newf1), 1);
+
+            if (created1) {
+                new_nnodes++;
+                var_new_nnodes[var + 1]++;
+                mrc_ref_nodes_add(&reorder_db->mrc, index(f11), 1);
+                mrc_ref_nodes_add(&reorder_db->mrc, index(f01), 1);
+                roaring_bitmap_add(node_ids, index(newf1));
+            }
+
+            mrc_ref_nodes_add(&reorder_db->mrc, index(f0), -1);
+            mrc_ref_nodes_add(&reorder_db->mrc, index(newf0), 1);
+            if (created0) {
+                new_nnodes++;
+                var_new_nnodes[var + 1]++;
+                mrc_ref_nodes_add(&reorder_db->mrc, index(f00), 1);
+                mrc_ref_nodes_add(&reorder_db->mrc, index(f10), 1);
+                roaring_bitmap_add(node_ids, index(newf0));
+            }
+        }
+
+    }
+
+    if (new_nnodes > 0) mrc_nnodes_add(&reorder_db->mrc, new_nnodes);
+    for (size_t i = 0; i < reorder_db->levels.count; ++i) {
+        if (var_new_nnodes[i] > 0) mrc_var_nnodes_add(&reorder_db->mrc, i, var_new_nnodes[i]);
+    }
+}
+
+VOID_TASK_IMPL_3(sylvan_varswap_recovery, uint32_t, pos, _Atomic (reorder_result_t)*, result, roaring_bitmap_t*, node_ids)
+{
+    printf("\nReordering: Running recovery after running out of memory...\n");
+
+    roaring_bitmap_t p2_ids;
+    roaring_bitmap_init_cleared(&p2_ids);
+
+#if SYLVAN_USE_LINEAR_PROBING
+    // clear the entire table
+    llmsset_clear_hashes(nodes);
+    // at this point we already have nodes marked from P2 so we will unmark them now in P1
+    sylvan_varswap_p1(pos, result, node_ids, &p2_ids);
+#else
+    roaring_bitmap_t p1_ids;
+    roaring_bitmap_init_cleared(&p1_ids);
+    // clear hashes of nodes with <var> and <var+1>
+    sylvan_varswap_p0(pos, result, node_ids, &p1_ids);
+    // at this point we already have nodes marked from P2 so we will unmark them now in P1 (skipped when P0 failed)
+    if (sylvan_reorder_issuccess(*result)) sylvan_varswap_p1(pos, result, &p1_ids, &p2_ids);
+    roaring_bitmap_clear(&p1_ids); // release the local bitmap, as the varswap tasks do for theirs
+#endif
+
+    // do the not so trivial cases (but won't create new nodes this time); skipped after a failure
+    if (sylvan_reorder_issuccess(*result) && roaring_bitmap_get_cardinality(&p2_ids) > 0) {
+        sylvan_varswap_p2(result, &p2_ids, reorder_db->mrc.node_ids);
+    }
+    // release the local bitmap on every path, including the failure paths
+    roaring_bitmap_clear(&p2_ids);
+}
\ No newline at end of file
diff --git a/src/sylvan_varswap.h b/src/sylvan_varswap.h
new file mode 100644
index 00000000..05860492
--- /dev/null
+++ b/src/sylvan_varswap.h
@@ -0,0 +1,18 @@
+#ifndef SYLVAN_SYLVAN_VAR_SWAP_H
+#define SYLVAN_SYLVAN_VAR_SWAP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+    /**
+     * @brief Swaps two consecutive variables in the entire forest.
+ */ +TASK_DECL_1(reorder_result_t, sylvan_varswap, uint32_t) +#define sylvan_varswap(p) CALL(sylvan_varswap, p) + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif //SYLVAN_SYLVAN_VAR_SWAP_H diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 343bcfb1..e3c37804 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,23 +1,20 @@ include(CTest) -add_executable(test_basic) -target_sources(test_basic PRIVATE test_basic.c) -target_link_libraries(test_basic PRIVATE sylvan::sylvan) -target_compile_features(test_basic PRIVATE c_std_11) -target_compile_options(test_basic PRIVATE -Wall -Wextra -Werror -Wno-deprecated) +macro(set_compilation_settings NAME) + target_include_directories(${NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) + target_link_libraries(${NAME} PRIVATE sylvan::sylvan) + target_compile_features(${NAME} PRIVATE c_std_11 cxx_std_11) + target_compile_options(${NAME} PRIVATE -Wall -Wextra -Werror -Wno-deprecated -Wno-unused) + add_test(${NAME} ${NAME}) +endmacro(set_compilation_settings) -add_executable(test_cxx) -target_sources(test_cxx PRIVATE test_cxx.cpp) -target_link_libraries(test_cxx PRIVATE sylvan::sylvan) -target_compile_features(test_cxx PRIVATE cxx_std_11) -target_compile_options(test_cxx PRIVATE -Wall -Wextra -Werror -Wno-deprecated) +macro(add_test_file NAME SOURCE) + add_executable(${NAME} ${SOURCE}) + set_compilation_settings(${NAME}) +endmacro(add_test_file) -add_executable(test_zdd) -target_sources(test_zdd PRIVATE test_zdd.c) -target_link_libraries(test_zdd PRIVATE sylvan::sylvan) -target_compile_features(test_zdd PRIVATE c_std_11) -target_compile_options(test_zdd PRIVATE -Wall -Wextra -Werror -Wno-deprecated) - -add_test(test_basic test_basic) -add_test(test_cxx test_cxx) -add_test(test_zdd test_zdd) +add_test_file(test_basic test_basic.c) +add_test_file(test_cxx test_cxx.cpp) +add_test_file(test_zdd test_zdd.c) +add_test_file(test_bitmap test_bitmap.c) +add_test_file(test_reorder test_reorder.c) \ No newline at end of 
file
diff --git a/test/test_bitmap.c b/test/test_bitmap.c
new file mode 100644
index 00000000..e6fe5227
--- /dev/null
+++ b/test/test_bitmap.c
@@ -0,0 +1,176 @@
+#include
+#include "test_assert.h"
+#include
+#include
+
+int test_forward_iterator(size_t i, size_t j, size_t size)
+{
+    bitmap_t bitmap = {
+        .buckets = NULL,
+        .size = 0
+    };
+    bitmap_init(&bitmap, size);
+
+    for (size_t k = i; k < j; k++) {
+        bitmap_set(&bitmap, k);
+    }
+
+    for (size_t k = i; k < j; k++) {
+        assert(bitmap_get(&bitmap, k));
+    }
+
+    test_assert(bitmap_first(&bitmap) == i);
+
+    size_t k = i;
+    size_t index = bitmap_first(&bitmap);
+
+    while (index != npos) {
+        test_assert(index == k);
+        index = bitmap_next(&bitmap, index);
+        k++;
+    }
+
+    test_assert(bitmap_count(&bitmap) == j - i);
+
+    bitmap_deinit(&bitmap);
+
+    return 0;
+}
+
+int test_backwards_iterator(size_t i, size_t j, size_t size)
+{
+    bitmap_t bitmap = {
+        .buckets = NULL,
+        .size = 0
+    };
+    bitmap_init(&bitmap, size);
+
+    for (size_t k = i; k < j; k++) {
+        bitmap_set(&bitmap, k);
+    }
+
+    for (size_t k = i; k < j; k++) {
+        assert(bitmap_get(&bitmap, k));
+    }
+
+    test_assert(bitmap_last(&bitmap) == j);
+
+    size_t k = j;
+    size_t index = bitmap_last(&bitmap);
+    while (index != npos) {
+        test_assert(index == k);
+        index = bitmap_prev(&bitmap, index);
+        k--;
+    }
+
+    test_assert(bitmap_count(&bitmap) == j - i);
+
+    bitmap_deinit(&bitmap);
+
+    return 0;
+}
+
+int test_atomic_forward_iterator(size_t i, size_t j, size_t size)
+{
+    atomic_bitmap_t bitmap = {
+        .container = NULL,
+        .size = 0
+    };
+    atomic_bitmap_init(&bitmap, size);
+
+    for (size_t k = i; k < j; k++) {
+        atomic_bitmap_set(&bitmap, k, memory_order_seq_cst);
+    }
+
+    for (size_t k = i; k < j; k++) {
+        assert(atomic_bitmap_get(&bitmap, k, memory_order_seq_cst));
+    }
+
+    test_assert(atomic_bitmap_first(&bitmap) == i);
+
+    size_t k = i;
+    size_t index = atomic_bitmap_first(&bitmap);
+    while (index != npos) {
+        test_assert(index == k);
+        index = atomic_bitmap_next(&bitmap, index);
+        k++;
+    }
+
+    atomic_bitmap_deinit(&bitmap);
+
+    return 0;
+}
+
+int test_atomic_backwards_iterator(size_t i, size_t j, size_t size)
+{
+    atomic_bitmap_t bitmap = {
+        .container = NULL,
+        .size = 0
+    };
+    atomic_bitmap_init(&bitmap, size);
+
+    for (size_t k = i; k < j; k++) {
+        atomic_bitmap_set(&bitmap, k, memory_order_relaxed);
+    }
+
+    for (size_t k = i; k < j; k++) {
+        assert(atomic_bitmap_get(&bitmap, k, memory_order_seq_cst));
+    }
+
+    test_assert(atomic_bitmap_last(&bitmap) == j);
+
+    size_t k = j;
+    size_t index = atomic_bitmap_last(&bitmap);
+    while (index != npos) {
+        test_assert(index == k);
+        index = atomic_bitmap_prev(&bitmap, index);
+        k--;
+    }
+
+    atomic_bitmap_deinit(&bitmap);
+
+    return 0;
+}
+
+static inline size_t _rand()
+{
+    return rand() % 7919; // some not small prime number
+}
+
+int runtests(size_t ntests)
+{
+    printf("test_forward_iterator\n");
+    for (size_t t = 0; t < ntests; t++) { // separate counter: the range end j must not clobber it
+        size_t i = _rand();
+        size_t j = i + _rand() + 1; // +1 keeps the range non-empty, as the first/last checks assume
+        size_t size = j + 10;
+        if (test_forward_iterator(i, j, size)) return 1;
+    }
+    printf("test_backwards_iterator\n");
+    for (size_t t = 0; t < ntests; t++) {
+        size_t i = _rand();
+        size_t j = i + _rand() + 1;
+        size_t size = j + 10;
+        if (test_backwards_iterator(i, j, size)) return 1;
+    }
+    printf("test_atomic_forward_iterator\n");
+    for (size_t t = 0; t < ntests; t++) {
+        size_t i = _rand();
+        size_t j = i + _rand() + 1;
+        size_t size = j + 10;
+        if (test_atomic_forward_iterator(i, j, size)) return 1;
+    }
+    printf("test_atomic_backwards_iterator\n");
+    for (size_t t = 0; t < ntests; t++) {
+        size_t i = _rand();
+        size_t j = i + _rand() + 1;
+        size_t size = j + 10;
+        if (test_atomic_backwards_iterator(i, j, size)) return 1;
+    }
+    return 0;
+}
+
+int main()
+{
+    return runtests(100);
+}
\ No newline at end of file
diff --git a/test/test_reorder.c b/test/test_reorder.c
new file mode 100644
index 00000000..f3c1b1cc
--- /dev/null
+++ b/test/test_reorder.c
@@ -0,0 +1,659 @@
+#include
+#include
+#include
+
+#include
+#include
+#include "test_assert.h"
+
+/* Obtain current wallclock time */
+static double
+wctime()
+{
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return (tv.tv_sec + 1E-6 * tv.tv_usec);
+}
+
+static double t_start;
+#define INFO(s, ...) fprintf(stdout, "\r[% 8.2f] " s, wctime()-t_start, ##__VA_ARGS__)
+#define Abort(s, ...) { fprintf(stderr, "\r[% 8.2f] " s, wctime()-t_start, ##__VA_ARGS__); exit(-1); }
+
+void _sylvan_start();
+
+void _sylvan_quit();
+
+#define create_example_bdd(is_optimal) RUN(create_example_bdd, is_optimal)
+TASK_1(BDD, create_example_bdd, size_t, is_optimal)
+{
+// BDD is from the paper:
+// Randal E. Bryant Graph-Based Algorithms for Boolean Function Manipulation,
+// IEEE Transactions on Computers, 1986 http://www.cs.cmu.edu/~bryant/pubdir/ieeetc86.pdf
+
+    // the variable indexing is relative to the current level
+    BDD v0 = sylvan_ithvar(0);
+    BDD v1 = sylvan_ithvar(1);
+    BDD v2 = sylvan_ithvar(2);
+    BDD v3 = sylvan_ithvar(3);
+    BDD v4 = sylvan_ithvar(4);
+    BDD v5 = sylvan_ithvar(5);
+
+    if (is_optimal) {
+        // optimal order 0, 1, 2, 3, 4, 5
+        // minimum 8 nodes including 2 terminal nodes
+        return sylvan_or(sylvan_and(v0, v1), sylvan_or(sylvan_and(v2, v3), sylvan_and(v4, v5)));
+    } else {
+        // not optimal order 0, 3, 1, 4, 2, 5
+        // minimum 16 nodes including 2 terminal nodes
+        return sylvan_or(sylvan_and(v0, v3), sylvan_or(sylvan_and(v1, v4), sylvan_and(v2, v5)));
+    }
+}
+
+#define create_example_map(is_optimal) RUN(create_example_map, is_optimal)
+TASK_1(BDDMAP, create_example_map, size_t, is_optimal)
+{
+    BDDMAP map = sylvan_map_empty();
+    BDD bdd = create_example_bdd(is_optimal);
+    map = sylvan_map_add(map, 0, bdd);
+    return map;
+}
+
+TASK_0(int, test_varswap)
+{
+    // we need to delete all data so we reset sylvan
+    _sylvan_quit();
+    _sylvan_start();
+
+    /* test ithvar, switch 6 and 7 */
+    BDD one = sylvan_ithvar(6);
+    BDD two = sylvan_ithvar(7);
+
+    test_assert(levels_level_to_order(&reorder_db->levels, 6) == 6);
+    test_assert(sylvan_level_to_order(7) == 7);
+    test_assert(sylvan_order_to_level(6) == 6);
+    test_assert(sylvan_order_to_level(7) == 7);
+    test_assert(one == sylvan_ithvar(6));
+    test_assert(two == sylvan_ithvar(7));
+    test_assert(mtbdd_getvar(one) == 6);
+    test_assert(mtbdd_getvar(two) == 7);
+
+    sylvan_pre_reorder(SYLVAN_REORDER_SIFT);
+
+    test_assert(sylvan_varswap(6) == SYLVAN_REORDER_SUCCESS);
+
+    sylvan_post_reorder();
+
+    test_assert(sylvan_level_to_order(7) == 6);
+    test_assert(sylvan_level_to_order(6) == 7);
+    test_assert(sylvan_order_to_level(7) == 6);
+    test_assert(sylvan_order_to_level(6) == 7);
+    test_assert(mtbdd_getvar(one) == 7);
+    test_assert(mtbdd_getvar(two) == 6);
+    test_assert(one == sylvan_ithvar(7));
+    test_assert(two == sylvan_ithvar(6));
+
+    return 0;
+}
+
+TASK_0(int, test_varswap_down)
+{
+    // we need to delete all data so we reset sylvan
+    _sylvan_quit();
+    _sylvan_start();
+
+    MTBDD zero = sylvan_ithvar(0);
+    MTBDD one = sylvan_ithvar(1);
+    MTBDD two = sylvan_ithvar(2);
+    MTBDD three = sylvan_ithvar(3);
+
+    /* swap down manually var 0 to level 3 */
+    test_assert(sylvan_level_to_order(0) == 0);
+    test_assert(sylvan_level_to_order(1) == 1);
+    test_assert(sylvan_level_to_order(2) == 2);
+    test_assert(sylvan_level_to_order(3) == 3);
+
+    test_assert(sylvan_order_to_level(0) == 0);
+    test_assert(sylvan_order_to_level(1) == 1);
+    test_assert(sylvan_order_to_level(2) == 2);
+    test_assert(sylvan_order_to_level(3) == 3);
+
+    test_assert(zero == sylvan_ithvar(0));
+    test_assert(one == sylvan_ithvar(1));
+    test_assert(two == sylvan_ithvar(2));
+    test_assert(three == sylvan_ithvar(3));
+
+    sylvan_pre_reorder(SYLVAN_REORDER_SIFT);
+
+    // (0), 1, 2, 3
+    test_assert(sylvan_varswap(0) == SYLVAN_REORDER_SUCCESS);
+    test_assert(sylvan_varswap(1) == SYLVAN_REORDER_SUCCESS);
+    test_assert(sylvan_varswap(2) == SYLVAN_REORDER_SUCCESS);
+    // 1, 2, 3, (0)
+
+    sylvan_post_reorder();
+
+    test_assert(sylvan_level_to_order(0) == 1);
+    test_assert(sylvan_level_to_order(1) == 2);
+    test_assert(sylvan_level_to_order(2) == 3);
+    test_assert(sylvan_level_to_order(3) == 0);
+
+    test_assert(sylvan_order_to_level(1) == 0);
+    test_assert(sylvan_order_to_level(2) == 1);
+    test_assert(sylvan_order_to_level(3) == 2);
+    test_assert(sylvan_order_to_level(0) == 3);
+
+    test_assert(zero == sylvan_ithvar(3));
+    test_assert(one == sylvan_ithvar(0));
+    test_assert(two == sylvan_ithvar(1));
+    test_assert(three == sylvan_ithvar(2));
+
+    return 0;
+}
+
+TASK_0(int, test_varswap_up)
+{
+    // we need to delete all data so we reset sylvan
+    _sylvan_quit();
+    _sylvan_start();
+
+
+    MTBDD zero = sylvan_ithvar(0);
+    MTBDD one = sylvan_ithvar(1);
+    MTBDD two = sylvan_ithvar(2);
+    MTBDD three = sylvan_ithvar(3);
+
+    /* swap up manually var 3 to level 0 */
+    test_assert(zero == sylvan_ithvar(0));
+    test_assert(one == sylvan_ithvar(1));
+    test_assert(two == sylvan_ithvar(2));
+    test_assert(three == sylvan_ithvar(3));
+
+    sylvan_pre_reorder(SYLVAN_REORDER_SIFT);
+
+    // 0, 1, 2, (3)
+    test_assert(sylvan_varswap(2) == SYLVAN_REORDER_SUCCESS);
+    test_assert(sylvan_varswap(1) == SYLVAN_REORDER_SUCCESS);
+    test_assert(sylvan_varswap(0) == SYLVAN_REORDER_SUCCESS);
+    // (3), 0, 1, 2
+
+    sylvan_post_reorder();
+
+    test_assert(sylvan_level_to_order(0) == 3);
+    test_assert(sylvan_level_to_order(1) == 0);
+    test_assert(sylvan_level_to_order(2) == 1);
+    test_assert(sylvan_level_to_order(3) == 2);
+
+    test_assert(sylvan_order_to_level(3) == 0);
+    test_assert(sylvan_order_to_level(0) == 1);
+    test_assert(sylvan_order_to_level(1) == 2);
+    test_assert(sylvan_order_to_level(2) == 3);
+
+    test_assert(zero == sylvan_ithvar(1));
+    test_assert(one == sylvan_ithvar(2));
+    test_assert(two == sylvan_ithvar(3));
+    test_assert(three == sylvan_ithvar(0));
+
+    return 0;
+}
+
+TASK_0(int, test_sift_down)
+{
+    // we need to delete all data so we reset sylvan
+    _sylvan_quit();
+    _sylvan_start();
+
+    MTBDD zero = sylvan_ithvar(0);
+    MTBDD one = sylvan_ithvar(1);
+    MTBDD two = sylvan_ithvar(2);
+    MTBDD three = sylvan_ithvar(3);
+
+    // we need to make relation between the variables otherwise the lower bounds will make sifting down skip the variables swaps
+    MTBDD bdd = sylvan_and(sylvan_and(sylvan_and(zero, one), two), three);
+    mtbdd_protect(&bdd);
+
+    /* sift var 0 down from level 0 to level 3 */
+    test_assert(sylvan_level_to_order(0) == 0);
+    test_assert(sylvan_level_to_order(1) == 1);
+    test_assert(sylvan_level_to_order(2) == 2);
+    test_assert(sylvan_level_to_order(3) == 3);
+
+    test_assert(sylvan_order_to_level(0) == 0);
+    test_assert(sylvan_order_to_level(1) == 1);
+    test_assert(sylvan_order_to_level(2) == 2);
+    test_assert(sylvan_order_to_level(3) == 3);
+
+    test_assert(zero == sylvan_ithvar(0));
+    test_assert(one == sylvan_ithvar(1));
+    test_assert(two == sylvan_ithvar(2));
+    test_assert(three == sylvan_ithvar(3));
+
+    sifting_state_t state;
+    state.low = 0;
+    state.high = 3;
+
+    state.size = 0;
+    state.pos = 0;
+
+    state.best_size = 770;
+    state.best_pos = 3;
+
+    sylvan_pre_reorder(SYLVAN_REORDER_BOUNDED_SIFT);
+
+    // (0), 1, 2, 3
+    test_assert(sylvan_siftdown(&state) == SYLVAN_REORDER_SUCCESS);
+    // 1, 2, 3, (0)
+
+    sylvan_post_reorder();
+
+    test_assert(sylvan_level_to_order(0) == 1);
+    test_assert(sylvan_level_to_order(1) == 2);
+    test_assert(sylvan_level_to_order(2) == 3);
+    test_assert(sylvan_level_to_order(3) == 0);
+
+    test_assert(sylvan_order_to_level(1) == 0);
+    test_assert(sylvan_order_to_level(2) == 1);
+    test_assert(sylvan_order_to_level(3) == 2);
+    test_assert(sylvan_order_to_level(0) == 3);
+    mtbdd_unprotect(&bdd); // bdd is a stack local: do not leave its address registered after returning
+    return 0;
+}
+
+TASK_0(int, test_sift_up)
+{
+    // we need to delete all data so we reset sylvan
+    _sylvan_quit();
+    _sylvan_start();
+
+    MTBDD zero = sylvan_ithvar(0);
+    MTBDD one = sylvan_ithvar(1);
+    MTBDD two = sylvan_ithvar(2);
+    MTBDD three = sylvan_ithvar(3);
+
+    // we need to make relation between the variables otherwise the lower bounds will make sifting skip the variables swaps
+    MTBDD bdd = sylvan_and(sylvan_and(sylvan_and(zero, one), two), three);
+    mtbdd_protect(&bdd);
+
+    /* sift var 1 up from level 1 to level 0 */
+    test_assert(zero == sylvan_ithvar(0));
+    test_assert(one == sylvan_ithvar(1));
+    test_assert(two == sylvan_ithvar(2));
+    test_assert(three == sylvan_ithvar(3));
+
+    sifting_state_t state;
+    state.low = 0;
+    state.high = 1;
+
+    state.size = 90;
+    state.best_size = 0;
+
+    state.pos = 1;
+    state.best_pos = 0;
+
+    sylvan_pre_reorder(SYLVAN_REORDER_BOUNDED_SIFT);
+
+    // 0, (1), 2, 3
+    test_assert(sylvan_siftup(&state) == SYLVAN_REORDER_SUCCESS);
+    // (1), 0, 2, 3
+
+    sylvan_post_reorder();
+
+    test_assert(sylvan_level_to_order(0) == 1);
+    test_assert(sylvan_level_to_order(1) == 0);
+    test_assert(sylvan_level_to_order(2) == 2);
+    test_assert(sylvan_level_to_order(3) == 3);
+
+    test_assert(sylvan_order_to_level(1) == 0);
+    test_assert(sylvan_order_to_level(0) == 1);
+    test_assert(sylvan_order_to_level(2) == 2);
+    test_assert(sylvan_order_to_level(3) == 3);
+    mtbdd_unprotect(&bdd); // bdd is a stack local: do not leave its address registered after returning
+    return 0;
+}
+
+TASK_0(int, test_sift_back)
+{
+    // we need to delete all data so we reset sylvan
+    _sylvan_quit();
+    _sylvan_start();
+
+    MTBDD zero = sylvan_ithvar(0);
+    MTBDD one = sylvan_ithvar(1);
+    MTBDD two = sylvan_ithvar(2);
+    MTBDD three = sylvan_ithvar(3);
+
+    /* sift var 3 back from level 3 to its best position, level 0 */
+    test_assert(zero == sylvan_ithvar(0));
+    test_assert(one == sylvan_ithvar(1));
+    test_assert(two == sylvan_ithvar(2));
+    test_assert(three == sylvan_ithvar(3));
+
+    sifting_state_t state;
+    state.low = 0;
+    state.high = 3;
+
+    state.size = 999;
+    state.pos = 3;
+
+    state.best_size = 1;
+    state.best_pos = 0;
+
+    sylvan_pre_reorder(SYLVAN_REORDER_BOUNDED_SIFT);
+
+    // 0, 1, 2, (3)
+    test_assert(sylvan_siftback(&state) == SYLVAN_REORDER_SUCCESS);
+    // (3), 0, 1, 2
+
+    sylvan_post_reorder();
+
+    test_assert(sylvan_level_to_order(0) == 3);
+    test_assert(sylvan_level_to_order(1) == 0);
+    test_assert(sylvan_level_to_order(2) == 1);
+    test_assert(sylvan_level_to_order(3) == 2);
+
+    test_assert(sylvan_order_to_level(3) == 0);
+    test_assert(sylvan_order_to_level(0) == 1);
+    test_assert(sylvan_order_to_level(1) == 2);
+    test_assert(sylvan_order_to_level(2) == 3);
+
+    test_assert(zero == sylvan_ithvar(1));
+    test_assert(one == sylvan_ithvar(2));
+    test_assert(two == sylvan_ithvar(3));
+    test_assert(three == sylvan_ithvar(0));
+
+    state.size = 999;
+    state.pos = 0;
+
+    state.best_size = 1;
+    state.best_pos = 4;
+
+    sylvan_pre_reorder(SYLVAN_REORDER_BOUNDED_SIFT);
+
+    // (3), 0, 1, 2
+    test_assert(sylvan_siftback(&state) == SYLVAN_REORDER_SUCCESS);
+    // 0, 1, 2, (3)
+
+    sylvan_post_reorder();
+
+    test_assert(zero == sylvan_ithvar(0));
+    test_assert(one == sylvan_ithvar(1));
+    test_assert(two == sylvan_ithvar(2));
+    test_assert(three == sylvan_ithvar(3));
+
+    return 0;
+}
+
+TASK_0(int, test_reorder_perm)
+{
+    // we need to delete all data so we reset sylvan
+    _sylvan_quit();
+    _sylvan_start();
+
+    MTBDD zero = sylvan_ithvar(0);
+    MTBDD one = sylvan_ithvar(1);
+    MTBDD two = sylvan_ithvar(2);
+    MTBDD three = sylvan_ithvar(3);
+
+    /* reorder the variables according to the variable permutation*/
+    test_assert(zero == sylvan_ithvar(0));
+    test_assert(one == sylvan_ithvar(1));
+    test_assert(two == sylvan_ithvar(2));
+    test_assert(three == sylvan_ithvar(3));
+
+    uint32_t perm[4] = {3, 0, 2, 1};
+
+    sylvan_pre_reorder(SYLVAN_REORDER_BOUNDED_SIFT);
+
+    test_assert(sylvan_reorder_perm(perm) == SYLVAN_REORDER_SUCCESS);
+
+    sylvan_post_reorder();
+
+    test_assert(sylvan_level_to_order(0) == perm[0]);
+    test_assert(sylvan_level_to_order(1) == perm[1]);
+    test_assert(sylvan_level_to_order(2) == perm[2]);
+    test_assert(sylvan_level_to_order(3) == perm[3]);
+
+    test_assert(sylvan_order_to_level(perm[0]) == 0);
+    test_assert(sylvan_order_to_level(perm[1]) == 1);
+    test_assert(sylvan_order_to_level(perm[2]) == 2);
+    test_assert(sylvan_order_to_level(perm[3]) == 3);
+
+    test_assert(zero == sylvan_ithvar(1));
+    test_assert(one == sylvan_ithvar(3));
+    test_assert(two == sylvan_ithvar(2));
+    test_assert(three == sylvan_ithvar(0));
+
+    return 0;
+}
+
+TASK_0(int, test_reorder)
+{
+    // we need to delete all data so we reset sylvan
+    _sylvan_quit();
+    _sylvan_start();
+
+    BDD bdd = create_example_bdd(0);
+    sylvan_protect(&bdd);
+
+    size_t not_optimal_order_size = sylvan_nodecount(bdd);
+
+    sylvan_reduce_heap(SYLVAN_REORDER_SIFT);
+
+    size_t not_optimal_order_reordered_size = sylvan_nodecount(bdd);
+
+    test_assert(not_optimal_order_reordered_size < not_optimal_order_size);
+
+    uint32_t perm[6] = {0, 1, 2, 3, 4, 5};
+    int identity = 1;
+    // check if the new order is identity with the old order
+    for (size_t i = 0; i < reorder_db->levels.count; i++) {
+        if (sylvan_order_to_level(i) != perm[i]) {
+            identity = 0;
+            break;
+        }
+    }
+
+// if we gave it not optimal ordering then the new ordering should not be identity
+    test_assert(identity == 0);
+
+    test_assert(sylvan_reorder_perm(perm) == SYLVAN_REORDER_SUCCESS);
+
+    size_t not_optimal_size_again = sylvan_nodecount(bdd);
+    test_assert(not_optimal_order_size == not_optimal_size_again);
+
+    for (size_t i = 0; i < reorder_db->levels.count; i++) {
+        test_assert(sylvan_order_to_level(i) == perm[i]);
+    }
+
+    sylvan_unprotect(&bdd);
+
+    return 0;
+}
+
+TASK_0(int, test_map_reorder)
+{
+    // we need to delete all data so we reset sylvan
+    _sylvan_quit();
+    _sylvan_start();
+
+    BDDMAP map = create_example_map(0);
+    sylvan_protect(&map);
+
+    size_t size_before = sylvan_nodecount(map);
+    sylvan_reduce_heap(SYLVAN_REORDER_SIFT);
+    size_t size_after = sylvan_nodecount(map);
+
+    test_assert(size_after < size_before);
+    sylvan_unprotect(&map);
+
+    return 0;
+}
+
+TASK_0(int, test_interact)
+{
+    // we need to delete all data so we reset sylvan
+    _sylvan_quit();
+    _sylvan_start();
+
+    MTBDD bdd2 = create_example_bdd(0);
+    sylvan_ref(bdd2);
+
+    BDD bdd1 = sylvan_or(sylvan_ithvar(6), sylvan_ithvar(7));
+    sylvan_ref(bdd1);
+
+    sylvan_pre_reorder(SYLVAN_REORDER_BOUNDED_SIFT);
+
+    assert(interact_test(&reorder_db->matrix, 6, 7));
+    assert(interact_test(&reorder_db->matrix, 7, 6));
+
+    for (size_t i = 0; i < reorder_db->levels.count; ++i) {
+        for (size_t j = i + 1; j < reorder_db->levels.count - 2; ++j) {
+            // test interaction of variables belonging to bdd2
+            assert(interact_test(&reorder_db->matrix, i, j));
+            assert(interact_test(&reorder_db->matrix, j, i));
+            // test interaction of variables not belonging to bdd2
+            assert(!interact_test(&reorder_db->matrix, 6, j));
+            assert(!interact_test(&reorder_db->matrix, 6, i));
+            assert(!interact_test(&reorder_db->matrix, 7, j));
+            assert(!interact_test(&reorder_db->matrix, 7, i));
+        }
+    }
+
+    sylvan_post_reorder();
+    interact_deinit(&reorder_db->matrix);
+
+    sylvan_deref(bdd1);
+    sylvan_deref(bdd2);
+    return 0;
+}
+
+TASK_0(int, test_ref_nodes)
+{
+    // we need to delete all data so we reset sylvan
+    _sylvan_quit();
+    _sylvan_start();
+
+    MTBDD bdd = create_example_bdd(1);
+    sylvan_ref(bdd);
+
+    MTBDD zero = bdd;
+    MTBDD one = mtbdd_gethigh(zero);
+    MTBDD two = mtbdd_getlow(zero);
+    MTBDD three = mtbdd_gethigh(two);
+    MTBDD four = mtbdd_getlow(two);
+    MTBDD five = mtbdd_gethigh(four);
+
+    sylvan_pre_reorder(SYLVAN_REORDER_BOUNDED_SIFT);
+
+    test_assert(0 == mrc_ref_nodes_get(&reorder_db->mrc, zero));
+    test_assert(1 == mrc_ref_nodes_get(&reorder_db->mrc, one));
+    test_assert(2 == mrc_ref_nodes_get(&reorder_db->mrc, two));
+    test_assert(1 == mrc_ref_nodes_get(&reorder_db->mrc, three));
+    test_assert(2 == mrc_ref_nodes_get(&reorder_db->mrc, four));
+    test_assert(1 == mrc_ref_nodes_get(&reorder_db->mrc, five));
+
+    sylvan_post_reorder();
+
+    sylvan_deref(bdd);
+
+    return 0;
+}
+
+TASK_1(int, runtests, size_t, ntests)
+{
+    printf("testing varswap...\n");
+    for (size_t j = 0; j < ntests; j++) if (RUN(test_varswap)) return 1;
+    printf("testing varswap_down...\n");
+    for (size_t j = 0; j < ntests; j++) if (RUN(test_varswap_down)) return 1;
+    printf("testing varswap_up...\n");
+    for (size_t j = 0; j < ntests; j++) if (RUN(test_varswap_up)) return 1;
+    printf("testing sift_down...\n");
+    for (size_t j = 0; j < ntests; j++) if (RUN(test_sift_down)) return 1;
+    printf("testing sift_up...\n");
+    for (size_t j = 0; j < ntests; j++) if (RUN(test_sift_up)) return 1;
+    printf("testing sift_back...\n");
+    for (size_t j = 0; j < ntests; j++) if (RUN(test_sift_back)) return 1;
+    printf("testing reorder_perm...\n");
+    for (size_t j = 0; j < ntests; j++) if (RUN(test_reorder_perm)) return 1;
+    printf("testing reorder...\n");
+    for (size_t j = 0; j < ntests; j++) if (RUN(test_reorder)) return 1;
+    printf("testing map_reorder...\n");
+    for (size_t j = 0; j < ntests; j++) if (RUN(test_map_reorder)) return 1;
+    printf("testing interact...\n");
+    for (size_t j = 0; j < ntests; j++) if (RUN(test_interact)) return 1;
+    printf("testing ref_nodes...\n");
+    for (size_t j = 0; j < ntests; j++) if (RUN(test_ref_nodes)) return 1;
+    return 0;
+}
+
+static int terminate_reordering = 0;
+
+VOID_TASK_0(reordering_start)
+{
+#ifndef NDEBUG
+    size_t size = llmsset_count_marked(nodes);
+    printf("RE: start: %zu size\n", size);
+#endif
+}
+
+VOID_TASK_0(reordering_progress)
+{
+#ifndef NDEBUG
+    size_t size = llmsset_count_marked(nodes);
+    printf("RE: progress: %zu size\n", size);
+#endif
+}
+
+VOID_TASK_0(reordering_end)
+{
+#ifndef NDEBUG
+    size_t size = llmsset_count_marked(nodes);
+    printf("RE: end: %zu size\n", size);
+#endif
+}
+
+int should_reordering_terminate()
+{
+    return terminate_reordering;
+}
+
+void _sylvan_start()
+{
+    sylvan_set_limits(1LL << 23, 1, 0);
+    sylvan_init_package();
+    sylvan_init_mtbdd();
+    sylvan_init_reorder();
+    sylvan_gc_enable();
+    sylvan_set_reorder_print(false);
+    sylvan_set_reorder_nodes_threshold(1); // keep it 1, otherwise we skip levels which will fail the test expectations
+}
+
+void _sylvan_quit()
+{
+    sylvan_quit();
+}
+
+int main()
+{
+    setlocale(LC_NUMERIC, "en_US.utf-8");
+    t_start = wctime();
+
+    lace_start(1, 0);
+
+    _sylvan_start();
+
+    sylvan_re_hook_prere(TASK(reordering_start));
+    sylvan_re_hook_postre(TASK(reordering_end));
+    sylvan_re_hook_progre(TASK(reordering_progress));
+    sylvan_re_hook_termre(should_reordering_terminate);
+
+    size_t ntests = 1;
+
+    int res = RUN(runtests, ntests);
+
+    sylvan_stats_report(stdout);
+
+    _sylvan_quit();
+    lace_stop();
+
+    return res;
+}