Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ add_library(core
src/core.cpp
src/query.cpp
src/query_execution.cpp
src/join.cpp
src/storage.cpp
src/metadata.cpp
src/file_utils.cpp
Expand Down
172 changes: 172 additions & 0 deletions include/join.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
#ifndef JOIN_HPP
#define JOIN_HPP

#include <llvm/ADT/DenseSet.h>

#include <memory>

#include "query.hpp"

namespace tundradb {

/**
 * @brief Read-only inputs for a single join ID computation.
 *
 * Captures the state accumulated during edge traversal so that a
 * JoinStrategy can decide which target (and source) IDs survive.
 *
 * Every set is held by const reference, so this struct owns nothing: the
 * referenced sets must outlive the JoinInput and any compute() call that
 * reads it. The reference members also make the struct non-assignable.
 */
struct JoinInput {
// All node IDs currently in the source schema within the query state.
const llvm::DenseSet<int64_t>& source_ids;

// All node IDs that exist in the target table (full scan of target schema).
const llvm::DenseSet<int64_t>& all_target_ids;

// Source nodes that had at least one matching edge.
const llvm::DenseSet<int64_t>& matched_source_ids;

// Target nodes that were reached via matching edges.
const llvm::DenseSet<int64_t>& matched_target_ids;

// Target IDs already accumulated from a previous traversal that shares
// the same target alias (e.g. multi-pattern queries). Empty on the first
// pass.
const llvm::DenseSet<int64_t>& existing_target_ids;

// Source nodes that had NO matching edge.
// NOTE(review): presumably source_ids minus matched_source_ids; the
// relationship is maintained by the caller, not enforced here.
const llvm::DenseSet<int64_t>& unmatched_source_ids;

// Whether source and target resolve to the same concrete schema.
// No default initializer: callers are expected to set it explicitly.
bool is_self_join;
};

/**
 * @brief Result of a join ID computation.
 *
 * Unlike JoinInput, the sets here are owned by value, so a JoinOutput can be
 * returned from JoinStrategy::compute() and kept independently of the
 * inputs it was derived from.
 */
struct JoinOutput {
// Final set of target node IDs to store in query_state.ids[target].
llvm::DenseSet<int64_t> target_ids;

// Source IDs that should be removed from query_state (INNER join pruning).
llvm::DenseSet<int64_t> source_ids_to_remove;

// Whether the source table needs to be rebuilt after pruning.
// Defaults to false, i.e. no rebuild is requested.
bool rebuild_source_table = false;
};

/**
 * @brief Strategy interface for computing join results.
 *
 * Each join type (INNER, LEFT, RIGHT, FULL) implements this interface
 * to determine which node IDs should be included in the query result.
 *
 * The strategy only computes IDs - it does not modify QueryState or
 * touch Arrow tables. That keeps it pure, testable, and composable.
 * Both virtual functions are const, so a strategy can be invoked through a
 * const reference or pointer.
 */
class JoinStrategy {
public:
// Virtual so that concrete strategies can be destroyed through a
// JoinStrategy pointer (e.g. the std::unique_ptr<JoinStrategy> handed out
// by JoinStrategyFactory).
virtual ~JoinStrategy() = default;

/**
 * Compute which target/source IDs survive this join.
 *
 * @param input Sets gathered during edge traversal; only read, never
 *              modified (passed by const reference).
 * @return The surviving target IDs plus any source-side pruning request.
 *         Marked [[nodiscard]] because the result is the only effect.
 */
[[nodiscard]] virtual JoinOutput compute(const JoinInput& input) const = 0;

/**
 * Human-readable name for logging / debugging.
 *
 * @return A C string naming the strategy; declared noexcept. The concrete
 *         strategies in this header return string literals.
 */
[[nodiscard]] virtual const char* name() const noexcept = 0;
};

/**
 * @brief INNER JOIN strategy.
 *
 * Only matched targets survive.
 * Unmatched sources are pruned (and the source table is rebuilt).
 *
 * When existing_target_ids is non-empty (multi-pattern), the result is
 * the intersection of existing and newly matched target IDs.
 *
 * compute() is only declared here; its definition lives outside this header.
 */
class InnerJoinStrategy final : public JoinStrategy {
public:
// See the class comment for the intended INNER semantics.
[[nodiscard]] JoinOutput compute(const JoinInput& input) const override;
// Returns the literal "INNER".
[[nodiscard]] const char* name() const noexcept override { return "INNER"; }
};

/**
 * @brief LEFT JOIN strategy.
 *
 * All source nodes are kept. Target IDs are the union of matched
 * targets and any previously accumulated targets (multi-pattern).
 *
 * compute() is only declared here; its definition lives outside this header.
 */
class LeftJoinStrategy final : public JoinStrategy {
public:
// See the class comment for the intended LEFT semantics.
[[nodiscard]] JoinOutput compute(const JoinInput& input) const override;
// Returns the literal "LEFT".
[[nodiscard]] const char* name() const noexcept override { return "LEFT"; }
};

/**
 * @brief RIGHT JOIN strategy, self-join variant.
 *
 * target_ids = all_targets − matched_sources
 *
 * For self-joins the source and target live in the same schema, so
 * we exclude matched *source* IDs to prevent a node appearing both
 * as a matched source and as an unmatched target.
 *
 * compute() is only declared here; its definition lives outside this header.
 */
class RightJoinSelfStrategy final : public JoinStrategy {
public:
// See the class comment for the intended self-join RIGHT semantics.
[[nodiscard]] JoinOutput compute(const JoinInput& input) const override;
// Returns the literal "RIGHT_SELF".
[[nodiscard]] const char* name() const noexcept override {
return "RIGHT_SELF";
}
};

/**
 * @brief RIGHT JOIN strategy, cross-schema variant.
 *
 * target_ids = matched_targets ∪ (all_targets − matched_targets)
 *            = all_targets (but computed in two steps so logging is clear)
 *
 * The second equality holds only when matched_targets is a subset of
 * all_targets; in general the union equals all_targets ∪ matched_targets.
 * NOTE(review): presumably guaranteed because all_target_ids is described
 * as a full scan of the target schema - confirm against the caller.
 *
 * For cross-schema joins, IDs live in separate namespaces, so we compare
 * within the target schema only.
 *
 * compute() is only declared here; its definition lives outside this header.
 */
class RightJoinCrossSchemaStrategy final : public JoinStrategy {
public:
// See the class comment for the intended cross-schema RIGHT semantics.
[[nodiscard]] JoinOutput compute(const JoinInput& input) const override;
// Returns the literal "RIGHT_CROSS".
[[nodiscard]] const char* name() const noexcept override {
return "RIGHT_CROSS";
}
};

/**
 * @brief FULL OUTER JOIN strategy.
 *
 * Combines the RIGHT logic (all targets survive) with the LEFT logic
 * (all sources survive). Delegates the target-side computation to an
 * inner RIGHT strategy (self or cross-schema).
 *
 * Holding a std::unique_ptr member makes this class move-only.
 */
class FullJoinStrategy final : public JoinStrategy {
public:
// Takes ownership of the strategy used for the target side. `explicit`
// prevents an implicit conversion from a bare unique_ptr.
// NOTE(review): whether a null right_strategy is tolerated is decided by
// the out-of-line definition, which is not visible in this header.
explicit FullJoinStrategy(std::unique_ptr<JoinStrategy> right_strategy);

// See the class comment for the intended FULL semantics.
[[nodiscard]] JoinOutput compute(const JoinInput& input) const override;
// Returns the literal "FULL".
[[nodiscard]] const char* name() const noexcept override { return "FULL"; }

private:
// Owned strategy that computes the target-side IDs.
std::unique_ptr<JoinStrategy> right_strategy_;
};

/**
 * @brief Creates the appropriate JoinStrategy for a given TraverseType
 * and join context (self-join vs. cross-schema).
 *
 * Exposes a single static function, so the class never needs to be
 * instantiated.
 */
class JoinStrategyFactory {
public:
/**
 * @param type         Traversal/join kind requested by the query.
 * @param is_self_join Whether source and target resolve to the same schema.
 * @return An owning pointer to the selected strategy.
 *
 * NOTE(review): the mapping from TraverseType to concrete strategy, and the
 * behaviour for unsupported values, is defined outside this header.
 */
static std::unique_ptr<JoinStrategy> create(TraverseType type,
bool is_self_join);
};

} // namespace tundradb

#endif // JOIN_HPP
25 changes: 25 additions & 0 deletions include/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,31 @@
#include "types.hpp"

namespace tundradb {

/**
 * @brief Stores the intersection of two sets in @p out.
 *
 * Iterates the smaller input and probes the larger one, so the work is
 * proportional to the smaller set. When both inputs have the same size,
 * @p b is iterated and @p a is probed.
 *
 * @p SetA and @p SetB may be different set types. The two orderings are
 * dispatched through a generic lambda instead of a conditional expression,
 * because `cond ? a : b` requires both operands to share a common type and
 * would not compile (or would bind to a converted temporary) otherwise.
 *
 * Type requirements: the inputs provide size(), range iteration and
 * contains(); @p out provides clear(), reserve() and insert().
 *
 * @param a   First input set (read only).
 * @param b   Second input set (read only).
 * @param out Receives the result; any previous contents are discarded.
 *            Must not be the same object as @p a or @p b, since it is
 *            cleared before the inputs are read.
 */
template <class SetA, class SetB, class OutSet>
void dense_intersection(const SetA& a, const SetB& b, OutSet& out) {
  out.clear();

  // Copies every element of `probe` that is also present in `lookup`.
  const auto collect_common = [&out](const auto& probe, const auto& lookup) {
    // The result can never be larger than the set being iterated.
    out.reserve(probe.size());
    for (const auto& x : probe) {
      if (lookup.contains(x)) {
        out.insert(x);
      }
    }
  };

  if (a.size() < b.size()) {
    collect_common(a, b);
  } else {
    collect_common(b, a);
  }
}

/**
 * @brief Stores the set difference a − b in @p out.
 *
 * Every element of @p a that is not contained in @p b is inserted into
 * @p out. Previous contents of @p out are discarded first, and space for
 * a.size() elements is reserved up front (the largest possible result).
 *
 * @param a   Set to take elements from (read only).
 * @param b   Set of elements to exclude (read only).
 * @param out Receives the result. It is cleared before the inputs are
 *            read, so it should not be the same object as @p a or @p b.
 */
template <class SetA, class SetB, class OutSet>
void dense_difference(const SetA& a, const SetB& b, OutSet& out) {
  out.clear();
  out.reserve(a.size());

  for (auto it = a.begin(), last = a.end(); it != last; ++it) {
    const auto& candidate = *it;
    if (b.contains(candidate)) {
      continue;  // excluded: also present in b
    }
    out.insert(candidate);
  }
}

static std::string generate_uuid() {
uuid_t uuid;
uuid_generate(uuid);
Expand Down
Loading