Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,14 @@ ctest --test-dir build --output-on-failure --verbose
- [x] **Phase 2**: In-memory query execution (SeqScan, Filter, Projection)
- [x] **Phase 3**: Disk-based storage with buffer pool
- [x] **Phase 4**: BTree indexes with query planner integration
- [ ] **Phase 5**: JOIN operations
- [x] **Phase 5**: JOIN operations
- [x] Parse `INNER JOIN ... ON ...` with qualified column references
- [x] Execute joins via `NestedLoopJoin`
- [x] Add rule-based join algorithm choice (`NestedLoopJoin` vs `HashJoin`)
- [x] Support `JOIN + WHERE` (single-table pushdown + post-join filter)
- [x] Add correctness checks (ambiguous columns, swapped `ON` sides, type-mismatch safety)
- [ ] Add `EXPLAIN` command in REPL to print physical plan (`SeqScan`/`IndexScan`/`Join` path)
- [ ] Add join-condition index matching (index-aware join access path, e.g. index nested-loop opportunities)
- [x] Add `EXPLAIN` command in REPL to print physical plan (`SeqScan`/`IndexScan`/`Join` path)
- [x] Add join-condition index matching (`IndexNestedLoopJoin` when index exists on join column)
- [ ] **Phase 6**: Transactions

## Architecture
Expand Down
1 change: 1 addition & 0 deletions src/execution/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ add_library(execution
index_scan.cpp
nested_loop_join.cpp
hash_join.cpp
index_nested_loop_join.cpp
)

target_link_libraries(execution PUBLIC optimizer storage common)
55 changes: 54 additions & 1 deletion src/execution/executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,8 @@ namespace sql
return ExecuteCreateIndex(static_cast<CreateIndexStatement *>(stmt));
case StatementType::DROP_TABLE:
return ExecuteDropTable(static_cast<DropTableStatement *>(stmt));
case StatementType::EXPLAIN_STMT:
return ExecuteExplain(static_cast<ExplainStatement *>(stmt));
default:
return {false, "Unsupported statement type", {}, {}};
}
Expand Down Expand Up @@ -388,6 +390,36 @@ namespace sql
EnsureJoinContextTable(left, right);
return std::make_unique<HashJoin>(left, right, left_col, right_col, node->join_build_right);
}
case PhysicalPlanType::INDEX_NESTED_LOOP_JOIN:
{
Table *left = catalog_->GetTable(node->table_name);
Table *right = catalog_->GetTable(node->right_table_name);
if (left == nullptr || right == nullptr)
throw std::runtime_error("JOIN table not found while building operator tree");

// Resolve which ON-clause column belongs to which table (handles swapped ON order).
const auto [left_col, right_col] = ResolveJoinColumns(node, left, right);

// join_right_as_outer: true → right is outer, left is inner (has index)
// false → left is outer, right is inner (has index)
const bool right_is_outer = node->join_right_as_outer;
Table *outer_table = right_is_outer ? right : left;
Table *inner_table = right_is_outer ? left : right;
const std::string outer_col = right_is_outer ? right_col : left_col;
const std::string inner_col = right_is_outer ? left_col : right_col;
const std::string &inner_table_name = right_is_outer ? node->table_name : node->right_table_name;

BTree *inner_index = catalog_->GetIndex(inner_table_name, inner_col);
if (inner_index == nullptr)
throw std::runtime_error("Expected index not found on " + inner_table_name + "." + inner_col);

EnsureJoinContextTable(left, right);
// outer_is_left: left table is outer when right_is_outer=false
const bool outer_is_left = !right_is_outer;
return std::make_unique<IndexNestedLoopJoin>(
outer_table, inner_table, inner_index,
outer_col, inner_col, outer_is_left);
}
case PhysicalPlanType::FILTER:
{
if (node->children.empty())
Expand All @@ -408,7 +440,8 @@ namespace sql
auto child = BuildOperatorTree(node->children[0].get(), table, join_table);
Table *projection_join_table = join_table;
if (node->children[0]->type == PhysicalPlanType::NESTED_LOOP_JOIN ||
node->children[0]->type == PhysicalPlanType::HASH_JOIN)
node->children[0]->type == PhysicalPlanType::HASH_JOIN ||
node->children[0]->type == PhysicalPlanType::INDEX_NESTED_LOOP_JOIN)
{
projection_join_table = catalog_->GetTable(node->children[0]->right_table_name);
}
Expand Down Expand Up @@ -692,4 +725,24 @@ namespace sql
return {true, "Table '" + drop->table + "' dropped.", {}, {}};
}

// Handles an EXPLAIN statement: plans the wrapped SELECT and reports the
// rendered physical plan instead of running it.
//
// Cached per-query state (materialized_tables_, join_context_table_) is reset
// before planning. Only `success` and `message` of the result are populated:
// on success `message` holds the text from Optimizer::ExplainPhysicalPlan, and
// if planning or rendering throws a std::exception, `success` is false and
// `message` carries the exception's what() text.
ExecutionResult Executor::ExecuteExplain(ExplainStatement *explain)
{
    ExecutionResult outcome;
    try
    {
        materialized_tables_.clear();
        join_context_table_.reset();

        Optimizer planner;
        auto plan = planner.BuildPhysicalPlan(explain->select.get(), catalog_);
        outcome.message = planner.ExplainPhysicalPlan(plan.get());
        outcome.success = true;
    }
    catch (const std::exception &err)
    {
        outcome.message = err.what();
        outcome.success = false;
    }
    return outcome;
}

} // namespace sql
2 changes: 2 additions & 0 deletions src/execution/executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "execution/index_scan.h"
#include "execution/nested_loop_join.h"
#include "execution/hash_join.h"
#include "execution/index_nested_loop_join.h"
#include "parser/ast.h"
#include "optimizer/optimizer.h"
#include "catalog/catalog.h"
Expand Down Expand Up @@ -44,6 +45,7 @@ namespace sql
ExecutionResult ExecuteUpdate(UpdateStatement *update);
ExecutionResult ExecuteCreateIndex(CreateIndexStatement *create);
ExecutionResult ExecuteDropTable(DropTableStatement *drop);
ExecutionResult ExecuteExplain(ExplainStatement *explain);

// Evaluate an expression (reused for INSERT values, UPDATE SET, etc.)
Value EvaluateExpr(const Expression *expr, const Tuple *tuple = nullptr, Table *table = nullptr) const;
Expand Down
76 changes: 76 additions & 0 deletions src/execution/index_nested_loop_join.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#include "execution/index_nested_loop_join.h"
#include <stdexcept>

namespace sql
{

namespace
{
// Drops a leading "qualifier." prefix from a column reference.
// Everything after the FIRST '.' is returned; a name without any '.' is
// returned unchanged.
std::string StripQualifier(const std::string &name)
{
    const std::string::size_type separator = name.find('.');
    return separator == std::string::npos ? name : name.substr(separator + 1);
}
} // namespace

// Prepares the operator for iteration.
//
// Validates that both tables and the inner index were supplied, resolves the
// outer join column (qualifier stripped) to its position in the outer table,
// and rewinds all cursors.
//
// Throws std::runtime_error when a table/index pointer is null or when the
// outer column cannot be found in the outer table.
void IndexNestedLoopJoin::Open()
{
    const bool inputs_present = outer_table_ != nullptr &&
                                inner_table_ != nullptr &&
                                inner_index_ != nullptr;
    if (!inputs_present)
    {
        throw std::runtime_error("IndexNestedLoopJoin: null table or index");
    }

    // The member is assigned before the check, so a failed lookup leaves the
    // negative index stored.
    outer_col_idx_ = outer_table_->GetColumnIndex(StripQualifier(outer_col_));
    if (outer_col_idx_ < 0)
    {
        throw std::runtime_error("IndexNestedLoopJoin: unknown outer column: " + outer_col_);
    }

    // Start from the first outer row with no pending inner matches.
    inner_cursor_ = 0;
    inner_matches_.clear();
    outer_cursor_ = 0;
}

// Produces the next joined row into *tuple.
//
// For every outer row the join key is looked up in the inner table's index;
// each matching inner row yields one output tuple. Output values are always
// laid out as (left table values..., right table values...), with
// outer_is_left_ deciding which of outer/inner is the left side.
//
// Returns true when *tuple was filled, false once the outer table is exhausted.
// Throws std::runtime_error if called without a successful Open().
bool IndexNestedLoopJoin::Next(Tuple *tuple)
{
    // Open() resolves the outer column index and Close() resets it to -1.
    // Without this guard the -1 would be cast to a huge size_t below.
    if (outer_col_idx_ < 0)
        throw std::runtime_error("IndexNestedLoopJoin: Next() called before Open()");

    const auto &outer_rows = outer_table_->GetTuples();

    while (true)
    {
        // Emit the next pending inner match for the current outer row, if any.
        if (inner_cursor_ < inner_matches_.size())
        {
            const size_t inner_row_idx = inner_matches_[inner_cursor_++];
            const Tuple &inner_row = inner_table_->GetTuple(inner_row_idx);

            const Tuple &left_row = outer_is_left_ ? current_outer_ : inner_row;
            const Tuple &right_row = outer_is_left_ ? inner_row : current_outer_;

            std::vector<Value> joined;
            joined.reserve(left_row.GetValueCount() + right_row.GetValueCount());
            for (size_t i = 0; i < left_row.GetValueCount(); ++i)
                joined.push_back(left_row.GetValue(i));
            for (size_t i = 0; i < right_row.GetValueCount(); ++i)
                joined.push_back(right_row.GetValue(i));

            *tuple = Tuple(std::move(joined));
            return true;
        }

        // No matches left: advance to the next outer row, or finish.
        if (outer_cursor_ >= outer_rows.size())
            return false;

        current_outer_ = outer_rows[outer_cursor_++];
        Value probe_key = current_outer_.GetValue(static_cast<size_t>(outer_col_idx_));

        // The index lookup may throw when the probe key cannot be compared
        // with the indexed keys (e.g. an INTEGER = VARCHAR join condition).
        // Such a row simply has no join partners, so treat a failed probe as
        // "no matches" rather than aborting the whole query.
        // NOTE(review): this also hides other std::exception failures raised
        // by Search(); narrow the handler if BTree exposes a dedicated
        // type-mismatch exception.
        try
        {
            inner_matches_ = inner_index_->Search(probe_key);
        }
        catch (const std::exception &)
        {
            inner_matches_.clear();
        }
        inner_cursor_ = 0;
    }
}

// Ends iteration: drops any buffered inner matches, rewinds both cursors and
// marks the outer column index as unresolved (-1).
void IndexNestedLoopJoin::Close()
{
    outer_col_idx_ = -1;
    inner_cursor_ = 0;
    inner_matches_.clear();
    outer_cursor_ = 0;
}

} // namespace sql
52 changes: 52 additions & 0 deletions src/execution/index_nested_loop_join.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#pragma once

#include "execution/operator.h"
#include "storage/table.h"
#include "storage/btree.h"
#include <string>
#include <vector>

namespace sql
{

// Index Nested-Loop Join: for each outer row, probes inner table via BTree index.
// Output column order is always (left_table_cols..., right_table_cols...) regardless
// of which side is outer; outer_is_left controls concatenation order.
class IndexNestedLoopJoin : public Operator
{
public:
// outer_table: table iterated row by row
// inner_table: table probed via index
// inner_index: BTree on inner_table.inner_col
// outer_col: column name in outer_table used as probe key
// inner_col: indexed column name in inner_table
// outer_is_left: if true, output is (outer || inner); else (inner || outer)
IndexNestedLoopJoin(Table *outer_table, Table *inner_table,
BTree *inner_index,
std::string outer_col, std::string inner_col,
bool outer_is_left)
: outer_table_(outer_table), inner_table_(inner_table),
inner_index_(inner_index),
outer_col_(std::move(outer_col)), inner_col_(std::move(inner_col)),
outer_is_left_(outer_is_left) {}

void Open() override;
bool Next(Tuple *tuple) override;
void Close() override;

private:
Table *outer_table_;
Table *inner_table_;
BTree *inner_index_;
std::string outer_col_;
std::string inner_col_;
bool outer_is_left_;

int outer_col_idx_ = -1;
size_t outer_cursor_ = 0;
Tuple current_outer_;
std::vector<size_t> inner_matches_;
size_t inner_cursor_ = 0;
};

} // namespace sql
2 changes: 2 additions & 0 deletions src/lexer/lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,8 @@ namespace sql
{"NOT", TokenType::NOT},
{"DROP", TokenType::DROP},
{"JOIN", TokenType::JOIN},
{"INNER", TokenType::INNER},
{"EXPLAIN", TokenType::EXPLAIN},
{"TRUE", TokenType::TRUE},
{"FALSE", TokenType::FALSE}};

Expand Down
2 changes: 2 additions & 0 deletions src/lexer/token.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ namespace sql
NOT,
DROP,
JOIN,
EXPLAIN,
INNER,

Comment on lines 35 to 38

Copilot AI Mar 26, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TokenType gained EXPLAIN and INNER, but TokenToString() in src/lexer/token.cpp doesn’t include these cases. That will degrade parser/lexer error messages (they’ll show UNKNOWN for these expected tokens). Add the missing switch cases in TokenToString to keep diagnostics accurate.

Copilot uses AI. Check for mistakes.
// Identifiers and Literals
IDENTIFIER,
Expand Down
84 changes: 72 additions & 12 deletions src/optimizer/optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -386,22 +386,69 @@ namespace sql
const size_t left_count = table->GetTupleCount();
const size_t right_count = right_table->GetTupleCount();

// Rule-based join choice:
// - HASH_JOIN for larger equi-joins
// - NESTED_LOOP_JOIN for small inputs
const size_t total_rows = left_count + right_count;
const bool use_hash_join = total_rows >= 16;

auto join = std::make_unique<PhysicalPlanNode>(
use_hash_join ? PhysicalPlanType::HASH_JOIN : PhysicalPlanType::NESTED_LOOP_JOIN);
// Resolve which join column belongs to which table.
// The ON clause may have columns in either order (e.g., ON right.x = left.y),
// so we must check before looking up indexes.
const auto left_side = ResolveColumnSide(*select->join_left_column,
select->table, *select->join_table,
table, right_table);
const auto right_side = ResolveColumnSide(*select->join_right_column,
select->table, *select->join_table,
table, right_table);

// Determine the actual column name for each table side.
std::string col_on_left_table; // column belonging to select->table
std::string col_on_right_table; // column belonging to select->join_table
if (left_side == PredicateTableSide::LEFT && right_side == PredicateTableSide::RIGHT)
{
col_on_left_table = StripQualifier(*select->join_left_column);
col_on_right_table = StripQualifier(*select->join_right_column);
}
else if (left_side == PredicateTableSide::RIGHT && right_side == PredicateTableSide::LEFT)
{
col_on_left_table = StripQualifier(*select->join_right_column);
col_on_right_table = StripQualifier(*select->join_left_column);
}
// else: ambiguous or unresolved — skip INLJ

// Check if either join column has an index → prefer INDEX_NESTED_LOOP_JOIN.
std::unique_ptr<PhysicalPlanNode> join;
if (!col_on_left_table.empty() && !col_on_right_table.empty())
{
BTree *left_index = catalog->GetIndex(select->table, col_on_left_table);
BTree *right_index = catalog->GetIndex(*select->join_table, col_on_right_table);

if (right_index != nullptr)
{
// Right table has index → right is inner, left is outer.
join = std::make_unique<PhysicalPlanNode>(PhysicalPlanType::INDEX_NESTED_LOOP_JOIN);
join->join_right_as_outer = false; // left is outer
}
else if (left_index != nullptr)
{
// Left table has index → left is inner, right is outer.
join = std::make_unique<PhysicalPlanNode>(PhysicalPlanType::INDEX_NESTED_LOOP_JOIN);
join->join_right_as_outer = true; // right is outer
}
}

if (join == nullptr)
{
// Rule-based choice between HASH_JOIN and NESTED_LOOP_JOIN.
const size_t total_rows = left_count + right_count;
const bool use_hash_join = total_rows >= 16;
join = std::make_unique<PhysicalPlanNode>(
use_hash_join ? PhysicalPlanType::HASH_JOIN : PhysicalPlanType::NESTED_LOOP_JOIN);
// Iterate smaller table in outer loop for nested loop.
join->join_right_as_outer = right_count < left_count;
// Build hash table on smaller side for hash join.
join->join_build_right = right_count <= left_count;
}

join->table_name = select->table;
join->right_table_name = *select->join_table;
join->join_left_column = *select->join_left_column;
join->join_right_column = *select->join_right_column;
// Rule-based choice: iterate smaller table in outer loop for nested loop.
join->join_right_as_outer = right_count < left_count;
// Rule-based choice: build hash table on smaller side for hash join.
join->join_build_right = right_count <= left_count;
join->children.push_back(std::move(left_input));
join->children.push_back(std::move(right_input));
current = std::move(join);
Expand Down Expand Up @@ -484,6 +531,19 @@ namespace sql
<< ", build=" << (node->join_build_right ? "right" : "left")
<< ")";
break;
case PhysicalPlanType::INDEX_NESTED_LOOP_JOIN:
{
const std::string &outer = node->join_right_as_outer ? node->right_table_name : node->table_name;
const std::string &inner = node->join_right_as_outer ? node->table_name : node->right_table_name;
const std::string &inner_col = node->join_right_as_outer ? node->join_left_column : node->join_right_column;
out << pad << "IndexNestedLoopJoin(outer=" << outer
<< ", inner=" << inner
<< "(index on " << inner_col << ")"
<< ", on=" << node->join_left_column
<< " = " << node->join_right_column
<< ")";
break;
}
case PhysicalPlanType::FILTER:
out << pad << "Filter";
if (node->predicate != nullptr)
Expand Down
5 changes: 3 additions & 2 deletions src/optimizer/optimizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ namespace sql
INDEX_SCAN,
NESTED_LOOP_JOIN,
HASH_JOIN,
INDEX_NESTED_LOOP_JOIN,
FILTER,
PROJECTION
};
Expand All @@ -65,10 +66,10 @@ namespace sql
std::optional<Value> high_key;
bool high_inclusive = true;

// NESTED_LOOP_JOIN
// NESTED_LOOP_JOIN / HASH_JOIN / INDEX_NESTED_LOOP_JOIN
std::string join_left_column;
std::string join_right_column;
bool join_right_as_outer = false;
bool join_right_as_outer = false; // NLJ: right is outer loop; INLJ: right is outer (left has index)

// HASH_JOIN
bool join_build_right = true;
Expand Down
Loading
Loading