From 105f45449b7fd6987bae017940abab4c0a8121f7 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Wed, 3 Jun 2026 08:47:33 -0700 Subject: [PATCH] Add Gremlin traversal extension --- CMakeLists.txt | 1 + extension_config.cmake | 2 +- gremlin/CMakeLists.txt | 9 + gremlin/src/function/CMakeLists.txt | 7 + gremlin/src/function/gremlin_query.cpp | 227 +++++++++++++++++++ gremlin/src/include/function/gremlin_query.h | 15 ++ gremlin/src/include/main/gremlin_extension.h | 17 ++ gremlin/src/main/CMakeLists.txt | 7 + gremlin/src/main/gremlin_extension.cpp | 35 +++ gremlin/test/basic.test | 25 ++ 10 files changed, 344 insertions(+), 1 deletion(-) create mode 100644 gremlin/CMakeLists.txt create mode 100644 gremlin/src/function/CMakeLists.txt create mode 100644 gremlin/src/function/gremlin_query.cpp create mode 100644 gremlin/src/include/function/gremlin_query.h create mode 100644 gremlin/src/include/main/gremlin_extension.h create mode 100644 gremlin/src/main/CMakeLists.txt create mode 100644 gremlin/src/main/gremlin_extension.cpp create mode 100644 gremlin/test/basic.test diff --git a/CMakeLists.txt b/CMakeLists.txt index 43e853e..8268a29 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,7 @@ add_extension_if_enabled_and_skip_32bit("azure") add_extension_if_enabled_and_skip_32bit("unity_catalog") add_extension_if_enabled("json") add_extension_if_enabled("fts") +add_extension_if_enabled("gremlin") add_extension_if_enabled("vector") add_extension_if_enabled("llm") add_extension_if_enabled("httpfs") diff --git a/extension_config.cmake b/extension_config.cmake index b182655..a52dd00 100644 --- a/extension_config.cmake +++ b/extension_config.cmake @@ -1,4 +1,4 @@ -set(EXTENSION_LIST adbc azure delta duckdb fts httpfs iceberg json llm postgres sqlite unity_catalog vector neo4j algo) +set(EXTENSION_LIST adbc azure delta duckdb fts gremlin httpfs iceberg json llm postgres sqlite unity_catalog vector neo4j algo) #set(EXTENSION_STATIC_LINK_LIST fts) foreach(extension IN LISTS EXTENSION_STATIC_LINK_LIST) diff --git a/gremlin/CMakeLists.txt b/gremlin/CMakeLists.txt new file mode 100644 index 0000000..66fbfcf --- /dev/null +++ b/gremlin/CMakeLists.txt @@ -0,0 +1,9 @@ +include_directories( + ${PROJECT_SOURCE_DIR}/src/include + ${CMAKE_BINARY_DIR}/src/include + src/include) + +add_subdirectory(src/function) +add_subdirectory(src/main) + +build_extension_lib(${BUILD_STATIC_EXTENSION} "gremlin") diff --git a/gremlin/src/function/CMakeLists.txt b/gremlin/src/function/CMakeLists.txt new file mode 100644 index 0000000..59aa95b --- /dev/null +++ b/gremlin/src/function/CMakeLists.txt @@ -0,0 +1,7 @@ +add_library(lbug_gremlin_extension_function + OBJECT + gremlin_query.cpp) + +set(GREMLIN_EXTENSION_OBJECT_FILES + ${GREMLIN_EXTENSION_OBJECT_FILES} $ + PARENT_SCOPE) diff --git a/gremlin/src/function/gremlin_query.cpp b/gremlin/src/function/gremlin_query.cpp new file mode 100644 index 0000000..935f80a --- /dev/null +++ b/gremlin/src/function/gremlin_query.cpp @@ -0,0 +1,227 @@ +#include "function/gremlin_query.h" + +#include +#include + +#include "common/exception/runtime.h" +#include "function/table/bind_data.h" +#include "function/table/bind_input.h" +#include "function/table/simple_table_function.h" +#include "function/table/table_function.h" + +namespace lbug { +namespace gremlin_extension { + +using namespace lbug::common; +using namespace lbug::function; +using namespace lbug::main; + +namespace { + +struct GremlinQueryBindData final : TableFuncBindData { + std::string query; + + explicit GremlinQueryBindData(std::string query) + : TableFuncBindData{binder::expression_vector{}, 0 /* maxOffset */}, + query{std::move(query)} {} + + std::unique_ptr copy() const override { + return std::make_unique(*this); + } +}; + +struct Traversal { + std::string hasKey; + std::string hasValue; + std::vector outLabels; + std::string valuesKey; +}; + +class GremlinParser { +public: + explicit GremlinParser(std::string query) : query{std::move(query)} {} + + Traversal parse() { + consumeWhitespace(); + consumeToken("g"); + consumeToken("."); + consumeCall("V"); + consumeWhitespace(); + Traversal traversal; + while (!isAtEnd()) { + consumeToken("."); + const auto step = parseIdentifier(); + consumeWhitespace(); + consumeToken("("); + if (step == "has") { + if (!traversal.hasKey.empty()) { + throw RuntimeException{"GREMLIN supports a single has(key, value) step."}; + } + traversal.hasKey = parseString(); + consumeWhitespace(); + consumeToken(","); + traversal.hasValue = parseString(); + } else if (step == "out") { + traversal.outLabels.push_back(parseString()); + } else if (step == "values") { + traversal.valuesKey = parseString(); + consumeWhitespace(); + consumeToken(")"); + consumeWhitespace(); + if (!isAtEnd()) { + throw RuntimeException{"GREMLIN values(key) must be the final step."}; + } + validate(traversal); + return traversal; + } else { + throw RuntimeException{"GREMLIN supports only has(key, value), out(label), and " + "values(key) after g.V()."}; + } + consumeWhitespace(); + consumeToken(")"); + consumeWhitespace(); + } + validate(traversal); + return traversal; + } + +private: + bool isAtEnd() const { return pos >= query.size(); } + + void consumeWhitespace() { + while (!isAtEnd() && std::isspace(static_cast(query[pos]))) { + pos++; + } + } + + void consumeToken(const std::string& token) { + consumeWhitespace(); + if (query.substr(pos, token.size()) != token) { + throw RuntimeException{"Invalid GREMLIN traversal near '" + query.substr(pos) + "'."}; + } + pos += token.size(); + } + + void consumeCall(const std::string& name) { + consumeToken(name); + consumeToken("("); + consumeToken(")"); + } + + std::string parseIdentifier() { + consumeWhitespace(); + const auto start = pos; + while (!isAtEnd() && + (std::isalnum(static_cast(query[pos])) || query[pos] == '_')) { + pos++; + } + if (start == pos) { + throw RuntimeException{"Expected GREMLIN step name."}; + } + return query.substr(start, pos - start); + } + + std::string parseString() { + consumeWhitespace(); + if (isAtEnd() || (query[pos] != '"' && query[pos] != '\'')) { + throw RuntimeException{"Expected GREMLIN string literal."}; + } + const auto quote = query[pos++]; + std::string result; + while (!isAtEnd()) { + const auto ch = query[pos++]; + if (ch == quote) { + return result; + } + if (ch == '\\') { + if (isAtEnd()) { + throw RuntimeException{"Unterminated GREMLIN string escape."}; + } + result.push_back(query[pos++]); + } else { + result.push_back(ch); + } + } + throw RuntimeException{"Unterminated GREMLIN string literal."}; + } + + static void validate(const Traversal& traversal) { + if (traversal.hasKey.empty() || traversal.valuesKey.empty()) { + throw RuntimeException{ + "GREMLIN traversal must contain has(key, value) and final values(key) steps."}; + } + } + +private: + std::string query; + size_t pos = 0; +}; + +static std::string quoteIdentifier(const std::string& identifier) { + std::string result = "`"; + for (const auto ch : identifier) { + if (ch == '`') { + result += "``"; + } else { + result.push_back(ch); + } + } + result += "`"; + return result; +} + +static std::string quoteStringLiteral(const std::string& value) { + std::string result = "'"; + for (const auto ch : value) { + if (ch == '\'') { + result += "\\'"; + } else if (ch == '\\') { + result += "\\\\"; + } else { + result.push_back(ch); + } + } + result += "'"; + return result; +} + +static std::string translateToCypher(const std::string& gremlinQuery) { + const auto traversal = GremlinParser{gremlinQuery}.parse(); + std::ostringstream cypher; + cypher << "MATCH (v0"; + for (auto i = 0u; i < traversal.outLabels.size(); i++) { + cypher << ")-[:" << quoteIdentifier(traversal.outLabels[i]) << "]->(v" << (i + 1); + } + cypher << ") WHERE v0." << quoteIdentifier(traversal.hasKey) << " = " + << quoteStringLiteral(traversal.hasValue) << " RETURN v" << traversal.outLabels.size() + << "." << quoteIdentifier(traversal.valuesKey) << " AS " + << quoteIdentifier(traversal.valuesKey) << ";"; + return cypher.str(); +} + +static std::unique_ptr bindFunc(ClientContext* /*context*/, + const TableFuncBindInput* input) { + return std::make_unique(input->getLiteralVal(0)); +} + +static std::string rewriteQuery(ClientContext& /*context*/, const TableFuncBindData& bindData) { + return translateToCypher(bindData.constPtrCast()->query); +} + +} // namespace + +function_set GremlinQueryFunction::getFunctionSet() { + function_set functionSet; + auto func = std::make_unique(name, std::vector{LogicalTypeID::STRING}); + func->tableFunc = TableFunction::emptyTableFunc; + func->bindFunc = bindFunc; + func->initSharedStateFunc = SimpleTableFunc::initSharedState; + func->initLocalStateFunc = TableFunction::initEmptyLocalState; + func->rewriteFunc = rewriteQuery; + func->canParallelFunc = [] { return false; }; + functionSet.push_back(std::move(func)); + return functionSet; +} + +} // namespace gremlin_extension +} // namespace lbug diff --git a/gremlin/src/include/function/gremlin_query.h b/gremlin/src/include/function/gremlin_query.h new file mode 100644 index 0000000..0aee678 --- /dev/null +++ b/gremlin/src/include/function/gremlin_query.h @@ -0,0 +1,15 @@ +#pragma once + +#include "function/function.h" + +namespace lbug { +namespace gremlin_extension { + +struct GremlinQueryFunction { + static constexpr const char* name = "GREMLIN"; + + static function::function_set getFunctionSet(); +}; + +} // namespace gremlin_extension +} // namespace lbug diff --git a/gremlin/src/include/main/gremlin_extension.h b/gremlin/src/include/main/gremlin_extension.h new file mode 100644 index 0000000..e390dd8 --- /dev/null +++ b/gremlin/src/include/main/gremlin_extension.h @@ -0,0 +1,17 @@ +#pragma once + +#include "extension/extension.h" + +namespace lbug { +namespace gremlin_extension { + +class GremlinExtension final : public extension::Extension { +public: + static constexpr char EXTENSION_NAME[] = "GREMLIN"; + +public: + static void load(main::ClientContext* context); +}; + +} // namespace gremlin_extension +} // namespace lbug diff --git a/gremlin/src/main/CMakeLists.txt b/gremlin/src/main/CMakeLists.txt new file mode 100644 index 0000000..438e9e3 --- /dev/null +++ b/gremlin/src/main/CMakeLists.txt @@ -0,0 +1,7 @@ +add_library(gremlin_extension_main + OBJECT + gremlin_extension.cpp) + +set(GREMLIN_EXTENSION_OBJECT_FILES + ${GREMLIN_EXTENSION_OBJECT_FILES} $ + PARENT_SCOPE) diff --git a/gremlin/src/main/gremlin_extension.cpp b/gremlin/src/main/gremlin_extension.cpp new file mode 100644 index 0000000..38b095b --- /dev/null +++ b/gremlin/src/main/gremlin_extension.cpp @@ -0,0 +1,35 @@ +#include "main/gremlin_extension.h" + +#include "function/gremlin_query.h" +#include "main/client_context.h" +#include "main/database.h" + +namespace lbug { +namespace gremlin_extension { + +using namespace extension; + +void GremlinExtension::load(main::ClientContext* context) { + auto& db = *context->getDatabase(); + ExtensionUtils::addStandaloneTableFunc(db); +} + +} // namespace gremlin_extension +} // namespace lbug + +#if defined(BUILD_DYNAMIC_LOAD) +extern "C" { +#if defined(_WIN32) +#define INIT_EXPORT __declspec(dllexport) +#else +#define INIT_EXPORT __attribute__((visibility("default"))) +#endif +INIT_EXPORT void init(lbug::main::ClientContext* context) { + lbug::gremlin_extension::GremlinExtension::load(context); +} + +INIT_EXPORT const char* name() { + return lbug::gremlin_extension::GremlinExtension::EXTENSION_NAME; +} +} +#endif diff --git a/gremlin/test/basic.test b/gremlin/test/basic.test new file mode 100644 index 0000000..b2b4915 --- /dev/null +++ b/gremlin/test/basic.test @@ -0,0 +1,25 @@ +-DATASET CSV empty +-BUFFER_POOL_SIZE 1024000000 + +-- + +-CASE GremlinOutOutValues + +-LOAD_DYNAMIC_EXTENSION gremlin + +-STATEMENT CREATE NODE TABLE person(id INT64, name STRING, PRIMARY KEY(id)); +---- ok + +-STATEMENT CREATE REL TABLE knows(FROM person TO person); +---- ok + +-STATEMENT CREATE (:person {id: 0, name: "gremlin"}), (:person {id: 1, name: "marko"}), (:person {id: 2, name: "lop"}), (:person {id: 3, name: "josh"}); +---- ok + +-STATEMENT MATCH (a:person {id: 0}), (b:person {id: 1}), (c:person {id: 2}), (d:person {id: 3}) CREATE (a)-[:knows]->(b), (b)-[:knows]->(c), (b)-[:knows]->(d); +---- ok + +-STATEMENT CALL GREMLIN("g.V().has('name','gremlin').out('knows').out('knows').values('name')"); +---- 2 +josh +lop