Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cpp/src/gandiva/expression_registry.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "gandiva/expression_registry.h"

#include "arrow/extension/uuid.h"
#include "gandiva/function_registry.h"
#include "gandiva/llvm_types.h"

Expand Down Expand Up @@ -166,6 +167,10 @@ static void AddArrowTypesToVector(arrow::Type::type type, DataTypeVector& vector
case arrow::Type::type::INTERVAL_DAY_TIME:
vector.push_back(arrow::day_time_interval());
break;
case arrow::Type::type::EXTENSION:
// Add UUID extension type
vector.push_back(arrow::extension::uuid());
break;
default:
// Unsupported types. test ensures that
// when one of these are added build breaks.
Expand Down
5 changes: 5 additions & 0 deletions cpp/src/gandiva/function_registry_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "gandiva/function_registry_string.h"

#include "arrow/extension/uuid.h"
#include "gandiva/function_registry_common.h"

namespace gandiva {
Expand Down Expand Up @@ -201,6 +202,10 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
utf8(), kResultNullIfNull, "castVARCHAR_decimal128_int64",
NativeFunction::kNeedsContext),

NativeFunction("castUUID", {}, DataTypeVector{utf8()}, arrow::extension::uuid(),
kResultNullIfNull, "castUUID_utf8",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),

NativeFunction("crc32", {}, DataTypeVector{utf8()}, int64(), kResultNullIfNull,
"gdv_fn_crc_32_utf8", NativeFunction::kNeedsContext),

Expand Down
4 changes: 4 additions & 0 deletions cpp/src/gandiva/gdv_function_stubs.h
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,10 @@ const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in
const char* from, int32_t from_len, const char* to,
int32_t to_len, int32_t* out_len);

// Converts a UUID string into its 16-byte binary value.
// Accepts 32 hex digits, or 36 characters in the hyphenated 8-4-4-4-12 form;
// hex digits may be upper or lower case.
// On success *out_len is 16 and the returned pointer refers to the 16 bytes.
// On failure an error message is set on the context, *out_len is 0 and an
// empty string is returned.
GANDIVA_EXPORT
const char* castUUID_utf8(int64_t context, const char* data, int32_t data_len,
int32_t* out_len);

GANDIVA_EXPORT
gdv_timestamp to_utc_timezone_timestamp(int64_t context, gdv_timestamp time_milliseconds,
const char* timezone, int32_t length);
Expand Down
90 changes: 90 additions & 0 deletions cpp/src/gandiva/gdv_string_function_stubs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -757,6 +757,84 @@ const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in
}
}

// Maps a single ASCII hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its
// value in the range 0..15. Any other character yields -1.
static inline int hex_char_to_int(char c) {
  if ('0' <= c && c <= '9') return c - '0';
  if ('a' <= c && c <= 'f') return 10 + (c - 'a');
  if ('A' <= c && c <= 'F') return 10 + (c - 'A');
  return -1;  // not a hex digit
}

// Casts a UUID string to its 16-byte binary representation.
//
// Accepted inputs (hex digits may be upper or lower case):
//   - 36 characters: "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
//   - 32 characters: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
//
// context  : execution context handle used for arena allocation and error reporting
// data     : input characters; only the first data_len bytes are read
// data_len : number of input characters, must be 32 or 36
// out_len  : set to 16 on success and to 0 on failure
//
// Returns a pointer to 16 arena-allocated bytes on success. On failure an error
// message is set on the context and an empty string is returned.
//
// NOTE(review): a definition with the same name also exists in
// precompiled/string_ops.cc; confirm that both are intended to be registered.
GANDIVA_EXPORT
const char* castUUID_utf8(int64_t context, const char* data, int32_t data_len,
                          int32_t* out_len) {
  constexpr int32_t kUuidByteWidth = 16;
  constexpr int32_t kHyphenatedLength = 36;
  constexpr int32_t kCompactLength = 32;

  // Report "no output" until the whole input has been parsed successfully.
  *out_len = 0;

  if (data_len != kHyphenatedLength && data_len != kCompactLength) {
    gdv_fn_context_set_error_msg(
        context, "Invalid UUID string length: expected 32 or 36 characters");
    return "";
  }

  const bool hyphenated = (data_len == kHyphenatedLength);
  if (hyphenated &&
      (data[8] != '-' || data[13] != '-' || data[18] != '-' || data[23] != '-')) {
    gdv_fn_context_set_error_msg(context,
                                 "Invalid UUID format: hyphens at wrong positions");
    return "";
  }

  // Allocate only once the overall shape is known to be valid so that malformed
  // input does not consume arena memory.
  char* ret =
      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, kUuidByteWidth));
  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for UUID");
    return "";
  }

  // Walk the output bytes rather than the input characters. Each byte consumes
  // exactly two input characters, so the read position can never run past
  // data_len, and a '-' anywhere other than the four separator slots is
  // rejected as an invalid hex digit instead of being silently skipped.
  int32_t pos = 0;
  for (int32_t byte_idx = 0; byte_idx < kUuidByteWidth; ++byte_idx) {
    // In the 8-4-4-4-12 layout a separator precedes bytes 4, 6, 8 and 10.
    if (hyphenated &&
        (byte_idx == 4 || byte_idx == 6 || byte_idx == 8 || byte_idx == 10)) {
      ++pos;
    }

    const int high = hex_char_to_int(data[pos]);
    const int low = hex_char_to_int(data[pos + 1]);
    if (high < 0 || low < 0) {
      gdv_fn_context_set_error_msg(context, "Invalid hex digit in UUID");
      return "";
    }

    ret[byte_idx] = static_cast<char>((high << 4) | low);
    pos += 2;
  }

  *out_len = kUuidByteWidth;
  return ret;
}

namespace gandiva {

arrow::Status ExportedStringFunctions::AddMappings(Engine* engine) const {
Expand Down Expand Up @@ -986,6 +1064,18 @@ arrow::Status ExportedStringFunctions::AddMappings(Engine* engine) const {
engine->AddGlobalMappingForFunc("translate_utf8_utf8_utf8",
types->i8_ptr_type() /*return_type*/, args,
reinterpret_cast<void*>(translate_utf8_utf8_utf8));

// castUUID_utf8
args = {
types->i64_type(), // context
types->i8_ptr_type(), // const char* data
types->i32_type(), // data_len
types->i32_ptr_type() // out_len
};

engine->AddGlobalMappingForFunc("castUUID_utf8",
types->i8_ptr_type() /*return_type*/, args,
reinterpret_cast<void*>(castUUID_utf8));
return arrow::Status::OK();
}
} // namespace gandiva
5 changes: 4 additions & 1 deletion cpp/src/gandiva/llvm_types.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

#include "gandiva/llvm_types.h"

#include "arrow/extension/uuid.h"

namespace gandiva {

// LLVM doesn't distinguish between signed and unsigned types.
Expand All @@ -42,7 +44,8 @@ LLVMTypes::LLVMTypes(llvm::LLVMContext& context) : context_(context) {
{arrow::Type::type::BINARY, i8_ptr_type()},
{arrow::Type::type::DECIMAL, i128_type()},
{arrow::Type::type::INTERVAL_MONTHS, i32_type()},
{arrow::Type::type::INTERVAL_DAY_TIME, i64_type()}};
{arrow::Type::type::INTERVAL_DAY_TIME, i64_type()},
{arrow::Type::type::EXTENSION, i8_ptr_type()}};
}

} // namespace gandiva
83 changes: 83 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3034,4 +3034,87 @@ int32_t instr_utf8(const char* string, int32_t string_len, const char* substring
}
return 0;
}


// Returns the numeric value (0..15) of one ASCII hexadecimal digit, accepting
// both lower-case and upper-case letters. Returns -1 for any other character.
FORCE_INLINE
int hex_char_to_int(char c) {
  if ('0' <= c && c <= '9') return c - '0';
  if ('a' <= c && c <= 'f') return 10 + (c - 'a');
  if ('A' <= c && c <= 'F') return 10 + (c - 'A');
  return -1;  // not a hex digit
}

// Cast VARCHAR to UUID (FixedSizeBinary(16)).
//
// Accepted inputs (hex digits may be upper or lower case):
//   - 36 characters: "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
//   - 32 characters: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
//
// context  : execution context handle used for arena allocation and error reporting
// data     : input characters; only the first data_len bytes are read
// data_len : number of input characters, must be 32 or 36
// out_len  : set to 16 on success and to 0 on failure
//
// Returns a pointer to 16 arena-allocated bytes on success. On failure an error
// message is set on the context and an empty string is returned.
FORCE_INLINE
const char* castUUID_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
                          gdv_int32* out_len) {
  constexpr gdv_int32 kUuidByteWidth = 16;
  constexpr gdv_int32 kHyphenatedLength = 36;
  constexpr gdv_int32 kCompactLength = 32;

  // Report "no output" until the whole input has been parsed successfully.
  *out_len = 0;

  if (data_len != kHyphenatedLength && data_len != kCompactLength) {
    gdv_fn_context_set_error_msg(
        context, "Invalid UUID string length: expected 32 or 36 characters");
    return "";
  }

  const bool hyphenated = (data_len == kHyphenatedLength);
  if (hyphenated &&
      (data[8] != '-' || data[13] != '-' || data[18] != '-' || data[23] != '-')) {
    gdv_fn_context_set_error_msg(context,
                                 "Invalid UUID format: hyphens at wrong positions");
    return "";
  }

  // Allocate only once the overall shape is known to be valid so that malformed
  // input does not consume arena memory.
  char* ret =
      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, kUuidByteWidth));
  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for UUID");
    return "";
  }

  // Walk the output bytes rather than the input characters. Each byte consumes
  // exactly two input characters, so the read position can never run past
  // data_len, and a '-' anywhere other than the four separator slots is
  // rejected as an invalid hex digit instead of being silently skipped.
  gdv_int32 pos = 0;
  for (gdv_int32 byte_idx = 0; byte_idx < kUuidByteWidth; ++byte_idx) {
    // In the 8-4-4-4-12 layout a separator precedes bytes 4, 6, 8 and 10.
    if (hyphenated &&
        (byte_idx == 4 || byte_idx == 6 || byte_idx == 8 || byte_idx == 10)) {
      ++pos;
    }

    const int high = hex_char_to_int(data[pos]);
    const int low = hex_char_to_int(data[pos + 1]);
    if (high < 0 || low < 0) {
      gdv_fn_context_set_error_msg(context, "Invalid hex digit in UUID");
      return "";
    }

    ret[byte_idx] = static_cast<char>((high << 4) | low);
    pos += 2;
  }

  *out_len = kUuidByteWidth;
  return ret;
}
} // extern "C"
116 changes: 116 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2704,4 +2704,120 @@ TEST(TestStringOps, TestInstr) {
result = instr_utf8(s1.c_str(), s1_len, s2.c_str(), s2_len);
EXPECT_EQ(result, 8);
}


// Exercises castUUID_utf8 on well-formed inputs (hyphenated, compact, upper and
// mixed case, boundary values) and on malformed inputs (bad length, bad hex
// digit, misplaced hyphen, empty string).
TEST(TestStringOps, TestCastUUID) {
  gandiva::ExecutionContext ctx;
  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
  gdv_int32 out_len = 0;

  // A successful cast yields 16 bytes equal to `want` and leaves no error.
  auto expect_uuid = [&](const char* text, gdv_int32 text_len,
                         const unsigned char* want) {
    const char* got = castUUID_utf8(ctx_ptr, text, text_len, &out_len);
    EXPECT_EQ(out_len, 16);
    EXPECT_FALSE(ctx.has_error());
    EXPECT_EQ(memcmp(got, want, 16), 0);
  };

  // A failed cast yields zero length and an error containing `message`; the
  // context is reset afterwards so the next case starts clean.
  auto expect_failure = [&](const char* text, gdv_int32 text_len,
                            const char* message) {
    castUUID_utf8(ctx_ptr, text, text_len, &out_len);
    EXPECT_EQ(out_len, 0);
    EXPECT_TRUE(ctx.has_error());
    EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr(message));
    ctx.Reset();
  };

  // Binary form of 550e8400-e29b-41d4-a716-446655440000.
  unsigned char expected[] = {0x55, 0x0e, 0x84, 0x00, 0xe2, 0x9b, 0x41, 0xd4,
                              0xa7, 0x16, 0x44, 0x66, 0x55, 0x44, 0x00, 0x00};

  // Hyphenated, 36 characters.
  expect_uuid("550e8400-e29b-41d4-a716-446655440000", 36, expected);
  // Compact, 32 characters.
  expect_uuid("550e8400e29b41d4a716446655440000", 32, expected);
  // Upper-case hex digits.
  expect_uuid("550E8400-E29B-41D4-A716-446655440000", 36, expected);
  // Mixed case.
  expect_uuid("550e8400-E29B-41d4-A716-446655440000", 36, expected);

  // All zeros.
  unsigned char zeros[16] = {0};
  expect_uuid("00000000-0000-0000-0000-000000000000", 36, zeros);

  // All Fs.
  unsigned char fs[16];
  memset(fs, 0xff, 16);
  expect_uuid("ffffffff-ffff-ffff-ffff-ffffffffffff", 36, fs);

  // Too short.
  expect_failure("550e8400-e29b-41d4-a716", 23, "Invalid UUID string length");
  // Too long.
  expect_failure("550e8400-e29b-41d4-a716-446655440000-extra", 42,
                 "Invalid UUID string length");
  // Non-hex character.
  expect_failure("550e8400-e29b-41d4-a716-44665544000g", 36,
                 "Invalid hex digit in UUID");
  // Hyphen in the wrong place.
  expect_failure("550e8400e-29b-41d4-a716-446655440000", 36,
                 "Invalid UUID format: hyphens at wrong positions");
  // Empty string.
  expect_failure("", 0, "Invalid UUID string length");

  // A second bit pattern, in both textual forms.
  unsigned char expected2[] = {0x12, 0x3e, 0x45, 0x67, 0xe8, 0x9b, 0x12, 0xd3,
                               0xa4, 0x56, 0x42, 0x66, 0x14, 0x17, 0x40, 0x00};
  expect_uuid("123e4567-e89b-12d3-a456-426614174000", 36, expected2);
  expect_uuid("123e4567e89b12d3a456426614174000", 32, expected2);
}
} // namespace gandiva
3 changes: 3 additions & 0 deletions cpp/src/gandiva/precompiled/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -841,4 +841,7 @@ const char* elt_int32_utf8_utf8_utf8_utf8_utf8(
int32_t instr_utf8(const char* string, int32_t string_len, const char* substring,
int32_t substring_len);

// Converts a UUID string into its 16-byte binary value.
// Accepts 32 hex digits, or 36 characters in the hyphenated 8-4-4-4-12 form;
// hex digits may be upper or lower case.
// On success *out_len is 16 and the returned pointer refers to the 16 bytes.
// On failure an error message is set on the context, *out_len is 0 and an
// empty string is returned.
const char* castUUID_utf8(int64_t context, const char* data, int32_t data_len,
int32_t* out_len);

} // extern "C"