Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/index/explorer.h
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,7 @@ struct GENERATE_EXPLORER_SEARCH_INDEX {
: std::string_view{reinterpret_cast<const char *>(payload.data()),
payload.size()}};
sourcemeta::one::metapack_write_text(
action.destination, payload_view, "application/jsonl",
action.destination, payload_view, "application/octet-stream",
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

metapack_write_text() appends a trailing \n (and copies into a std::string), so using it for this binary search index will mutate the payload bytes (and even an “empty” index becomes a 1-byte payload). For a binary format this can be surprising and may break any consumer expecting the payload to end exactly at the last record.

Severity: medium

Fix This in Augment

🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.

// We don't want to compress this one so we can
// quickly skim through it while streaming it
sourcemeta::one::MetapackEncoding::Identity, {},
Expand Down
13 changes: 13 additions & 0 deletions src/search/include/sourcemeta/one/search.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,19 @@ struct SearchEntry {
std::uint8_t health;
};

#pragma pack(push, 1)
// Fixed-size header at the very start of the binary search index payload.
// Packed to 1-byte alignment so the in-memory layout is exactly the
// serialized on-disk layout, with no compiler-inserted padding.
struct SearchIndexHeader {
  // Number of records stored in the index
  std::uint32_t entry_count;
  // Byte offset (from the start of the payload) at which record data
  // begins, i.e. just past this header and the per-record offset table
  std::uint32_t records_offset;
};

// Per-record header that immediately precedes the record's field bytes
// (path, then title, then description, concatenated with no separators).
// NOTE(review): lengths are 16-bit, so each field is capped at 65535
// bytes; the writer must validate or drop longer entries.
struct SearchRecordHeader {
  // Length in bytes of the record's path field
  std::uint16_t path_length;
  // Length in bytes of the record's title field
  std::uint16_t title_length;
  // Length in bytes of the record's description field
  std::uint16_t description_length;
};
#pragma pack(pop)

// Pin the wire format at compile time: any accidental change to field
// types, ordering, or packing that alters the serialized layout becomes
// a build error instead of silent on-disk corruption.
static_assert(sizeof(SearchIndexHeader) == 8,
              "SearchIndexHeader must match the 8-byte on-disk layout");
static_assert(sizeof(SearchRecordHeader) == 6,
              "SearchRecordHeader must match the 6-byte on-disk layout");

SOURCEMETA_ONE_SEARCH_EXPORT
auto make_search(std::vector<SearchEntry> &&entries)
-> std::vector<std::uint8_t>;
Expand Down
163 changes: 130 additions & 33 deletions src/search/search.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
#include <algorithm> // std::ranges::search
#include <cassert> // assert
#include <cctype> // std::tolower
#include <sstream> // std::ostringstream
#include <cstring> // std::memcpy
#include <limits> // std::numeric_limits
#include <utility> // std::move

namespace sourcemeta::one {
Expand All @@ -29,59 +30,155 @@ auto make_search(std::vector<SearchEntry> &&entries)
return left.path < right.path;
});

std::ostringstream buffer;
constexpr auto MAX_FIELD_LENGTH{
static_cast<std::size_t>(std::numeric_limits<std::uint16_t>::max())};
std::erase_if(entries, [](const SearchEntry &entry) {
return entry.path.size() > MAX_FIELD_LENGTH ||
entry.title.size() > MAX_FIELD_LENGTH ||
entry.description.size() > MAX_FIELD_LENGTH;
});

if (entries.empty()) {
return {};
}

const auto entry_count{static_cast<std::uint32_t>(entries.size())};

// Compute total payload size
std::size_t total_size{sizeof(SearchIndexHeader) +
entry_count * sizeof(std::uint32_t)};
for (const auto &entry : entries) {
auto json_entry{sourcemeta::core::JSON::make_array()};
json_entry.push_back(sourcemeta::core::JSON{entry.path});
json_entry.push_back(sourcemeta::core::JSON{entry.title});
json_entry.push_back(sourcemeta::core::JSON{entry.description});
sourcemeta::core::stringify(json_entry, buffer);
buffer << '\n';
total_size += sizeof(SearchRecordHeader) + entry.path.size() +
entry.title.size() + entry.description.size();
}

const auto result{buffer.str()};
return {result.begin(), result.end()};
std::vector<std::uint8_t> payload(total_size);
const auto records_offset{static_cast<std::uint32_t>(
sizeof(SearchIndexHeader) + entry_count * sizeof(std::uint32_t))};

// Write header
SearchIndexHeader header{.entry_count = entry_count,
.records_offset = records_offset};
std::memcpy(payload.data(), &header, sizeof(SearchIndexHeader));

// Write records and fill offset table
auto *offset_table{payload.data() + sizeof(SearchIndexHeader)};
std::size_t record_position{records_offset};
for (std::uint32_t entry_index{0}; entry_index < entry_count; ++entry_index) {
const auto &entry{entries[entry_index]};

// Write this record's offset into the table
const auto record_offset{static_cast<std::uint32_t>(record_position)};
std::memcpy(offset_table + entry_index * sizeof(std::uint32_t),
&record_offset, sizeof(std::uint32_t));

// Write record header
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The record field lengths are truncated to std::uint16_t without checking entry.*.size() <= UINT16_MAX, which will silently corrupt/truncate large paths/titles/descriptions. Because the writer still advances by the full std::string::size(), this can also make the on-disk format internally inconsistent for long fields.

Severity: medium

Fix This in Augment

🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.

const SearchRecordHeader record_header{
.path_length = static_cast<std::uint16_t>(entry.path.size()),
.title_length = static_cast<std::uint16_t>(entry.title.size()),
.description_length =
static_cast<std::uint16_t>(entry.description.size())};
std::memcpy(payload.data() + record_position, &record_header,
sizeof(SearchRecordHeader));
record_position += sizeof(SearchRecordHeader);

// Write field data
std::memcpy(payload.data() + record_position, entry.path.data(),
entry.path.size());
record_position += entry.path.size();
std::memcpy(payload.data() + record_position, entry.title.data(),
entry.title.size());
record_position += entry.title.size();
std::memcpy(payload.data() + record_position, entry.description.data(),
entry.description.size());
record_position += entry.description.size();
}

assert(record_position == total_size);
return payload;
}

// Returns true when `needle` occurs anywhere inside `haystack`, comparing
// characters case-insensitively. An empty needle is reported as "not
// contained", mirroring the empty match subrange produced by a range
// search with an empty pattern.
static auto case_insensitive_contains(const std::string_view haystack,
                                      const std::string_view needle) -> bool {
  // An empty pattern yields an empty match, which counts as "not found"
  if (needle.empty()) {
    return false;
  }

  const auto characters_match = [](const char left, const char right) {
    return std::tolower(static_cast<unsigned char>(left)) ==
           std::tolower(static_cast<unsigned char>(right));
  };

  const auto position{std::search(haystack.cbegin(), haystack.cend(),
                                  needle.cbegin(), needle.cend(),
                                  characters_match)};
  return position != haystack.cend();
}

auto search(const std::uint8_t *payload, const std::size_t payload_size,
const std::string_view query) -> sourcemeta::core::JSON {
auto result{sourcemeta::core::JSON::make_array()};
if (payload_size == 0) {
if (payload == nullptr || payload_size < sizeof(SearchIndexHeader)) {
return result;
}
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

payload is raw bytes from a file, but it’s being parsed via reinterpret_cast<...*>(payload) and dereferenced (header, offset_table, record_header), which relies on type-punning/object-lifetime assumptions and can be undefined behavior under optimization. Consider decoding these fixed-size structs/integers via std::memcpy into local trivially-copyable values instead.

Severity: medium

Other Locations
  • src/search/search.cc:125
  • src/search/search.cc:136

Fix This in Augment

🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.


assert(payload != nullptr);
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
const std::string_view data{reinterpret_cast<const char *>(payload),
payload_size};

std::size_t line_start{0};
while (line_start < data.size()) {
auto line_end{data.find('\n', line_start)};
if (line_end == std::string_view::npos) {
line_end = data.size();
const auto *header{reinterpret_cast<const SearchIndexHeader *>(payload)};

if (header->entry_count == 0) {
return result;
}

const auto offset_table_end{sizeof(SearchIndexHeader) +
static_cast<std::size_t>(header->entry_count) *
sizeof(std::uint32_t)};
if (offset_table_end > payload_size) {
return result;
}

// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
const auto *offset_table{reinterpret_cast<const std::uint32_t *>(
payload + sizeof(SearchIndexHeader))};

for (std::uint32_t entry_index{0}; entry_index < header->entry_count;
++entry_index) {
const auto record_offset{offset_table[entry_index]};
if (record_offset + sizeof(SearchRecordHeader) > payload_size) {
break;
}

const auto line{data.substr(line_start, line_end - line_start)};
line_start = line_end + 1;
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
const auto *record_header{
reinterpret_cast<const SearchRecordHeader *>(payload + record_offset)};

if (line.empty()) {
continue;
const auto field_data_offset{record_offset + sizeof(SearchRecordHeader)};
const auto total_field_length{
static_cast<std::size_t>(record_header->path_length) +
record_header->title_length + record_header->description_length};
if (field_data_offset + total_field_length > payload_size) {
break;
}

if (std::ranges::search(line, query, [](const auto left, const auto right) {
return std::tolower(static_cast<unsigned char>(left)) ==
std::tolower(static_cast<unsigned char>(right));
}).empty()) {
const auto *field_data{payload + field_data_offset};

// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
const std::string_view path{reinterpret_cast<const char *>(field_data),
record_header->path_length};
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
const std::string_view title{
reinterpret_cast<const char *>(field_data + record_header->path_length),
record_header->title_length};
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
const std::string_view description{
reinterpret_cast<const char *>(field_data + record_header->path_length +
record_header->title_length),
record_header->description_length};

if (!case_insensitive_contains(path, query) &&
!case_insensitive_contains(title, query) &&
!case_insensitive_contains(description, query)) {
continue;
}

auto entry{sourcemeta::core::JSON::make_object()};
const std::string line_string{line};
auto line_json{sourcemeta::core::parse_json(line_string)};
entry.assign("path", std::move(line_json.at(0)));
entry.assign("title", std::move(line_json.at(1)));
entry.assign("description", std::move(line_json.at(2)));
entry.assign("path", sourcemeta::core::JSON{std::string{path}});
entry.assign("title", sourcemeta::core::JSON{std::string{title}});
entry.assign("description",
sourcemeta::core::JSON{std::string{description}});
result.push_back(std::move(entry));

constexpr auto MAXIMUM_SEARCH_COUNT{10};
Expand Down
24 changes: 12 additions & 12 deletions test/cli/index/common/search-index-nested-rebuild.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,17 +53,17 @@ EOF

SEARCH="$TMP/output/explorer/%/search.metapack"

extract_search_entries() {
strings "$1" | grep '^\[' | LC_ALL=C sort
extract_search_paths() {
strings "$1" | grep '^/' | LC_ALL=C sort
}

# Run 1: full build with two schemas in separate directories
"$1" --skip-banner "$TMP/one.json" "$TMP/output" --concurrency 1 > /dev/null 2>&1

extract_search_entries "$SEARCH" > "$TMP/search_actual.txt"
extract_search_paths "$SEARCH" > "$TMP/search_actual.txt"
cat << 'EOF' > "$TMP/search_expected.txt"
["/left/schemas/a","",""]
["/right/schemas/b","",""]
/left/schemas/a
/right/schemas/b
EOF
diff "$TMP/search_actual.txt" "$TMP/search_expected.txt"

Expand All @@ -78,10 +78,10 @@ EOF

"$1" --skip-banner "$TMP/one.json" "$TMP/output" --concurrency 1 > /dev/null 2>&1

extract_search_entries "$SEARCH" > "$TMP/search_actual.txt"
extract_search_paths "$SEARCH" > "$TMP/search_actual.txt"
cat << 'EOF' > "$TMP/search_expected.txt"
["/left/schemas/a","",""]
["/right/schemas/b","",""]
/left/schemas/a
/right/schemas/b
EOF
diff "$TMP/search_actual.txt" "$TMP/search_expected.txt"

Expand All @@ -95,10 +95,10 @@ EOF

"$1" --skip-banner "$TMP/one.json" "$TMP/output" --concurrency 1 > /dev/null 2>&1

extract_search_entries "$SEARCH" > "$TMP/search_actual.txt"
extract_search_paths "$SEARCH" > "$TMP/search_actual.txt"
cat << 'EOF' > "$TMP/search_expected.txt"
["/left/schemas/a","",""]
["/right/schemas/b","",""]
["/right/schemas/c","",""]
/left/schemas/a
/right/schemas/b
/right/schemas/c
EOF
diff "$TMP/search_actual.txt" "$TMP/search_expected.txt"
2 changes: 1 addition & 1 deletion test/unit/search/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
sourcemeta_googletest(NAMESPACE sourcemeta PROJECT one NAME search
SOURCES search_test.cc)
SOURCES search_build_test.cc search_query_test.cc)

target_link_libraries(sourcemeta_one_search_unit
PRIVATE sourcemeta::one::search)
Loading
Loading