Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
170 changes: 170 additions & 0 deletions include/rfl/internal/strings/utf8_conversions.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
#ifndef RFL_INTERNAL_STRINGS_UTF8_CONVERSIONS_HPP_
#define RFL_INTERNAL_STRINGS_UTF8_CONVERSIONS_HPP_

#include <optional>
#include <string>

namespace rfl::internal::strings {

// Locale-independent conversion between UTF-8 (std::string) and std::wstring.
//
// JSON is defined to be UTF-8 (RFC 8259), so the std::string side is always
// UTF-8. wchar_t is treated as UTF-32 where it is >= 4 bytes (Linux, macOS) and
// as UTF-16 where it is 2 bytes (Windows). Unlike std::mbsrtowcs / std::wcsrtombs
// this does not depend on the process's C locale and cannot return an
// unchecked error sentinel.

namespace utf8_detail {

/// Decodes UTF-8 bytes into Unicode code points.
///
/// @param _str The bytes to decode; expected to be UTF-8.
/// @return The decoded code points, or std::nullopt if the input is not
///         well-formed UTF-8 (invalid leading byte, truncated sequence, bad
///         continuation byte, overlong encoding, surrogate code point, or a
///         value above U+10FFFF).
inline std::optional<std::u32string> decode_utf8(const std::string& _str) {
  // Smallest code point that legitimately needs 1, 2, 3 and 4 bytes. A decoded
  // value below the entry for its sequence length is an overlong encoding.
  static constexpr char32_t mins[] = {0, 0x80, 0x800, 0x10000};

  const std::size_t n = _str.size();

  std::u32string out;
  // Every code point consumes at least one input byte, so the byte count is a
  // safe upper bound on the number of code points. Reserving it up front
  // avoids repeated reallocations while decoding.
  out.reserve(n);

  std::size_t i = 0;
  while (i < n) {
    const auto c = static_cast<unsigned char>(_str[i]);
    char32_t cp = 0;
    std::size_t extra = 0;  // number of continuation bytes that must follow
    if (c < 0x80) {  // 0xxxxxxx: ASCII
      cp = c;
    } else if ((c >> 5) == 0x6) {  // 110xxxxx: two-byte sequence
      cp = c & 0x1F;
      extra = 1;
    } else if ((c >> 4) == 0xE) {  // 1110xxxx: three-byte sequence
      cp = c & 0x0F;
      extra = 2;
    } else if ((c >> 3) == 0x1E) {  // 11110xxx: four-byte sequence
      cp = c & 0x07;
      extra = 3;
    } else {
      return std::nullopt;  // invalid leading byte
    }
    if (i + extra >= n) {
      return std::nullopt;  // truncated multi-byte sequence
    }
    for (std::size_t k = 1; k <= extra; ++k) {
      const auto cc = static_cast<unsigned char>(_str[i + k]);
      if ((cc >> 6) != 0x2) {
        return std::nullopt;  // bad continuation byte (must be 10xxxxxx)
      }
      cp = (cp << 6) | (cc & 0x3F);
    }
    if (cp < mins[extra]) {
      return std::nullopt;  // overlong encoding
    }
    if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
      return std::nullopt;  // out of range or surrogate code point
    }
    out.push_back(cp);
    i += extra + 1;
  }
  return out;
}

/// Serializes a sequence of Unicode code points into UTF-8 bytes.
///
/// @param _cps The code points to encode. No validation is performed here.
/// @return The UTF-8 byte string: one to four bytes per code point.
inline std::string encode_utf8(const std::u32string& _cps) {
  // Builds a continuation byte (10xxxxxx) from the six payload bits of
  // `value` that start at bit position `shift`.
  const auto continuation = [](const char32_t value, const int shift) {
    return static_cast<char>(0x80 | ((value >> shift) & 0x3F));
  };

  std::string bytes;
  for (const char32_t code_point : _cps) {
    if (code_point >= 0x10000) {
      // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      bytes += static_cast<char>(0xF0 | (code_point >> 18));
      bytes += continuation(code_point, 12);
      bytes += continuation(code_point, 6);
      bytes += continuation(code_point, 0);
    } else if (code_point >= 0x800) {
      // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
      bytes += static_cast<char>(0xE0 | (code_point >> 12));
      bytes += continuation(code_point, 6);
      bytes += continuation(code_point, 0);
    } else if (code_point >= 0x80) {
      // Two bytes: 110xxxxx 10xxxxxx
      bytes += static_cast<char>(0xC0 | (code_point >> 6));
      bytes += continuation(code_point, 0);
    } else {
      // ASCII is passed through unchanged.
      bytes += static_cast<char>(code_point);
    }
  }
  return bytes;
}

/// Converts Unicode code points into a std::wstring.
///
/// Where wchar_t is at least four bytes wide each code point is stored
/// directly (UTF-32); otherwise the output is UTF-16 and code points at or
/// above U+10000 are written as a surrogate pair.
///
/// @param _cps The code points to convert.
/// @return The wide string in the platform's wchar_t encoding.
inline std::wstring codepoints_to_wide(const std::u32string& _cps) {
  std::wstring wide;
  for (const char32_t code_point : _cps) {
    if constexpr (sizeof(wchar_t) < 4) {
      // UTF-16: values outside the BMP are split into high + low surrogate.
      if (code_point >= 0x10000) {
        const char32_t offset = code_point - 0x10000;
        wide += static_cast<wchar_t>(0xD800 + (offset >> 10));
        wide += static_cast<wchar_t>(0xDC00 + (offset & 0x3FF));
        continue;
      }
    }
    // UTF-32, or a BMP code point in UTF-16: a single code unit suffices.
    wide += static_cast<wchar_t>(code_point);
  }
  return wide;
}

/// Converts a std::wstring into Unicode code points.
///
/// Where wchar_t is at least four bytes wide each element is taken as one
/// code point (UTF-32); otherwise the input is read as UTF-16 and surrogate
/// pairs are combined.
///
/// @param _str The wide string to decode.
/// @return The code points, or std::nullopt if the input holds a surrogate
///         that is not part of a valid pair or (UTF-32 only) a value above
///         U+10FFFF.
inline std::optional<std::u32string> wide_to_codepoints(const std::wstring& _str) {
  const auto is_high_surrogate = [](const char32_t unit) {
    return unit >= 0xD800 && unit <= 0xDBFF;
  };
  const auto is_low_surrogate = [](const char32_t unit) {
    return unit >= 0xDC00 && unit <= 0xDFFF;
  };

  std::u32string code_points;
  if constexpr (sizeof(wchar_t) < 4) {
    // UTF-16: walk the code units, consuming two at a time for surrogate pairs.
    const std::size_t length = _str.size();
    std::size_t pos = 0;
    while (pos < length) {
      const auto lead =
          static_cast<char32_t>(static_cast<char16_t>(_str[pos]));
      ++pos;
      if (is_low_surrogate(lead)) {
        return std::nullopt;  // low surrogate without a preceding high one
      }
      if (!is_high_surrogate(lead)) {
        code_points.push_back(lead);  // ordinary BMP code unit
        continue;
      }
      if (pos >= length) {
        return std::nullopt;  // high surrogate at the very end of the input
      }
      const auto trail =
          static_cast<char32_t>(static_cast<char16_t>(_str[pos]));
      if (!is_low_surrogate(trail)) {
        return std::nullopt;  // high surrogate not followed by a low one
      }
      code_points.push_back(0x10000 + ((lead - 0xD800) << 10) +
                            (trail - 0xDC00));
      ++pos;
    }
  } else {
    // UTF-32: every element must already be a valid scalar value.
    for (const wchar_t unit : _str) {
      const auto value = static_cast<char32_t>(unit);
      const bool in_range = value <= 0x10FFFF;
      if (!in_range || is_high_surrogate(value) || is_low_surrogate(value)) {
        return std::nullopt;
      }
      code_points.push_back(value);
    }
  }
  return code_points;
}

} // namespace utf8_detail

/// Turns a UTF-8-encoded std::string into a std::wstring.
///
/// The input is first decoded into code points and those are then re-encoded
/// in the wchar_t representation.
///
/// @param _str The UTF-8 input.
/// @return The wide string, or std::nullopt if decoding the input fails.
inline std::optional<std::wstring> utf8_to_wstring(const std::string& _str) {
  if (const auto decoded = utf8_detail::decode_utf8(_str); decoded.has_value()) {
    return utf8_detail::codepoints_to_wide(decoded.value());
  }
  return std::nullopt;
}

/// Turns a std::wstring into a UTF-8-encoded std::string.
///
/// The input is first decoded into code points and those are then encoded as
/// UTF-8.
///
/// @param _str The wide-string input.
/// @return The UTF-8 string, or std::nullopt if decoding the wide string fails
///         (unpaired surrogates or otherwise invalid code units).
inline std::optional<std::string> wstring_to_utf8(const std::wstring& _str) {
  if (const auto decoded = utf8_detail::wide_to_codepoints(_str);
      decoded.has_value()) {
    return utf8_detail::encode_utf8(decoded.value());
  }
  return std::nullopt;
}

} // namespace rfl::internal::strings

#endif
13 changes: 10 additions & 3 deletions include/rfl/parsing/Parser_filepath.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@ struct Parser<R, W, std::filesystem::path, ProcessorsType> {
const auto to_path =
[&](std::string&& _str) -> Result<std::filesystem::path> {
try {
return std::filesystem::path(_str);
// JSON strings are UTF-8 (RFC 8259); construct the path from UTF-8
// explicitly rather than via the locale-dependent narrow constructor.
return std::filesystem::path(std::u8string(
reinterpret_cast<const char8_t*>(_str.data()), _str.size()));
Comment on lines +32 to +33
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Using the iterator-based constructor of std::filesystem::path with char8_t* avoids the creation of a temporary std::u8string object and its associated heap allocation.

Suggested change
return std::filesystem::path(std::u8string(
reinterpret_cast<const char8_t*>(_str.data()), _str.size()));
const auto* ptr = reinterpret_cast<const char8_t*>(_str.data());
return std::filesystem::path(ptr, ptr + _str.size());

} catch (std::exception& e) {
return error(e.what());
}
Expand All @@ -46,8 +49,12 @@ struct Parser<R, W, std::filesystem::path, ProcessorsType> {
template <class P>
static void write(const W& _w, const std::filesystem::path& _p,
                  const P& _parent) {
  // Emit the path as UTF-8 (RFC 8259); _p.string() is locale-dependent and
  // throws / mangles on Windows for non-representable characters.
  const auto u8 = _p.u8string();
  // u8string() yields char8_t data under C++20; the string parser expects a
  // plain std::string, so the same bytes are reinterpreted as char.
  return Parser<R, W, std::string, ProcessorsType>::write(
      _w, std::string(reinterpret_cast<const char*>(u8.c_str()), u8.size()),
      _parent);
}

/**
Expand Down
47 changes: 16 additions & 31 deletions include/rfl/parsing/Parser_wstring.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include "../Result.hpp"
#include "../always_false.hpp"
#include "../internal/strings/utf8_conversions.hpp"
#include "Parent.hpp"
#include "Parser_base.hpp"
#include "schema/Type.hpp"
Expand Down Expand Up @@ -37,25 +38,17 @@ struct Parser<R, W, std::wstring, ProcessorsType> {
if (!inStr) {
return Result<std::wstring>(error(inStr.error()));
}
// if (auto err = inStr.error(); err.has_value()) {
// return Result<std::wstring>(err.value());
// }

std::mbstate_t state = std::mbstate_t();
auto val = inStr.value();

std::wstring outStr(val.size() * 2, L'\0');

// Explicitly set the size so we don't empty it when we truncate
outStr.resize(val.size() * 2);

auto* ptr = val.c_str();

// Add 1 for null terminator
auto len = std::mbsrtowcs(outStr.data(), &ptr, val.size(), &state);
outStr.resize(len); // Truncate the extra bytes
// JSON strings are UTF-8 (RFC 8259). Convert explicitly rather than via the
// locale-dependent std::mbsrtowcs, which returns (size_t)-1 on any input
// that is not valid in the current C locale's encoding.
auto outStr = internal::strings::utf8_to_wstring(inStr.value());
if (!outStr) {
return Result<std::wstring>(
error("Could not parse the string: it is not valid UTF-8."));
}

return Result<std::wstring>(outStr);
return Result<std::wstring>(std::move(*outStr));
Comment on lines +45 to +51
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This function is marked noexcept, but the string allocations and conversions inside utf8_to_wstring can throw std::bad_alloc. To prevent an unexpected std::terminate, consider wrapping the conversion in a try-catch block and returning a Result::error instead, which is consistent with the error handling pattern used in Parser_filepath.hpp.

    try {
      auto outStr = internal::strings::utf8_to_wstring(inStr.value());
      if (!outStr) {
        return error("Could not parse the string: it is not valid UTF-8.");
      }
      return std::move(*outStr);
    } catch (std::exception& e) {
      return error(e.what());
    }

}

/**
Expand All @@ -68,20 +61,12 @@ struct Parser<R, W, std::wstring, ProcessorsType> {
*/
template <class P>
static void write(const W& _w, const std::wstring& _str, const P& _parent) {
  // Emit the wstring as UTF-8 (RFC 8259). wstring_to_utf8 only fails on
  // genuinely malformed input (e.g. unpaired surrogates); Parser<...>::write
  // returns void and the Writer has no error channel, so emit an empty string
  // in that case rather than crash.
  auto outStr = internal::strings::wstring_to_utf8(_str);
  // Moving out of the optional hands the converted string to the writer
  // without an additional copy.
  ParentType::add_value(_w, std::move(outStr).value_or(std::string()), _parent);
}

/**
Expand Down
5 changes: 4 additions & 1 deletion tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -O2")

# Note: Adding -Wno-stringop-overflow is necessary, because of false positive warnings, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110498
if (MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std:c++20")
# /utf-8 tells MSVC the test sources are UTF-8 (default is the system code
# page). Without it, non-ASCII characters in wide / u8 string literals are
# mis-decoded, breaking the wstring/filepath round-trip tests. See PR #668.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std:c++20 /utf-8")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -Wall -Werror -ggdb -ftemplate-backtrace-limit=0")
endif()
Expand Down
14 changes: 14 additions & 0 deletions tests/json/test_filepath.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,18 @@ TEST(json, test_filepath) {
homer,
R"({"firstName":"Homer","lastName":"Simpson","path":"/usr/lib/homer_simpson.cf"})");
}

// Regression test for issue #421: a std::filesystem::path containing non-ASCII
// characters previously round-tripped through the locale-dependent
// path::string() / narrow path constructor and could throw or be mangled.
// (This file is UTF-8 encoded.)
TEST(json, test_filepath_non_ascii) {
// The path mixes CJK and accented Latin characters and is built from a u8
// literal, so its contents do not depend on the narrow execution charset.
// NOTE(review): "lastName":"Simpson" in the expected JSON is not set here;
// presumably Person declares it as a default — confirm against the struct.
const auto homer =
Person{.first_name = "Homer",
.path = std::filesystem::path(u8"/home/中文/café.json")};

// NOTE(review): write_and_read is defined outside this view; presumably it
// serializes `homer`, compares against the expected JSON and parses it back.
write_and_read(
homer,
R"({"firstName":"Homer","lastName":"Simpson","path":"/home/中文/café.json"})");
}
} // namespace test_filepath
26 changes: 26 additions & 0 deletions tests/json/test_wstring.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,30 @@ TEST(json, test_wstring) {

write_and_read(homer, R"({"firstName":"Homer"})");
}

// Aggregate with two std::wstring members, used by test_wstring_non_ascii.
// The member names appear as the keys of the expected JSON in that test.
struct WithWstrings {
std::wstring name;
std::wstring note;
};

// Regression test for issues #421 and #422: non-ASCII content previously made
// std::mbsrtowcs / std::wcsrtombs return (size_t)-1, which was passed unchecked
// to resize() and terminated the process. (This file is UTF-8 encoded;
// astral-plane characters use \U escapes.)
//
// Each case pairs wide-string input with the UTF-8 JSON it is expected to
// produce. NOTE(review): write_and_read is defined outside this view;
// presumably it serializes the value, compares against the expected JSON and
// parses it back.
TEST(json, test_wstring_non_ascii) {
// Latin-1 supplement (é), plus an empty wstring.
write_and_read(WithWstrings{.name = L"René", .note = L""},
R"({"name":"René","note":""})");

// Cyrillic ("Привіт"), CJK and Hiragana ("中文 ひらがな").
write_and_read(
WithWstrings{.name = L"Привіт",
.note = L"中文 ひらがな"},
R"({"name":"Привіт","note":"中文 ひらがな"})");

// Code points beyond the BMP (surrogate pairs where wchar_t is 16 bits wide):
// U+1F4A9 and U+1F600.
write_and_read(WithWstrings{.name = L"\U0001f4a9", .note = L"a\U0001f600b"},
R"({"name":"💩","note":"a😀b"})");
}
} // namespace test_wstring
Loading