diff --git a/include/rfl/internal/strings/utf8_conversions.hpp b/include/rfl/internal/strings/utf8_conversions.hpp new file mode 100644 index 00000000..dc24923f --- /dev/null +++ b/include/rfl/internal/strings/utf8_conversions.hpp @@ -0,0 +1,170 @@ +#ifndef RFL_INTERNAL_STRINGS_UTF8_CONVERSIONS_HPP_ +#define RFL_INTERNAL_STRINGS_UTF8_CONVERSIONS_HPP_ + +#include +#include + +namespace rfl::internal::strings { + +// Locale-independent conversion between UTF-8 (std::string) and std::wstring. +// +// JSON is defined to be UTF-8 (RFC 8259), so the std::string side is always +// UTF-8. wchar_t is treated as UTF-32 where it is >= 4 bytes (Linux, macOS) and +// as UTF-16 where it is 2 bytes (Windows). Unlike std::mbsrtowcs / std::wcsrtombs +// this does not depend on the process's C locale and cannot return an +// unchecked error sentinel. + +namespace utf8_detail { + +/// Decodes UTF-8 bytes into Unicode code points. Returns std::nullopt if the +/// input is not well-formed UTF-8 (truncated, overlong, bad continuation byte, +/// surrogate code point, or out of range). 
///
/// Each sequence is checked in this order: leading byte, sequence length
/// against the remaining input, continuation bytes, overlong encoding, and
/// finally the code point range (at most U+10FFFF and not a surrogate).
///
/// @param _str Bytes to decode; an empty string yields an empty result.
/// @return The decoded code points, or std::nullopt at the first malformed
///         sequence.
inline std::optional<std::u32string> decode_utf8(const std::string& _str) {
  // Smallest code point that needs 1, 2, 3 and 4 bytes respectively. A decoded
  // value below the entry for its own sequence length is an overlong encoding.
  static constexpr char32_t kMinForLength[] = {0, 0x80, 0x800, 0x10000};

  const std::size_t n = _str.size();
  std::u32string out;
  // Every code point consumes at least one byte, so n is an upper bound.
  out.reserve(n);

  std::size_t i = 0;
  while (i < n) {
    const auto lead = static_cast<unsigned char>(_str[i]);
    char32_t cp = 0;
    std::size_t extra = 0;  // number of continuation bytes that must follow
    if (lead < 0x80) {  // 0xxxxxxx
      cp = lead;
    } else if ((lead >> 5) == 0x6) {  // 110xxxxx
      cp = lead & 0x1Fu;
      extra = 1;
    } else if ((lead >> 4) == 0xE) {  // 1110xxxx
      cp = lead & 0x0Fu;
      extra = 2;
    } else if ((lead >> 3) == 0x1E) {  // 11110xxx
      cp = lead & 0x07u;
      extra = 3;
    } else {
      return std::nullopt;  // invalid leading byte
    }

    // i < n holds here, so n - i cannot wrap around.
    if (extra >= n - i) {
      return std::nullopt;  // truncated multi-byte sequence
    }

    for (std::size_t k = 1; k <= extra; ++k) {
      const auto cont = static_cast<unsigned char>(_str[i + k]);
      if ((cont >> 6) != 0x2) {  // must look like 10xxxxxx
        return std::nullopt;  // bad continuation byte
      }
      cp = (cp << 6) | (cont & 0x3Fu);
    }

    if (cp < kMinForLength[extra]) {
      return std::nullopt;  // overlong encoding
    }
    if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
      return std::nullopt;  // out of range or surrogate code point
    }

    out.push_back(cp);
    i += extra + 1;
  }
  return out;
}

/// Encodes Unicode code points as UTF-8 bytes.
///
/// Values that are not Unicode scalar values (surrogates U+D800..U+DFFF and
/// anything above U+10FFFF) cannot be represented in well-formed UTF-8; they
/// are emitted as U+FFFD REPLACEMENT CHARACTER so the output is always
/// well-formed.
///
/// @param _cps Code points to encode.
/// @return The UTF-8 byte sequence; empty if _cps is empty.
inline std::string encode_utf8(const std::u32string& _cps) {
  constexpr char32_t kReplacement = 0xFFFD;

  std::string out;
  // At least one byte is produced per code point.
  out.reserve(_cps.size());

  for (char32_t cp : _cps) {
    if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
      cp = kReplacement;
    }
    if (cp < 0x80) {
      out.push_back(static_cast<char>(cp));
    } else if (cp < 0x800) {
      out.push_back(static_cast<char>(0xC0 | (cp >> 6)));
      out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
    } else if (cp < 0x10000) {
      out.push_back(static_cast<char>(0xE0 | (cp >> 12)));
      out.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
      out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
    } else {
      out.push_back(static_cast<char>(0xF0 | (cp >> 18)));
      out.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3F)));
      out.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
      out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
    }
  }
  return out;
}

/// Encodes Unicode code points as a std::wstring (UTF-32 or UTF-16 depending on
/// the width of wchar_t).
///
/// Where wchar_t is at least 4 bytes wide each code point becomes one wchar_t;
/// otherwise code points from U+10000 upwards become a surrogate pair. Values
/// that are not Unicode scalar values (surrogates, or above U+10FFFF) are
/// emitted as U+FFFD REPLACEMENT CHARACTER.
///
/// @param _cps Code points to encode.
/// @return The wide string; empty if _cps is empty.
inline std::wstring codepoints_to_wide(const std::u32string& _cps) {
  constexpr char32_t kReplacement = 0xFFFD;

  std::wstring out;
  // At least one code unit is produced per code point.
  out.reserve(_cps.size());

  for (char32_t cp : _cps) {
    if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
      cp = kReplacement;
    }
    if constexpr (sizeof(wchar_t) >= 4) {
      out.push_back(static_cast<wchar_t>(cp));
    } else {  // UTF-16
      if (cp < 0x10000) {
        out.push_back(static_cast<wchar_t>(cp));
      } else {
        // Split the 20 bits above the BMP into a high and a low surrogate.
        const char32_t v = cp - 0x10000;
        out.push_back(static_cast<wchar_t>(0xD800 + (v >> 10)));
        out.push_back(static_cast<wchar_t>(0xDC00 + (v & 0x3FF)));
      }
    }
  }
  return out;
}

/// Decodes a std::wstring into Unicode code points. Returns std::nullopt if the
/// input contains unpaired surrogates or out-of-range code units.
+inline std::optional wide_to_codepoints(const std::wstring& _str) { + std::u32string out; + if constexpr (sizeof(wchar_t) >= 4) { + for (const wchar_t w : _str) { + const auto cp = static_cast(w); + if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) { + return std::nullopt; + } + out.push_back(cp); + } + } else { // UTF-16 + for (std::size_t i = 0; i < _str.size(); ++i) { + const auto u = static_cast(static_cast(_str[i])); + if (u >= 0xD800 && u <= 0xDBFF) { // high surrogate + if (i + 1 >= _str.size()) { + return std::nullopt; // unpaired high surrogate + } + const auto lo = static_cast(static_cast(_str[i + 1])); + if (lo < 0xDC00 || lo > 0xDFFF) { + return std::nullopt; // high surrogate not followed by low surrogate + } + out.push_back(0x10000 + ((u - 0xD800) << 10) + (lo - 0xDC00)); + ++i; + } else if (u >= 0xDC00 && u <= 0xDFFF) { + return std::nullopt; // unpaired low surrogate + } else { + out.push_back(u); + } + } + } + return out; +} + +} // namespace utf8_detail + +/// Converts a UTF-8-encoded std::string to a std::wstring. Returns std::nullopt +/// if the input is not well-formed UTF-8. +inline std::optional utf8_to_wstring(const std::string& _str) { + const auto cps = utf8_detail::decode_utf8(_str); + if (!cps) { + return std::nullopt; + } + return utf8_detail::codepoints_to_wide(*cps); +} + +/// Converts a std::wstring to a UTF-8-encoded std::string. Returns std::nullopt +/// if the input contains unpaired surrogates or otherwise invalid code units. 
+inline std::optional wstring_to_utf8(const std::wstring& _str) { + const auto cps = utf8_detail::wide_to_codepoints(_str); + if (!cps) { + return std::nullopt; + } + return utf8_detail::encode_utf8(*cps); +} + +} // namespace rfl::internal::strings + +#endif diff --git a/include/rfl/parsing/Parser_filepath.hpp b/include/rfl/parsing/Parser_filepath.hpp index 824debfc..a2de5851 100644 --- a/include/rfl/parsing/Parser_filepath.hpp +++ b/include/rfl/parsing/Parser_filepath.hpp @@ -27,7 +27,10 @@ struct Parser { const auto to_path = [&](std::string&& _str) -> Result { try { - return std::filesystem::path(_str); + // JSON strings are UTF-8 (RFC 8259); construct the path from UTF-8 + // explicitly rather than via the locale-dependent narrow constructor. + return std::filesystem::path(std::u8string( + reinterpret_cast(_str.data()), _str.size())); } catch (std::exception& e) { return error(e.what()); } @@ -46,8 +49,12 @@ struct Parser { template static void write(const W& _w, const std::filesystem::path& _p, const P& _parent) { - return Parser::write(_w, _p.string(), - _parent); + // Emit the path as UTF-8 (RFC 8259); _p.string() is locale-dependent and + // throws / mangles on Windows for non-representable characters. 
+ const auto u8 = _p.u8string(); + return Parser::write( + _w, std::string(reinterpret_cast(u8.c_str()), u8.size()), + _parent); } /** diff --git a/include/rfl/parsing/Parser_wstring.hpp b/include/rfl/parsing/Parser_wstring.hpp index aa160f74..f6d72af6 100644 --- a/include/rfl/parsing/Parser_wstring.hpp +++ b/include/rfl/parsing/Parser_wstring.hpp @@ -5,6 +5,7 @@ #include "../Result.hpp" #include "../always_false.hpp" +#include "../internal/strings/utf8_conversions.hpp" #include "Parent.hpp" #include "Parser_base.hpp" #include "schema/Type.hpp" @@ -37,25 +38,17 @@ struct Parser { if (!inStr) { return Result(error(inStr.error())); } - // if (auto err = inStr.error(); err.has_value()) { - // return Result(err.value()); - // } - std::mbstate_t state = std::mbstate_t(); - auto val = inStr.value(); - - std::wstring outStr(val.size() * 2, L'\0'); - - // Explicitly set the size so we don't empty it when we truncate - outStr.resize(val.size() * 2); - - auto* ptr = val.c_str(); - - // Add 1 for null terminator - auto len = std::mbsrtowcs(outStr.data(), &ptr, val.size(), &state); - outStr.resize(len); // Truncate the extra bytes + // JSON strings are UTF-8 (RFC 8259). Convert explicitly rather than via the + // locale-dependent std::mbsrtowcs, which returns (size_t)-1 on any input + // that is not valid in the current C locale's encoding. 
+ auto outStr = internal::strings::utf8_to_wstring(inStr.value()); + if (!outStr) { + return Result( + error("Could not parse the string: it is not valid UTF-8.")); + } - return Result(outStr); + return Result(std::move(*outStr)); } /** @@ -68,20 +61,12 @@ struct Parser { */ template static void write(const W& _w, const std::wstring& _str, const P& _parent) { - if (_str.empty()) { - ParentType::add_value(_w, std::string(), _parent); - return; - } - - std::mbstate_t state = std::mbstate_t(); - std::string outStr(_str.size(), '\0'); - outStr.resize(_str.size()); - - auto* ptr = _str.c_str(); - auto len = std::wcsrtombs(outStr.data(), &ptr, _str.size(), &state); - outStr.resize(len); - - ParentType::add_value(_w, outStr, _parent); + // Emit the wstring as UTF-8 (RFC 8259). wstring_to_utf8 only fails on + // genuinely malformed input (e.g. unpaired surrogates); Parser<...>::write + // returns void and the Writer has no error channel, so emit an empty string + // in that case rather than crash. + const auto outStr = internal::strings::wstring_to_utf8(_str); + ParentType::add_value(_w, outStr.value_or(std::string()), _parent); } /** diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 501264b0..d68b3a0f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,7 +2,10 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -O2") # Note: Adding -Wno-stringop-overflow is necessary, because of false positive warnings, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110498 if (MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std:c++20") + # /utf-8 tells MSVC the test sources are UTF-8 (default is the system code + # page). Without it, non-ASCII characters in wide / u8 string literals are + # mis-decoded, breaking the wstring/filepath round-trip tests. See PR #668. 
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std:c++20 /utf-8") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -Wall -Werror -ggdb -ftemplate-backtrace-limit=0") endif() diff --git a/tests/json/test_filepath.cpp b/tests/json/test_filepath.cpp index f9de556f..f264bcf2 100644 --- a/tests/json/test_filepath.cpp +++ b/tests/json/test_filepath.cpp @@ -21,4 +21,18 @@ TEST(json, test_filepath) { homer, R"({"firstName":"Homer","lastName":"Simpson","path":"/usr/lib/homer_simpson.cf"})"); } + +// Regression test for issue #421: a std::filesystem::path containing non-ASCII +// characters previously round-tripped through the locale-dependent +// path::string() / narrow path constructor and could throw or be mangled. +// (This file is UTF-8 encoded.) +TEST(json, test_filepath_non_ascii) { + const auto homer = + Person{.first_name = "Homer", + .path = std::filesystem::path(u8"/home/中文/café.json")}; + + write_and_read( + homer, + R"({"firstName":"Homer","lastName":"Simpson","path":"/home/中文/café.json"})"); +} } // namespace test_filepath diff --git a/tests/json/test_wstring.cpp b/tests/json/test_wstring.cpp index 10c178be..1f73080f 100644 --- a/tests/json/test_wstring.cpp +++ b/tests/json/test_wstring.cpp @@ -12,4 +12,30 @@ TEST(json, test_wstring) { write_and_read(homer, R"({"firstName":"Homer"})"); } + +struct WithWstrings { + std::wstring name; + std::wstring note; +}; + +// Regression test for issues #421 and #422: non-ASCII content previously made +// std::mbsrtowcs / std::wcsrtombs return (size_t)-1, which was passed unchecked +// to resize() and terminated the process. (This file is UTF-8 encoded; +// astral-plane characters use \U escapes.) +TEST(json, test_wstring_non_ascii) { + // Latin-1 supplement (é), plus an empty wstring. + write_and_read(WithWstrings{.name = L"René", .note = L""}, + R"({"name":"René","note":""})"); + + // Cyrillic ("Привіт"), CJK and Hiragana ("中文 ひらがな"). 
+ write_and_read( + WithWstrings{.name = L"Привіт", + .note = L"中文 ひらがな"}, + R"({"name":"Привіт","note":"中文 ひらがな"})"); + + // Code points beyond the BMP (surrogate pairs where wchar_t is 16 bits wide): + // U+1F4A9 and U+1F600. + write_and_read(WithWstrings{.name = L"\U0001f4a9", .note = L"a\U0001f600b"}, + R"({"name":"💩","note":"a😀b"})"); +} } // namespace test_wstring