getml · liuzicheng1987 · May 15, 2026 · May 14, 2026 · May 14, 2026 · May 15, 2026
diff --git a/include/rfl/internal/strings/utf8_conversions.hpp b/include/rfl/internal/strings/utf8_conversions.hpp
@@ -0,0 +1,170 @@
+#ifndef RFL_INTERNAL_STRINGS_UTF8_CONVERSIONS_HPP_
+#define RFL_INTERNAL_STRINGS_UTF8_CONVERSIONS_HPP_
+
+#include <optional>
+#include <string>
+
+namespace rfl::internal::strings {
+
+// Locale-independent conversion between UTF-8 (std::string) and std::wstring.
+//
+// JSON is defined to be UTF-8 (RFC 8259), so the std::string side is always
+// UTF-8. wchar_t is treated as UTF-32 where it is >= 4 bytes (Linux, macOS) and
+// as UTF-16 where it is 2 bytes (Windows). Unlike std::mbsrtowcs / std::wcsrtombs
+// this does not depend on the process's C locale and cannot return an
+// unchecked error sentinel.
+
+namespace utf8_detail {
+
+/// Decodes UTF-8 bytes into Unicode code points. Returns std::nullopt if the
+/// input is not well-formed UTF-8 (invalid leading byte, truncated sequence,
+/// bad continuation byte, overlong encoding, surrogate code point, or a value
+/// above U+10FFFF).
+inline std::optional<std::u32string> decode_utf8(const std::string& _str) {
+  std::u32string out;
+  // Each code point consumes at least one input byte, so the byte count is an
+  // upper bound on the number of code points produced.
+  out.reserve(_str.size());
+  std::size_t i = 0;
+  const std::size_t n = _str.size();
+  while (i < n) {
+    const auto c = static_cast<unsigned char>(_str[i]);
+    char32_t cp = 0;
+    int extra = 0;  // continuation bytes expected after the leading byte
+    if (c < 0x80) {  // 0xxxxxxx
+      cp = c;
+      extra = 0;
+    } else if ((c >> 5) == 0x6) {  // 110xxxxx
+      cp = c & 0x1F;
+      extra = 1;
+    } else if ((c >> 4) == 0xE) {  // 1110xxxx
+      cp = c & 0x0F;
+      extra = 2;
+    } else if ((c >> 3) == 0x1E) {  // 11110xxx
+      cp = c & 0x07;
+      extra = 3;
+    } else {
+      return std::nullopt;  // invalid leading byte
+    }
+    // The last byte of this sequence sits at index i + extra.
+    if (i + static_cast<std::size_t>(extra) >= n) {
+      return std::nullopt;  // truncated multi-byte sequence
+    }
+    for (int k = 1; k <= extra; ++k) {
+      const auto cc = static_cast<unsigned char>(_str[i + k]);
+      if ((cc >> 6) != 0x2) {
+        return std::nullopt;  // bad continuation byte
+      }
+      cp = (cp << 6) | (cc & 0x3F);
+    }
+    // Smallest code point that legitimately needs `extra` continuation bytes.
+    static constexpr char32_t mins[] = {0, 0x80, 0x800, 0x10000};
+    if (cp < mins[extra]) {
+      return std::nullopt;  // overlong encoding
+    }
+    if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
+      return std::nullopt;  // out of range or surrogate code point
+    }
+    out.push_back(cp);
+    i += static_cast<std::size_t>(extra) + 1;
+  }
+  return out;
+}
+
+/// Serializes a sequence of Unicode code points as UTF-8 bytes. The code
+/// points are not validated here; each one is written as a 1-, 2-, 3- or
+/// 4-byte sequence chosen by its magnitude.
+inline std::string encode_utf8(const std::u32string& _cps) {
+  std::string bytes;
+  // Appends a single byte made of a marker prefix OR-ed with payload bits.
+  const auto put = [&bytes](const char32_t _prefix, const char32_t _bits) {
+    bytes.push_back(static_cast<char>(_prefix | _bits));
+  };
+  for (const char32_t code_point : _cps) {
+    if (code_point < 0x80) {
+      put(0x00, code_point);
+      continue;
+    }
+    if (code_point < 0x800) {
+      put(0xC0, code_point >> 6);
+    } else if (code_point < 0x10000) {
+      put(0xE0, code_point >> 12);
+      put(0x80, (code_point >> 6) & 0x3F);
+    } else {
+      put(0xF0, code_point >> 18);
+      put(0x80, (code_point >> 12) & 0x3F);
+      put(0x80, (code_point >> 6) & 0x3F);
+    }
+    // Every multi-byte form ends with the lowest six bits.
+    put(0x80, code_point & 0x3F);
+  }
+  return bytes;
+}
+
+/// Converts Unicode code points into a std::wstring. Where wchar_t is at
+/// least 4 bytes wide each code point becomes one wchar_t (UTF-32); otherwise
+/// code points from U+10000 upwards are split into a UTF-16 surrogate pair.
+inline std::wstring codepoints_to_wide(const std::u32string& _cps) {
+  std::wstring wide;
+  if constexpr (sizeof(wchar_t) >= 4) {
+    for (const char32_t code_point : _cps) {
+      wide.push_back(static_cast<wchar_t>(code_point));
+    }
+  } else {  // UTF-16
+    for (const char32_t code_point : _cps) {
+      if (code_point >= 0x10000) {
+        // Supplementary plane: emit high surrogate, then low surrogate.
+        const char32_t offset = code_point - 0x10000;
+        wide.push_back(static_cast<wchar_t>(0xD800 + (offset >> 10)));
+        wide.push_back(static_cast<wchar_t>(0xDC00 + (offset & 0x3FF)));
+        continue;
+      }
+      wide.push_back(static_cast<wchar_t>(code_point));
+    }
+  }
+  return wide;
+}
+
+/// Converts a std::wstring into Unicode code points. Yields std::nullopt when
+/// the input holds an unpaired surrogate or a value above U+10FFFF.
+inline std::optional<std::u32string> wide_to_codepoints(const std::wstring& _str) {
+  std::u32string cps;
+  if constexpr (sizeof(wchar_t) >= 4) {
+    // UTF-32: every element must already be a valid scalar value.
+    for (const wchar_t unit : _str) {
+      const auto value = static_cast<char32_t>(unit);
+      const bool is_surrogate = value >= 0xD800 && value <= 0xDFFF;
+      if (is_surrogate || value > 0x10FFFF) {
+        return std::nullopt;
+      }
+      cps.push_back(value);
+    }
+  } else {  // UTF-16
+    const std::size_t len = _str.size();
+    std::size_t pos = 0;
+    while (pos < len) {
+      const auto unit =
+          static_cast<char32_t>(static_cast<char16_t>(_str[pos]));
+      ++pos;
+      if (unit < 0xD800 || unit > 0xDFFF) {
+        cps.push_back(unit);  // plain BMP code unit
+        continue;
+      }
+      if (unit >= 0xDC00) {
+        return std::nullopt;  // unpaired low surrogate
+      }
+      // `unit` is a high surrogate; a low surrogate must follow directly.
+      if (pos >= len) {
+        return std::nullopt;  // unpaired high surrogate
+      }
+      const auto low =
+          static_cast<char32_t>(static_cast<char16_t>(_str[pos]));
+      if (low < 0xDC00 || low > 0xDFFF) {
+        return std::nullopt;  // high surrogate not followed by low surrogate
+      }
+      cps.push_back(0x10000 + ((unit - 0xD800) << 10) + (low - 0xDC00));
+      ++pos;
+    }
+  }
+  return cps;
+}
+
+}  // namespace utf8_detail
+
+/// Translates UTF-8 text held in a std::string into a std::wstring. Yields
+/// std::nullopt when decoding the input as UTF-8 fails.
+inline std::optional<std::wstring> utf8_to_wstring(const std::string& _str) {
+  if (const auto code_points = utf8_detail::decode_utf8(_str)) {
+    return utf8_detail::codepoints_to_wide(*code_points);
+  }
+  return std::nullopt;
+}
+
+/// Translates a std::wstring into UTF-8 text held in a std::string. Yields
+/// std::nullopt when the input cannot be decoded into code points (unpaired
+/// surrogates or otherwise invalid code units).
+inline std::optional<std::string> wstring_to_utf8(const std::wstring& _str) {
+  if (const auto code_points = utf8_detail::wide_to_codepoints(_str)) {
+    return utf8_detail::encode_utf8(*code_points);
+  }
+  return std::nullopt;
+}
+
+}  // namespace rfl::internal::strings
+
+#endif
diff --git a/include/rfl/parsing/Parser_filepath.hpp b/include/rfl/parsing/Parser_filepath.hpp
@@ -27,7 +27,10 @@ struct Parser<R, W, std::filesystem::path, ProcessorsType> {
     const auto to_path =
         [&](std::string&& _str) -> Result<std::filesystem::path> {
       try {
-        return std::filesystem::path(_str);
+        // JSON strings are UTF-8 (RFC 8259); construct the path from UTF-8
+        // explicitly rather than via the locale-dependent narrow constructor.
+        const auto* ptr = reinterpret_cast<const char8_t*>(_str.data());
+        return std::filesystem::path(ptr, ptr + _str.size());
       } catch (std::exception& e) {
         return error(e.what());
       }
@@ -46,8 +49,12 @@ struct Parser<R, W, std::filesystem::path, ProcessorsType> {
   template <class P>
   static void write(const W& _w, const std::filesystem::path& _p,
                     const P& _parent) {
-    return Parser<R, W, std::string, ProcessorsType>::write(_w, _p.string(),
-                                                            _parent);
+    // JSON text is UTF-8 (RFC 8259), so serialize the path through u8string();
+    // _p.string() is locale-dependent and throws / mangles on Windows for
+    // non-representable characters.
+    const auto utf8_path = _p.u8string();
+    const auto* first = reinterpret_cast<const char*>(utf8_path.c_str());
+    return Parser<R, W, std::string, ProcessorsType>::write(
+        _w, std::string(first, utf8_path.size()), _parent);
   }
 
   /**

diff --git a/include/rfl/parsing/Parser_wstring.hpp b/include/rfl/parsing/Parser_wstring.hpp
@@ -5,6 +5,7 @@
 
 #include "../Result.hpp"
 #include "../always_false.hpp"
+#include "../internal/strings/utf8_conversions.hpp"
 #include "Parent.hpp"
 #include "Parser_base.hpp"
 #include "schema/Type.hpp"
@@ -37,25 +38,17 @@ struct Parser<R, W, std::wstring, ProcessorsType> {
     if (!inStr) {
       return Result<std::wstring>(error(inStr.error()));
     }
-    // if (auto err = inStr.error(); err.has_value()) {
-    //   return Result<std::wstring>(err.value());
-    // }
 
-    std::mbstate_t state = std::mbstate_t();
-    auto val = inStr.value();
-
-    std::wstring outStr(val.size() * 2, L'\0');
-
-    // Explicitly set the size so we don't empty it when we truncate
-    outStr.resize(val.size() * 2);
-
-    auto* ptr = val.c_str();
-
-    // Add 1 for null terminator
-    auto len = std::mbsrtowcs(outStr.data(), &ptr, val.size(), &state);
-    outStr.resize(len);  // Truncate the extra bytes
+    // JSON strings are UTF-8 (RFC 8259). Convert explicitly rather than via the
+    // locale-dependent std::mbsrtowcs, which returns (size_t)-1 on any input
+    // that is not valid in the current C locale's encoding.
+    auto outStr = internal::strings::utf8_to_wstring(inStr.value());
+    if (!outStr) {
+      return Result<std::wstring>(
+          error("Could not parse the string: it is not valid UTF-8."));
+    }
 
-    return Result<std::wstring>(outStr);
+    return Result<std::wstring>(std::move(*outStr));
   }
 
   /**
@@ -68,20 +61,12 @@ struct Parser<R, W, std::wstring, ProcessorsType> {
    */
   template <class P>
   static void write(const W& _w, const std::wstring& _str, const P& _parent) {
-    if (_str.empty()) {
-      ParentType::add_value(_w, std::string(), _parent);
-      return;
-    }
-
-    std::mbstate_t state = std::mbstate_t();
-    std::string outStr(_str.size(), '\0');
-    outStr.resize(_str.size());
-
-    auto* ptr = _str.c_str();
-    auto len = std::wcsrtombs(outStr.data(), &ptr, _str.size(), &state);
-    outStr.resize(len);
-
-    ParentType::add_value(_w, outStr, _parent);
+    // Emit the wstring as UTF-8 (RFC 8259). wstring_to_utf8 only fails on
+    // genuinely malformed input (e.g. unpaired surrogates); Parser<...>::write
+    // returns void and the Writer has no error channel, so emit an empty string
+    // in that case rather than crash.
+    auto outStr = internal::strings::wstring_to_utf8(_str);
+    ParentType::add_value(_w, std::move(outStr).value_or(std::string()), _parent);
   }
 
   /**

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -2,7 +2,10 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -O2")
 
 # Note: Adding -Wno-stringop-overflow is necessary, because of false positive warnings, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110498
 if (MSVC)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std:c++20")
+    # /utf-8 tells MSVC the test sources are UTF-8 (default is the system code
+    # page). Without it, non-ASCII characters in wide / u8 string literals are
+    # mis-decoded, breaking the wstring/filepath round-trip tests. See PR #668.
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std:c++20 /utf-8")
 else()
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -Wall -Werror -ggdb -ftemplate-backtrace-limit=0")
 endif()

diff --git a/tests/json/test_filepath.cpp b/tests/json/test_filepath.cpp
@@ -21,4 +21,18 @@ TEST(json, test_filepath) {
       homer,
       R"({"firstName":"Homer","lastName":"Simpson","path":"/usr/lib/homer_simpson.cf"})");
 }
+
+// Regression test for issue #421. A std::filesystem::path holding non-ASCII
+// characters used to round-trip through the locale-dependent path::string() /
+// narrow path constructor, which could throw or mangle the value.
+// (This file is UTF-8 encoded.)
+TEST(json, test_filepath_non_ascii) {
+  const std::filesystem::path non_ascii_path(u8"/home/中文/café.json");
+  const auto homer = Person{.first_name = "Homer", .path = non_ascii_path};
+
+  write_and_read(
+      homer,
+      R"({"firstName":"Homer","lastName":"Simpson","path":"/home/中文/café.json"})");
+}
 }  // namespace test_filepath
diff --git a/tests/json/test_wstring.cpp b/tests/json/test_wstring.cpp
@@ -12,4 +12,30 @@ TEST(json, test_wstring) {
 
   write_and_read(homer, R"({"firstName":"Homer"})");
 }
+
+// Fixture with two wide-string fields used by the non-ASCII round-trip test.
+struct WithWstrings {
+  std::wstring name;
+  std::wstring note;
+};
+
+// Regression test for issues #421 and #422. Non-ASCII content used to make
+// std::mbsrtowcs / std::wcsrtombs return (size_t)-1, which went unchecked into
+// resize() and terminated the process. (This file is UTF-8 encoded;
+// astral-plane characters use \U escapes.)
+TEST(json, test_wstring_non_ascii) {
+  // Latin-1 supplement (é) next to an empty wstring.
+  const auto latin = WithWstrings{.name = L"René", .note = L""};
+  write_and_read(latin, R"({"name":"René","note":""})");
+
+  // Cyrillic ("Привіт"), CJK and Hiragana ("中文 ひらがな").
+  const auto multi_script =
+      WithWstrings{.name = L"Привіт", .note = L"中文 ひらがな"};
+  write_and_read(multi_script,
+                 R"({"name":"Привіт","note":"中文 ひらがな"})");
+
+  // Code points beyond the BMP (surrogate pairs where wchar_t is 16 bits
+  // wide): U+1F4A9 and U+1F600.
+  const auto astral =
+      WithWstrings{.name = L"\U0001f4a9", .note = L"a\U0001f600b"};
+  write_and_read(astral, R"({"name":"💩","note":"a😀b"})");
+}
 }  // namespace test_wstring