From 7e7995d632db495652922bdb6a90c208788c5a48 Mon Sep 17 00:00:00 2001 From: Bo Bayles Date: Tue, 24 Mar 2026 09:32:44 -0500 Subject: [PATCH] Upgrade to ada 3.4.4 --- ada_url/ada.cpp | 67 ++++++++++++++++++++++++------- ada_url/ada.h | 76 +++++++++++++++++++++++++++--------- pyproject.toml | 2 +- tests/files/urltestdata.json | 75 +++++++++++++++++++++++++++++++++++ 4 files changed, 186 insertions(+), 34 deletions(-) diff --git a/ada_url/ada.cpp b/ada_url/ada.cpp index 6c2db6f..80bec6c 100644 --- a/ada_url/ada.cpp +++ b/ada_url/ada.cpp @@ -1,4 +1,4 @@ -/* auto-generated on 2026-02-23 21:29:24 -0500. Do not edit! */ +/* auto-generated on 2026-03-23 17:52:13 -0400. Do not edit! */ /* begin file src/ada.cpp */ #include "ada.h" /* begin file src/checkers.cpp */ @@ -10725,7 +10725,7 @@ constexpr static std::array is_forbidden_domain_code_point_table = for (uint8_t c = 0; c <= 32; c++) { result[c] = true; } - for (size_t c = 127; c < 255; c++) { + for (size_t c = 127; c < 256; c++) { result[c] = true; } return result; @@ -10767,7 +10767,7 @@ constexpr static std::array for (uint8_t c = 0; c <= 32; c++) { result[c] = 1; } - for (size_t c = 127; c < 255; c++) { + for (size_t c = 127; c < 256; c++) { result[c] = 1; } return result; @@ -13404,7 +13404,13 @@ result_type parse_url_impl(std::string_view user_input, url.query = base_url->query; } else { url.update_base_pathname(base_url->get_pathname()); - url.update_base_search(base_url->get_search()); + if (base_url->has_search()) { + // get_search() returns "" for an empty query string (URL ends + // with '?'). update_base_search("") would incorrectly clear the + // query, so pass "?" to preserve the empty query distinction. + auto s = base_url->get_search(); + url.update_base_search(s.empty() ? 
std::string_view("?") : s); + } } url.update_unencoded_base_hash(*fragment); return url; @@ -13628,7 +13634,13 @@ result_type parse_url_impl(std::string_view user_input, // cloning the base path includes cloning the has_opaque_path flag url.has_opaque_path = base_url->has_opaque_path; url.update_base_pathname(base_url->get_pathname()); - url.update_base_search(base_url->get_search()); + if (base_url->has_search()) { + // get_search() returns "" for an empty query string (URL ends + // with '?'). update_base_search("") would incorrectly clear the + // query, so pass "?" to preserve the empty query distinction. + auto s = base_url->get_search(); + url.update_base_search(s.empty() ? std::string_view("?") : s); + } } url.has_opaque_path = base_url->has_opaque_path; @@ -14046,7 +14058,13 @@ result_type parse_url_impl(std::string_view user_input, } else { url.update_host_to_base_host(base_url->get_hostname()); url.update_base_pathname(base_url->get_pathname()); - url.update_base_search(base_url->get_search()); + if (base_url->has_search()) { + // get_search() returns "" for an empty query string (URL ends + // with '?'). update_base_search("") would incorrectly clear the + // query, so pass "?" to preserve the empty query distinction. + auto s = base_url->get_search(); + url.update_base_search(s.empty() ? std::string_view("?") : s); + } } url.has_opaque_path = base_url->has_opaque_path; @@ -16657,8 +16675,15 @@ tl::expected canonicalize_pathname( const auto pathname = url->get_pathname(); // If leading slash is false, then set result to the code point substring // from 2 to the end of the string within result. - return leading_slash ? std::string(pathname) - : std::string(pathname.substr(2)); + if (!leading_slash) { + // pathname should start with "/-" but path traversal (e.g. "../../") + // can reduce it to just "/" which is shorter than 2 characters. 
+ if (pathname.size() < 2) { + return tl::unexpected(errors::type_error); + } + return std::string(pathname.substr(2)); + } + return std::string(pathname); } // If parseResult is failure, then throw a TypeError. return tl::unexpected(errors::type_error); @@ -17195,7 +17220,8 @@ std::string generate_pattern_string( // point. bool needs_grouping = !part.suffix.empty() || - (!part.prefix.empty() && part.prefix[0] != options.get_prefix()[0]); + (!part.prefix.empty() && !options.get_prefix().empty() && + part.prefix[0] != options.get_prefix()[0]); // If all of the following are true: // - needs grouping is false; and @@ -17233,9 +17259,8 @@ std::string generate_pattern_string( // then set needs grouping to true. if (!needs_grouping && part.prefix.empty() && previous_part && previous_part->type == url_pattern_part_type::FIXED_TEXT && - !options.get_prefix().empty() && - previous_part->value.at(previous_part->value.size() - 1) == - options.get_prefix()[0]) { + !previous_part->value.empty() && !options.get_prefix().empty() && + previous_part->value.back() == options.get_prefix()[0]) { needs_grouping = true; } @@ -17358,8 +17383,14 @@ std_regex_provider::regex_search(std::string_view input, const std::regex& pattern) { // Use iterator-based regex_search to avoid string allocation std::match_results match_result; - if (!std::regex_search(input.begin(), input.end(), match_result, pattern, - std::regex_constants::match_any)) { + try { + if (!std::regex_search(input.begin(), input.end(), match_result, pattern, + std::regex_constants::match_any)) { + return std::nullopt; + } + } catch (const std::regex_error& e) { + (void)e; + ada_log("std_regex_provider::regex_search failed:", e.what()); return std::nullopt; } std::vector> matches; @@ -17378,7 +17409,13 @@ std_regex_provider::regex_search(std::string_view input, bool std_regex_provider::regex_match(std::string_view input, const std::regex& pattern) { - return std::regex_match(input.begin(), input.end(), pattern); + try { + 
return std::regex_match(input.begin(), input.end(), pattern); + } catch (const std::regex_error& e) { + (void)e; + ada_log("std_regex_provider::regex_match failed:", e.what()); + return false; + } } #endif // ADA_USE_UNSAFE_STD_REGEX_PROVIDER diff --git a/ada_url/ada.h b/ada_url/ada.h index 1210d7d..8f9089d 100644 --- a/ada_url/ada.h +++ b/ada_url/ada.h @@ -1,4 +1,4 @@ -/* auto-generated on 2026-02-23 21:29:24 -0500. Do not edit! */ +/* auto-generated on 2026-03-23 17:52:13 -0400. Do not edit! */ /* begin file include/ada.h */ /** * @file ada.h @@ -6458,6 +6458,39 @@ constexpr std::string_view is_special_list[] = {"http", " ", "https", "ws", "ftp", "wss", "file", " "}; // for use with get_special_port constexpr uint16_t special_ports[] = {80, 0, 443, 80, 21, 443, 0, 0}; + +// @private +// convert a string_view to a 64-bit integer key for fast comparison +constexpr uint64_t make_key(std::string_view sv) { + uint64_t val = 0; + for (size_t i = 0; i < sv.size(); i++) + val |= (uint64_t)(uint8_t)sv[i] << (i * 8); + return val; +} +// precomputed keys for the special schemes, indexed by a hash of the input +// string +constexpr uint64_t scheme_keys[] = { + make_key("http"), // 0: HTTP + 0, // 1: sentinel + make_key("https"), // 2: HTTPS + make_key("ws"), // 3: WS + make_key("ftp"), // 4: FTP + make_key("wss"), // 5: WSS + make_key("file"), // 6: FILE + 0, // 7: sentinel +}; + +// @private +// branchless load of up to 5 characters into a uint64_t, padding with zeros if +// n < 5 +inline uint64_t branchless_load5(const char *p, size_t n) { + uint64_t input = (uint8_t)p[0]; + input |= ((uint64_t)(uint8_t)p[n > 1] << 8) & (0 - (uint64_t)(n > 1)); + input |= ((uint64_t)(uint8_t)p[(n > 2) * 2] << 16) & (0 - (uint64_t)(n > 2)); + input |= ((uint64_t)(uint8_t)p[(n > 3) * 3] << 24) & (0 - (uint64_t)(n > 3)); + input |= ((uint64_t)(uint8_t)p[(n > 4) * 4] << 32) & (0 - (uint64_t)(n > 4)); + return input; +} } // namespace details /**** @@ -6498,7 +6531,9 @@ constexpr uint16_t 
get_special_port(std::string_view scheme) noexcept { } int hash_value = (2 * scheme.size() + (unsigned)(scheme[0])) & 7; const std::string_view target = details::is_special_list[hash_value]; - if ((target[0] == scheme[0]) && (target.substr(1) == scheme.substr(1))) { + if (scheme.size() == target.size() && + details::branchless_load5(scheme.data(), scheme.size()) == + details::scheme_keys[hash_value]) { return details::special_ports[hash_value]; } else { return 0; @@ -6513,7 +6548,9 @@ constexpr ada::scheme::type get_scheme_type(std::string_view scheme) noexcept { } int hash_value = (2 * scheme.size() + (unsigned)(scheme[0])) & 7; const std::string_view target = details::is_special_list[hash_value]; - if ((target[0] == scheme[0]) && (target.substr(1) == scheme.substr(1))) { + if (scheme.size() == target.size() && + details::branchless_load5(scheme.data(), scheme.size()) == + details::scheme_keys[hash_value]) { return ada::scheme::type(hash_value); } else { return ada::scheme::NOT_SPECIAL; @@ -9368,7 +9405,8 @@ inline void url_search_params::remove(const std::string_view key, } inline void url_search_params::sort() { - // We rely on the fact that the content is valid UTF-8. + // Keys are expected to be valid UTF-8, but percent_decode can produce + // arbitrary byte sequences. Handle truncated/invalid sequences gracefully. 
std::ranges::stable_sort(params, [](const key_value_pair &lhs, const key_value_pair &rhs) { size_t i = 0, j = 0; @@ -9382,18 +9420,15 @@ inline void url_search_params::sort() { low_surrogate1 = 0; } else { uint8_t c1 = uint8_t(lhs.first[i]); - if (c1 <= 0x7F) { - codePoint1 = c1; - i++; - } else if (c1 <= 0xDF) { + if (c1 > 0x7F && c1 <= 0xDF && i + 1 < lhs.first.size()) { codePoint1 = ((c1 & 0x1F) << 6) | (uint8_t(lhs.first[i + 1]) & 0x3F); i += 2; - } else if (c1 <= 0xEF) { + } else if (c1 > 0xDF && c1 <= 0xEF && i + 2 < lhs.first.size()) { codePoint1 = ((c1 & 0x0F) << 12) | ((uint8_t(lhs.first[i + 1]) & 0x3F) << 6) | (uint8_t(lhs.first[i + 2]) & 0x3F); i += 3; - } else { + } else if (c1 > 0xEF && c1 <= 0xF7 && i + 3 < lhs.first.size()) { codePoint1 = ((c1 & 0x07) << 18) | ((uint8_t(lhs.first[i + 1]) & 0x3F) << 12) | ((uint8_t(lhs.first[i + 2]) & 0x3F) << 6) | @@ -9404,6 +9439,10 @@ inline void url_search_params::sort() { uint16_t high_surrogate = uint16_t(0xD800 + (codePoint1 >> 10)); low_surrogate1 = uint16_t(0xDC00 + (codePoint1 & 0x3FF)); codePoint1 = high_surrogate; + } else { + // ASCII (c1 <= 0x7F) or truncated/invalid UTF-8: treat as raw byte + codePoint1 = c1; + i++; } } @@ -9412,18 +9451,15 @@ inline void url_search_params::sort() { low_surrogate2 = 0; } else { uint8_t c2 = uint8_t(rhs.first[j]); - if (c2 <= 0x7F) { - codePoint2 = c2; - j++; - } else if (c2 <= 0xDF) { + if (c2 > 0x7F && c2 <= 0xDF && j + 1 < rhs.first.size()) { codePoint2 = ((c2 & 0x1F) << 6) | (uint8_t(rhs.first[j + 1]) & 0x3F); j += 2; - } else if (c2 <= 0xEF) { + } else if (c2 > 0xDF && c2 <= 0xEF && j + 2 < rhs.first.size()) { codePoint2 = ((c2 & 0x0F) << 12) | ((uint8_t(rhs.first[j + 1]) & 0x3F) << 6) | (uint8_t(rhs.first[j + 2]) & 0x3F); j += 3; - } else { + } else if (c2 > 0xEF && c2 <= 0xF7 && j + 3 < rhs.first.size()) { codePoint2 = ((c2 & 0x07) << 18) | ((uint8_t(rhs.first[j + 1]) & 0x3F) << 12) | ((uint8_t(rhs.first[j + 2]) & 0x3F) << 6) | @@ -9433,6 +9469,10 @@ inline void 
url_search_params::sort() { uint16_t high_surrogate = uint16_t(0xD800 + (codePoint2 >> 10)); low_surrogate2 = uint16_t(0xDC00 + (codePoint2 & 0x3FF)); codePoint2 = high_surrogate; + } else { + // ASCII (c2 <= 0x7F) or truncated/invalid UTF-8: treat as raw byte + codePoint2 = c2; + j++; } } @@ -11228,14 +11268,14 @@ constructor_string_parser::parse(std::string_view input) { #ifndef ADA_ADA_VERSION_H #define ADA_ADA_VERSION_H -#define ADA_VERSION "3.4.3" +#define ADA_VERSION "3.4.4" namespace ada { enum { ADA_VERSION_MAJOR = 3, ADA_VERSION_MINOR = 4, - ADA_VERSION_REVISION = 3, + ADA_VERSION_REVISION = 4, }; } // namespace ada diff --git a/pyproject.toml b/pyproject.toml index 04fd6d6..02d9b0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "ada-url" -version = "1.30.0" +version = "1.31.0" authors = [ {name = "Bo Bayles", email = "bo@bbayles.com"}, ] diff --git a/tests/files/urltestdata.json b/tests/files/urltestdata.json index fd2201c..8a39edd 100644 --- a/tests/files/urltestdata.json +++ b/tests/files/urltestdata.json @@ -6095,6 +6095,21 @@ "search": "", "hash": "" }, + { + "input": "https://0000000000000000000000000000000000000000177.0.0.1", + "base": null, + "href": "https://127.0.0.1/", + "origin": "https://127.0.0.1", + "protocol": "https:", + "username": "", + "password": "", + "host": "127.0.0.1", + "hostname": "127.0.0.1", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, "More IPv4 parsing (via https://github.com/jsdom/whatwg-url/issues/92)", { "input": "https://0x100000000/test", @@ -10296,5 +10311,65 @@ "pathname": "/a\\b", "search": "", "hash": "" + }, + { + "comment": "Fragment with <> on data: URI", + "input": "data:text/plain,test#<foo> <bar>", + "base": null, + "href": "data:text/plain,test#%3Cfoo%3E%20%3Cbar%3E", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "text/plain,test", + "search": "", +
"hash": "#%3Cfoo%3E%20%3Cbar%3E" + }, + { + "comment": "Fragment with <> on about:blank", + "input": "about:blank#<foo> <bar>", + "base": null, + "href": "about:blank#%3Cfoo%3E%20%3Cbar%3E", + "protocol": "about:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "blank", + "search": "", + "hash": "#%3Cfoo%3E%20%3Cbar%3E" + }, + { + "comment": "Fragment percent-encode set on data: URI; tabs and newlines are removed", + "input":"data:text/plain,test#\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "base": null, + "href": "data:text/plain,test#%00%01%1F%20!%22#$%&'()*+,-./09:;%3C=%3E?@AZ[\\]^_%60az{|}~%7F%C2%80%C2%81%C3%89%C3%A9", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "text/plain,test", + "search": "", + "hash": "#%00%01%1F%20!%22#$%&'()*+,-./09:;%3C=%3E?@AZ[\\]^_%60az{|}~%7F%C2%80%C2%81%C3%89%C3%A9" + }, + { + "comment": "Fragment percent-encode set on about:blank; tabs and newlines are removed", + "input": "about:blank#\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "base": null, + "href": "about:blank#%00%01%1F%20!%22#$%&'()*+,-./09:;%3C=%3E?@AZ[\\]^_%60az{|}~%7F%C2%80%C2%81%C3%89%C3%A9", + "protocol": "about:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "blank", + "search": "", + "hash": "#%00%01%1F%20!%22#$%&'()*+,-./09:;%3C=%3E?@AZ[\\]^_%60az{|}~%7F%C2%80%C2%81%C3%89%C3%A9" } ]