From 9cf5aa5d634fa048133a1f2752ede0c8c2cc72f0 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sat, 22 Nov 2025 14:13:30 +0100 Subject: [PATCH] parser.c: Record escape positions while parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We can then pass them to the decoder to save having to parse the string again. ``` == Parsing activitypub.json (58160 bytes) ruby 3.4.6 (2025-09-16 revision dbd83256b1) +YJIT +PRISM [arm64-darwin24] Warming up -------------------------------------- after 1.275k i/100ms Calculating ------------------------------------- after 12.774k (± 0.8%) i/s (78.29 μs/i) - 65.025k in 5.090834s Comparison: before: 12314.3 i/s after: 12773.8 i/s - 1.04x faster == Parsing twitter.json (567916 bytes) ruby 3.4.6 (2025-09-16 revision dbd83256b1) +YJIT +PRISM [arm64-darwin24] Warming up -------------------------------------- after 143.000 i/100ms Calculating ------------------------------------- after 1.441k (± 0.2%) i/s (693.86 μs/i) - 7.293k in 5.060345s Comparison: before: 1430.1 i/s after: 1441.2 i/s - 1.01x faster == Parsing citm_catalog.json (1727030 bytes) ruby 3.4.6 (2025-09-16 revision dbd83256b1) +YJIT +PRISM [arm64-darwin24] Warming up -------------------------------------- after 69.000 i/100ms Calculating ------------------------------------- after 695.919 (± 0.4%) i/s (1.44 ms/i) - 3.519k in 5.056691s Comparison: before: 687.8 i/s after: 695.9 i/s - 1.01x faster ``` --- ext/json/ext/parser/parser.c | 88 ++++++++++++++++++++++++++---------- 1 file changed, 63 insertions(+), 25 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index 2f584a51..0216eef9 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -616,8 +616,10 @@ static inline bool json_string_cacheable_p(const char *string, size_t length) return length <= JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH && rb_isalpha(string[0]); } -static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize) +static inline VALUE json_string_fastpath(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name) { + bool intern = is_name || config->freeze; + bool symbolize = is_name && config->symbolize_names; size_t bufferSize = stringEnd - string; if (is_name && state->in_array && RB_LIKELY(json_string_cacheable_p(string, bufferSize))) { @@ -636,8 +638,33 @@ static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *st return build_string(string, stringEnd, intern, symbolize); } -static VALUE json_string_unescape(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize) +#define JSON_MAX_UNESCAPE_POSITIONS 16 +typedef struct _json_unescape_positions { + long size; + const char **positions; + bool has_more; +} JSON_UnescapePositions; + +static inline const char *json_next_backslash(const char *pe, const char *stringEnd, JSON_UnescapePositions *positions) +{ + while (positions->size) { + positions->size--; + const char *next_position = positions->positions[0]; + positions->positions++; + return next_position; + } + + if (positions->has_more) { + return memchr(pe, '\\', stringEnd - pe); + } + + return NULL; +} + +static NOINLINE() VALUE json_string_unescape(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name, JSON_UnescapePositions *positions) { + bool intern = is_name || config->freeze; + bool symbolize = is_name && config->symbolize_names; size_t bufferSize = stringEnd - string; const char *p = string, *pe = string, *bufferStart; char *buffer; @@ -649,7 +676,7 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c #define APPEND_CHAR(chr) *buffer++ = chr; p = ++pe; - while (pe < stringEnd && (pe = memchr(pe, '\\', stringEnd - pe))) { + while (pe < stringEnd && (pe = json_next_backslash(pe, stringEnd, positions))) { if (pe > p) { MEMCPY(buffer, p, char, pe - p); buffer += pe - p; @@ -893,20 +920,6 @@ static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfi return object; } -static inline VALUE json_decode_string(JSON_ParserState *state, JSON_ParserConfig *config, const char *start, const char *end, bool escaped, bool is_name) -{ - VALUE string; - bool intern = is_name || config->freeze; - bool symbolize = is_name && config->symbolize_names; - if (escaped) { - string = json_string_unescape(state, start, end, is_name, intern, symbolize); - } else { - string = json_string_fastpath(state, start, end, is_name, intern, symbolize); - } - - return string; -} - static inline VALUE json_push_value(JSON_ParserState *state, JSON_ParserConfig *config, VALUE value) { if (RB_UNLIKELY(config->on_load_proc)) { @@ -964,22 +977,30 @@ static ALWAYS_INLINE() bool string_scan(JSON_ParserState *state) return false; } -static inline VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name) +static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name, const char *start) { - state->cursor++; - const char *start = state->cursor; - bool escaped = false; + const char *backslashes[JSON_MAX_UNESCAPE_POSITIONS]; + JSON_UnescapePositions positions = { + .size = 0, + .positions = backslashes, + .has_more = false, + }; - while (RB_UNLIKELY(string_scan(state))) { + do { switch (*state->cursor) { case '"': { - VALUE string = json_decode_string(state, config, start, state->cursor, escaped, is_name); + VALUE string = json_string_unescape(state, config, start, state->cursor, is_name, &positions); state->cursor++; return json_push_value(state, config, string); } case '\\': { + if (RB_LIKELY(positions.size < JSON_MAX_UNESCAPE_POSITIONS)) { + backslashes[positions.size] = state->cursor; + positions.size++; + } else { + positions.has_more = true; + } state->cursor++; - escaped = true; break; } default: @@ -988,12 +1009,29 @@ static inline VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig } state->cursor++; - } + } while (string_scan(state)); raise_parse_error("unexpected end of input, expected closing \"", state); return Qfalse; } +static ALWAYS_INLINE() VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name) +{ + state->cursor++; + const char *start = state->cursor; + + if (RB_UNLIKELY(!string_scan(state))) { + raise_parse_error("unexpected end of input, expected closing \"", state); + } + + if (RB_LIKELY(*state->cursor == '"')) { + VALUE string = json_string_fastpath(state, config, start, state->cursor, is_name); + state->cursor++; + return json_push_value(state, config, string); + } + return json_parse_escaped_string(state, config, is_name, start); +} + #if JSON_CPU_LITTLE_ENDIAN_64BITS // From: https://lemire.me/blog/2022/01/21/swar-explained-parsing-eight-digits/ // Additional References: