From e670fd7a0e4bd0a30d02d00fe901cb5cb861a021 Mon Sep 17 00:00:00 2001 From: Ilia Alshanetsky Date: Sat, 11 Apr 2026 14:12:09 -0400 Subject: [PATCH] ext/standard: speed up php_url_parse_ex2 by ~12% Three related changes to ext/standard/url.c targeting the ctype macros on the parse_url hot path. On a 17-URL mix (17M parses per run, CPU pinned, same-session A/B), median wall time drops from 1.90s to 1.68s, a ~12% reduction and ~13% throughput increase (8.94M/s to 10.10M/s). 1. php_replace_controlchars replaces its iscntrl() call with an inline `c < 0x20 || c == 0x7f` comparison. Callgrind showed iscntrl at ~14% of total instructions on a realistic URL workload; glibc's iscntrl goes through __ctype_b_loc() per byte for a TLS lookup and table deref, which defeats auto-vectorization. URL components are bytes, not locale-dependent text, so C/POSIX semantics are what we want regardless of the process locale. The Zend language scanner uses the same pattern (yych <= 0x1F). This runs once per component per parse, up to 7 times. 2. The scheme-validation walk uses isalpha/isdigit which have the same __ctype_b_loc tax. I extracted the check into php_url_is_scheme_char with an inline ASCII test: ((c | 0x20) - 'a' < 26u) || (c - '0' < 10u) for the letter/digit half, plus the three literal comparisons for '+', '-' and '.'. The scheme loop runs once per byte of the scheme on every parse. A helper php_url_is_ascii_digit covers the two isdigit call sites in the port-scan loops (one in the mailto-branch port probe, one in the parse_port fallback). 3. The three branches that allocate ret->scheme all followed zend_string_init with a php_replace_controlchars call. The scheme loop above has already rejected any byte that isn't in [a-zA-Z0-9+.-], so the control-char scan on scheme is dead work. Removed from all three sites. No behavior change under the C/POSIX locale: there the inline comparisons are identical in behavior to the ctype macros. Under any other LC_CTYPE the ctype macros may classify bytes differently, and the new code intentionally keeps C/POSIX semantics, since URL bytes are never locale-dependent. 
I checked that contaminated inputs like http://ex\x7fample.com/p\x1fath still get their control bytes replaced with underscores. --- ext/standard/url.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/ext/standard/url.c b/ext/standard/url.c index 4ddf7f80c64f..0a4d9ce091d2 100644 --- a/ext/standard/url.c +++ b/ext/standard/url.c @@ -47,6 +47,17 @@ PHPAPI void php_url_free(php_url *theurl) } /* }}} */ +static zend_always_inline bool php_url_is_scheme_char(unsigned char c) +{ + return ((c | 0x20) - 'a' < 26u) || (c - '0' < 10u) + || c == '+' || c == '-' || c == '.'; +} + +static zend_always_inline bool php_url_is_ascii_digit(unsigned char c) +{ + return c - '0' < 10u; +} + static void php_replace_controlchars(char *str, size_t len) { unsigned char *s = (unsigned char *)str; @@ -55,8 +66,8 @@ static void php_replace_controlchars(char *str, size_t len) ZEND_ASSERT(str != NULL); while (s < e) { - if (iscntrl(*s)) { - *s='_'; + if (UNEXPECTED(*s < 0x20 || *s == 0x7f)) { + *s = '_'; } s++; } @@ -103,7 +114,7 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port p = s; while (p < e) { /* scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] */ - if (!isalpha(*p) && !isdigit(*p) && *p != '+' && *p != '.' 
&& *p != '-') { + if (!php_url_is_scheme_char((unsigned char) *p)) { if (e + 1 < ue && e < binary_strcspn(s, ue, "?#")) { goto parse_port; } else if (s + 1 < ue && *s == '/' && *(s + 1) == '/') { /* relative-scheme URL */ @@ -119,7 +130,6 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port if (e + 1 == ue) { /* only scheme is available */ ret->scheme = zend_string_init(s, (e - s), 0); - php_replace_controlchars(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme)); return ret; } @@ -132,7 +142,7 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port * correctly parse things like a.com:80 */ p = e + 1; - while (p < ue && isdigit(*p)) { + while (p < ue && php_url_is_ascii_digit((unsigned char) *p)) { p++; } @@ -141,13 +151,11 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port } ret->scheme = zend_string_init(s, (e-s), 0); - php_replace_controlchars(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme)); s = e + 1; goto just_path; } else { ret->scheme = zend_string_init(s, (e-s), 0); - php_replace_controlchars(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme)); if (e + 2 < ue && *(e + 2) == '/') { s = e + 3; @@ -172,7 +180,7 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port p = e + 1; pp = p; - while (pp < ue && pp - p < 6 && isdigit(*pp)) { + while (pp < ue && pp - p < 6 && php_url_is_ascii_digit((unsigned char) *pp)) { pp++; }