-
Notifications
You must be signed in to change notification settings - Fork 3.2k
HTML API: Auto-escape JavaScript and JSON script tag contents when necessary #10635
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: trunk
Are you sure you want to change the base?
Changes from 20 commits
4a8ca98
4a28ef2
0bef687
cdba027
ba54ae4
a697e9e
2b3d0d0
8246439
3f7f227
1362451
8d30680
3b5ef4e
f8cfdf9
501d201
bcc02ae
ea03441
c7d1827
a134e82
d4bd4b3
edae8d5
02ca3c0
d058a78
dfb63af
11f51c9
a3e0e27
288b952
a7495dd
d8c320c
369eefc
55d4e47
2ef0bf0
cb6b990
e399bf6
83c1fab
1872681
83ff62f
5979782
d469ae4
402ae9f
6b2c9ba
eef0ccb
71d2686
d4693a2
eb4e091
4c3b0b2
bfa60cf
3252160
0b9b2bd
b2533e1
ddeb301
b36fc32
1249af0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3811,29 +3811,81 @@ public function set_modifiable_text( string $plaintext_content ): bool { | |
|
|
||
| switch ( $this->get_tag() ) { | ||
| case 'SCRIPT': | ||
| /** | ||
| * This is over-protective, but ensures the update doesn't break | ||
| * the HTML structure of the SCRIPT element. | ||
| /* | ||
| * SCRIPT tag contents can be dangerous. | ||
| * | ||
| * The text `</script>` could close the SCRIPT element prematurely. | ||
| * | ||
| * More thorough analysis could track the HTML tokenizer states | ||
| * and to ensure that the SCRIPT element closes at the expected | ||
| * SCRIPT close tag as is done in {@see ::skip_script_data()}. | ||
| * The text `<script>` could enter the "script data double escaped state", preventing the | ||
| * SCRIPT element from closing as expected, for example: | ||
| * | ||
| * A SCRIPT element could be closed prematurely by contents | ||
| * like `</script>`. A SCRIPT element could be prevented from | ||
| * closing by contents like `<!--<script>`. | ||
| * <script> | ||
| * // If this "<!--" then "<script>" the closing tag will not be recognized. | ||
| * </script> | ||
| * <h1>This appears inside the preceding SCRIPT element.</h1> | ||
| * | ||
| * The following strings are essential for dangerous content, | ||
| * although they are insufficient on their own. This trade-off | ||
| * prevents dangerous scripts from being sent to the browser. | ||
| * It is also unlikely to produce HTML that may confuse more | ||
| * basic HTML tooling. | ||
| * The relevant state transitions happen on text like: | ||
| * 1. < | ||
| * 2. / (optional) | ||
| * 3. script (case-insensitive) | ||
| * 4. One of the following characters: | ||
| * - \t | ||
| * - \n | ||
| * - \r (\r and \r\n newlines are normalized to \n in HTML pre-processing) | ||
| * - \f | ||
| * - " " (U+0020 SPACE) | ||
| * - / | ||
| * - > | ||
| * | ||
| * @see https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state | ||
| */ | ||
| if ( | ||
| false !== stripos( $plaintext_content, '</script' ) || | ||
| false !== stripos( $plaintext_content, '<script' ) | ||
| ) { | ||
| return false; | ||
| /* | ||
| * JavaScript can be safely escaped. | ||
| */ | ||
|
||
| if ( $this->is_javascript_script_tag() ) { | ||
| $plaintext_content = preg_replace_callback( | ||
| /* | ||
| * This case-insensitive pattern consists of three groups: | ||
| * | ||
| * 1: "<" or "</" | ||
| * 2: "s" | ||
| * 3: "cript" + a trailing character that terminates a tag name. | ||
| */ | ||
sirreal marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| '~(</?)(s)(cript[\\t\\r\\n\\f />])~i', | ||
sirreal marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| static function ( $matches ) { | ||
| $escaped_s_char = 's' === $matches[2] | ||
| ? '\\u0073' | ||
| : '\\u0053'; | ||
sirreal marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| return "{$matches[1]}{$escaped_s_char}{$matches[3]}"; | ||
| }, | ||
| $plaintext_content | ||
| ); | ||
| } elseif ( $this->is_json_script_tag() ) { | ||
sirreal marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| /** | ||
| * JSON can be safely escaped. | ||
| * | ||
| * The following replacement may appear insufficient, "<" is replaced | ||
| * with its JSON escape sequence "\u003C" without considering whether | ||
| * the "<" is preceded by an escaping backslash. JSON does not support | ||
| * arbitrary character escaping in strings (unlike JavaScript) so "\<" | ||
| * is invalid JSON and does not need to be considered. | ||
| * | ||
| * @see https://www.json.org/json-en.html | ||
| */ | ||
| $plaintext_content = strtr( | ||
| $plaintext_content, | ||
| array( '<' => '\\u003C' ) | ||
sirreal marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ); | ||
| } else { | ||
| /* | ||
| * Other types of script tags cannot be escaped safely. | ||
| */ | ||
| return false; | ||
| } | ||
| } | ||
|
|
||
| $this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement( | ||
|
|
@@ -3891,6 +3943,162 @@ static function ( $tag_match ) { | |
| return false; | ||
| } | ||
|
|
||
| /** | ||
| * Indicates if the currently matched tag is a JavaScript script tag. | ||
| * | ||
| * @see https://html.spec.whatwg.org/multipage/scripting.html#prepare-the-script-element | ||
| * | ||
| * @since 7.0.0 | ||
| * | ||
| * @return bool True if the script tag will be evaluated as JavaScript. | ||
| */ | ||
| private function is_javascript_script_tag(): bool { | ||
| if ( 'SCRIPT' !== $this->get_tag() || $this->get_namespace() !== 'html' ) { | ||
| return false; | ||
| } | ||
|
|
||
| /* | ||
| * > If any of the following are true: | ||
| * > - el has a type attribute whose value is the empty string; | ||
| * > - el has no type attribute but it has a language attribute and that attribute's | ||
| * > value is the empty string; or | ||
| * > - el has neither a type attribute nor a language attribute, | ||
| * > then let the script block's type string for this script element be "text/javascript". | ||
| */ | ||
| $type_attr = $this->get_attribute( 'type' ); | ||
| $language_attr = $this->get_attribute( 'language' ); | ||
|
||
| if ( true === $type_attr || '' === $type_attr ) { | ||
| return true; | ||
| } | ||
| if ( | ||
| null === $type_attr | ||
| && ( null === $language_attr || true === $language_attr || '' === $language_attr ) | ||
| ) { | ||
| return true; | ||
| } | ||
|
|
||
| /* | ||
| * > Otherwise, if el has a type attribute, then let the script block's type string be | ||
| * > the value of that attribute with leading and trailing ASCII whitespace stripped. | ||
| * > Otherwise, el has a non-empty language attribute; let the script block's type string | ||
| * > be the concatenation of "text/" and the value of el's language attribute. | ||
| */ | ||
| $type_string = null !== $type_attr ? trim( $type_attr, " \t\f\r\n" ) : "text/{$language_attr}"; | ||
|
|
||
| /* | ||
| * > If the script block's type string is a JavaScript MIME type essence match, then | ||
| * > set el's type to "classic". | ||
| * | ||
| * > A string is a JavaScript MIME type essence match if it is an ASCII case-insensitive | ||
| * > match for one of the JavaScript MIME type essence strings. | ||
| * | ||
| * > A JavaScript MIME type is any MIME type whose essence is one of the following: | ||
| * > | ||
| * > - application/ecmascript | ||
| * > - application/javascript | ||
| * > - application/x-ecmascript | ||
| * > - application/x-javascript | ||
| * > - text/ecmascript | ||
| * > - text/javascript | ||
| * > - text/javascript1.0 | ||
| * > - text/javascript1.1 | ||
| * > - text/javascript1.2 | ||
| * > - text/javascript1.3 | ||
| * > - text/javascript1.4 | ||
| * > - text/javascript1.5 | ||
| * > - text/jscript | ||
| * > - text/livescript | ||
| * > - text/x-ecmascript | ||
| * > - text/x-javascript | ||
| * | ||
| * @see https://mimesniff.spec.whatwg.org/#javascript-mime-type-essence-match | ||
| * @see https://mimesniff.spec.whatwg.org/#javascript-mime-type | ||
| */ | ||
| switch ( strtolower( $type_string ) ) { | ||
| case 'application/ecmascript': | ||
| case 'application/javascript': | ||
| case 'application/x-ecmascript': | ||
| case 'application/x-javascript': | ||
| case 'text/ecmascript': | ||
| case 'text/javascript': | ||
| case 'text/javascript1.0': | ||
| case 'text/javascript1.1': | ||
| case 'text/javascript1.2': | ||
| case 'text/javascript1.3': | ||
| case 'text/javascript1.4': | ||
| case 'text/javascript1.5': | ||
| case 'text/jscript': | ||
| case 'text/livescript': | ||
| case 'text/x-ecmascript': | ||
| case 'text/x-javascript': | ||
| return true; | ||
|
|
||
| /* | ||
| * > Otherwise, if the script block's type string is an ASCII case-insensitive match for | ||
| * > the string "module", then set el's type to "module". | ||
| * | ||
| * A module is evaluated as JavaScript. | ||
| */ | ||
| case 'module': | ||
| return true; | ||
|
||
| } | ||
|
|
||
| /* | ||
| * > Otherwise, if the script block's type string is an ASCII case-insensitive match for the string "importmap", then set el's type to "importmap". | ||
| * > Otherwise, if the script block's type string is an ASCII case-insensitive match for the string "speculationrules", then set el's type to "speculationrules". | ||
| * | ||
| * These conditions indicate JSON content. | ||
| */ | ||
|
|
||
| /* | ||
| * > Otherwise, return. (No script is executed, and el's type is left as null.) | ||
| */ | ||
| return false; | ||
| } | ||
|
|
||
| /** | ||
| * Indicates if the currently matched tag is a JSON script tag. | ||
| * | ||
| * @since 7.0.0 | ||
| * | ||
| * @return bool True if the script tag should be treated as JSON. | ||
| */ | ||
| private function is_json_script_tag(): bool { | ||
| if ( 'SCRIPT' !== $this->get_tag() || $this->get_namespace() !== 'html' ) { | ||
| return false; | ||
| } | ||
|
|
||
| $type = $this->get_attribute( 'type' ); | ||
| if ( null === $type || true === $type || '' === $type ) { | ||
| return false; | ||
| } | ||
| $type = strtolower( trim( $type, " \t\f\r\n" ) ); | ||
|
|
||
| /* | ||
| * > … | ||
| * > Otherwise, if the script block's type string is an ASCII case-insensitive match for the string "importmap", then set el's type to "importmap". | ||
| * > Otherwise, if the script block's type string is an ASCII case-insensitive match for the string "speculationrules", then set el's type to "speculationrules". | ||
| * @see https://html.spec.whatwg.org/#script-processing-model | ||
| * | ||
| * > A JSON MIME type is any MIME type whose subtype ends in "+json" or whose essence | ||
| * > is "application/json" or "text/json". | ||
| * | ||
| * @todo The JSON MIME type handling handles some common cases but when MIME type parsing is available it should be leveraged here. | ||
| * | ||
| * @see https://mimesniff.spec.whatwg.org/#json-mime-type | ||
| */ | ||
| if ( | ||
| 'importmap' === $type || | ||
| 'speculationrules' === $type || | ||
| 'application/json' === $type || | ||
| 'text/json' === $type | ||
| ) { | ||
| return true; | ||
| } | ||
|
|
||
| return false; | ||
| } | ||
|
|
||
| /** | ||
| * Updates or creates a new attribute on the currently matched tag with the passed value. | ||
| * | ||
|
|
||

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we’re not checking for these state transitions. in the code following, we’re checking for the simplified transition, akin to the simplified diagram
I think it would be helpful to match the discussion of the approach with the comment. that is, I don’t know if it’s particularly helpful to spell out all of the characters when terser descriptions are warranted. e.g.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Those lines capture all of the relevant state transitions, it's really about these minimal transitions that either close the script or move to double-escaped:
The idea being that if those transitions are prevented then the script contents cannot break the HTML structure.
I agree the documentation here needs to be reviewed broadly. It's important to document the escaping well for posterity to understand why this is implemented as it is.