From e9d70c0e580ed8b20bdc40880ac072dc49556c69 Mon Sep 17 00:00:00 2001 From: Chris Huber Date: Wed, 24 Jun 2026 00:56:17 -0400 Subject: [PATCH 1/2] Improve raw HTML block conversion --- .../src/HtmlToBlocks/BlockFactory.php | 15 +++- .../src/HtmlToBlocks/HtmlTransformer.php | 78 ++++++++++++++++++- php-transformer/tests/contract/run.php | 36 +++++++++ ...materialized-script-asset-no-fallback.json | 2 + .../parity/html-anchor-inline-patterns.json | 3 +- .../html-interactive-chrome-primitives.json | 4 +- .../html-logo-nav-button-classification.json | 7 +- ...ml-static-script-metadata-no-fallback.json | 43 ++++++++++ .../parity/mixed-source-markdown.json | 3 +- 9 files changed, 182 insertions(+), 9 deletions(-) create mode 100644 php-transformer/tests/fixtures/parity/html-static-script-metadata-no-fallback.json diff --git a/php-transformer/src/HtmlToBlocks/BlockFactory.php b/php-transformer/src/HtmlToBlocks/BlockFactory.php index d87d8c6..f07a954 100644 --- a/php-transformer/src/HtmlToBlocks/BlockFactory.php +++ b/php-transformer/src/HtmlToBlocks/BlockFactory.php @@ -29,13 +29,26 @@ public function create(string $name, array $attrs = array(), array $innerBlocks return array( 'blockName' => $name, - 'attrs' => $attrs, + 'attrs' => $this->commentAttrs($name, $attrs), 'innerBlocks' => $innerBlocks, 'innerHTML' => $innerHtml, 'innerContent' => $innerContent, ); } + /** + * @param array $attrs + * @return array + */ + private function commentAttrs(string $name, array $attrs): array + { + if ( 'core/paragraph' === $name && preg_match('/^\s* $attrs * @param array> $innerBlocks diff --git a/php-transformer/src/HtmlToBlocks/HtmlTransformer.php b/php-transformer/src/HtmlToBlocks/HtmlTransformer.php index afaf408..8a0043a 100644 --- a/php-transformer/src/HtmlToBlocks/HtmlTransformer.php +++ b/php-transformer/src/HtmlToBlocks/HtmlTransformer.php @@ -49,6 +49,11 @@ final class HtmlTransformer */ private array $structureProvenance = array(); + /** + * @var array> + */ + private array $scriptMetadata = array(); + /** * @var array> */ @@ -86,6 +91,7 @@ public function transform(string $html, array $options = array()): TransformerRe $this->presentationProvenance = array(); $this->sourceProvenance = array(); $this->structureProvenance = array(); + $this->scriptMetadata = array(); $this->assetMetadata = $this->assetMetadataFromOptions($options); $this->staticClassPromotions = $this->detectStaticClassPromotions($html); $this->staticStyleRules = $this->staticStyleRules($html, (string) ($options['static_css'] ?? '')); @@ -167,6 +173,18 @@ public function transform(string $html, array $options = array()): TransformerRe ), ); + foreach ( $this->scriptMetadata as $metadata ) { + $diagnostics[] = array( + 'code' => 'html_static_script_metadata', + 'message' => 'Static script data was preserved as bounded metadata and does not require client script execution.', + 'source' => self::class, + 'reason' => 'script_static_metadata', + 'tag' => 'script', + 'selector' => $metadata['selector'] ?? null, + 'script_role' => $metadata['script_role'] ?? null, + ); + } + foreach ( $fallbacks as $fallback ) { if ( ! empty($fallback['diagnostic_code']) ) { $diagnostics[] = array( @@ -203,6 +221,7 @@ public function transform(string $html, array $options = array()): TransformerRe 'presentation_signals' => $this->presentationProvenance, 'source_provenance' => $sourceProvenance, 'structure_signals' => $this->structureProvenance, + 'script_metadata' => $this->scriptMetadata, ), ); $sourceReports['conversion_report'] = ConversionReportProjection::fromResultParts('html', $blocks, $fallbacks, $sourceReports, array(), $provenance, $metrics); @@ -718,6 +737,10 @@ private function convertElement(DOMElement $element, array &$fallbacks, bool $ca } if ( 'script' === $tagName ) { + if ( $this->captureStaticScriptMetadata($element) ) { + return null; + } + $this->captureScriptFallback($element, $fallbacks); return null; } @@ -773,7 +796,7 @@ private function convertElement(DOMElement $element, array &$fallbacks, bool $ca } } - if ( in_array($tagName, array( 'article', 'aside', 'body', 'div', 'footer', 'header', 'main', 'nav', 'section' ), true) ) { + if ( in_array($tagName, array( 'article', 'aside', 'body', 'center', 'div', 'footer', 'header', 'main', 'nav', 'section' ), true) ) { $logo = $this->logoPattern->match( $element, fn (DOMElement $sourceElement): array => $this->presentationAttributes($sourceElement), @@ -1438,7 +1461,7 @@ private function isNonContentRuntimeControl(DOMElement $element): bool private function isInlineContentElement(string $tagName): bool { - return in_array($tagName, array( 'abbr', 'b', 'cite', 'code', 'em', 'i', 'mark', 'small', 'span', 'strong', 'sub', 'sup', 'time' ), true); + return in_array($tagName, array( 'abbr', 'b', 'cite', 'code', 'em', 'font', 'i', 'mark', 'small', 'span', 'strong', 'sub', 'sup', 'time' ), true); } private function hasBlockContentChildren(DOMElement $element): bool @@ -2434,6 +2457,7 @@ private function captureScriptFallback(DOMElement $element, array &$fallbacks): { $boundedHtml = $this->boundedFallbackHtml($this->safeFallbackHtml($element)); $boundedBody = $this->boundedFallbackText(trim($element->textContent ?? '')); + $scriptRole = $this->scriptRole($element); $fallbacks[] = FallbackDiagnostic::build(array( 'type' => 'html', 'reason' => 'script_requires_runtime', @@ -2445,6 +2469,8 @@ private function captureScriptFallback(DOMElement $element, array &$fallbacks): 'attributes' => $this->safeScriptAttributes($element), 'context' => $this->sourceContext($element), 'events' => $this->eventMetadata($element), + 'script_role' => $scriptRole, + 'script_source_kind' => '' !== trim($this->attr($element, 'src')) ? 'external' : 'inline', 'text_length' => strlen(trim($element->textContent ?? '')), 'child_count' => $this->childElementCount($element), 'html' => $boundedHtml['html'], @@ -2456,6 +2482,54 @@ private function captureScriptFallback(DOMElement $element, array &$fallbacks): ), $this->fallbackProvenance); } + private function captureStaticScriptMetadata(DOMElement $element): bool + { + if ( '' !== trim($this->attr($element, 'src')) ) { + return false; + } + + $scriptRole = $this->scriptRole($element); + if ( 'data' !== $scriptRole ) { + return false; + } + + $boundedBody = $this->boundedFallbackText(trim($element->textContent ?? '')); + $this->scriptMetadata[] = array( + 'type' => 'script_metadata', + 'reason' => 'script_static_metadata', + 'source_format' => 'html', + 'tag' => 'script', + 'selector' => $this->elementSelector($element), + 'attributes' => $this->safeScriptAttributes($element), + 'context' => $this->sourceContext($element), + 'script_role' => $scriptRole, + 'script_source_kind' => 'inline', + 'body' => $boundedBody['text'], + 'body_bytes' => $boundedBody['bytes'], + 'body_truncated' => $boundedBody['truncated'], + ); + + return true; + } + + private function scriptRole(DOMElement $element): string + { + $type = strtolower(trim($this->attr($element, 'type'))); + if ( '' === $type || in_array($type, array( 'text/javascript', 'application/javascript', 'module' ), true) ) { + return 'runtime'; + } + + if ( str_starts_with($type, 'application/ld+json') || in_array($type, array( 'application/json', 'importmap', 'speculationrules' ), true) ) { + return 'data'; + } + + if ( str_starts_with($type, 'text/') && ! in_array($type, array( 'text/javascript', 'text/ecmascript' ), true) ) { + return 'data'; + } + + return 'runtime'; + } + /** * @return array */ diff --git a/php-transformer/tests/contract/run.php b/php-transformer/tests/contract/run.php index 6e6c994..9397caa 100644 --- a/php-transformer/tests/contract/run.php +++ b/php-transformer/tests/contract/run.php @@ -244,6 +244,17 @@ function serialize_blocks(array $blocks): string $assert(! str_contains((string) $buttonResult['serialized_blocks'], '\\u003c'), 'button serialization avoids escaped nested HTML attrs'); $assert('pass' === ($buttonResult['source_reports']['wp_block_validity']['status'] ?? ''), 'HTML transform exposes passing WordPress block validity report for generated buttons'); +$linkedLogoResult = ( new HtmlTransformer() )->transform( + '
' +)->toArray(); +$linkedLogoBlock = $linkedLogoResult['blocks'][0] ?? array(); +$linkedLogoSerialized = (string) ($linkedLogoResult['serialized_blocks'] ?? ''); +$assert('core/paragraph' === ($linkedLogoBlock['blockName'] ?? ''), 'linked logo text converts to a paragraph block'); +$assert(! array_key_exists('content', is_array($linkedLogoBlock['attrs'] ?? null) ? $linkedLogoBlock['attrs'] : array()), 'paragraph source content is not serialized as a block comment attribute'); +$assert(str_contains($linkedLogoSerialized, ''), 'linked logo paragraph preserves anchor markup in saved HTML'); +$assert(! str_contains($linkedLogoSerialized, '\\u003ca'), 'linked logo paragraph avoids raw anchor HTML in delimiter JSON'); +$assert('pass' === ($linkedLogoResult['source_reports']['wp_block_validity']['status'] ?? ''), 'linked logo paragraph passes generated block validity checks'); + $invalidButtonBlocks = array( array( 'blockName' => 'core/button', @@ -474,6 +485,31 @@ function serialize_blocks(array $blocks): string $assert(($staticPlan['totals']['routes'] ?? null) === ($staticSummary['route_count'] ?? null), 'conversion report route count matches materialization plan totals'); $assert(($staticPlan['totals']['navigation_links'] ?? null) === ($staticSummary['navigation_link_count'] ?? null), 'conversion report navigation link count matches materialization plan totals'); $assert(($staticPlan['totals']['menus'] ?? null) === ($staticSummary['menu_count'] ?? null), 'conversion report menu count matches materialization plan totals'); + +$legacyFrontPageSite = $compiler->compile( + array( + 'entrypoint' => 'index.html', + 'files' => array( + 'index.html' => '

Home

', + 'about-us.html' => 'About Us
About Hank\'s Tool Rental
Family owned since 1987.
We answer the phone.
', + ), + ) +)->toArray(); +$legacyPlanPage = null; +foreach ( $legacyFrontPageSite['source_reports']['materialization_plan']['pages'] ?? array() as $planPage ) { + if ( 'about-us.html' === ($planPage['source_path'] ?? '') ) { + $legacyPlanPage = $planPage; + } +} +$legacyBlockMarkup = (string) ($legacyPlanPage['block_markup'] ?? ''); +$assert('' !== trim($legacyBlockMarkup), 'legacy HTML 4 FrontPage-era documents produce non-empty materialization block markup'); +$assert(str_contains($legacyBlockMarkup, 'About Hank\'s Tool Rental'), 'legacy HTML 4 FrontPage-era table/font/center content is preserved'); +$assert(str_contains($legacyBlockMarkup, '