Skip to content

Commit b9c099f

Browse files
committed
refactor(sanitizer): split HtmlPurifierSanitizer into modular components
BREAKING CHANGE: The internal structure of HtmlPurifierSanitizer has been reorganized.
- Create new namespace KaririCode\Sanitizer\Processor\Domain\HtmlPurifier
- Split functionality into specialized classes:
  - Config/Configuration: Manage allowed tags and attributes
  - Dom/DomHandler: Handle DOM operations
  - NodeSanitizer/NodeSanitizer: Process DOM nodes
  - Cleaner/InputCleaner: Pre-process input
  - Cleaner/OutputCleaner: Post-process output
  - AttributeCleaner/AttributeCleaner: Handle attribute sanitization
1 parent 5ca5c98 commit b9c099f

File tree

8 files changed

+557
-196
lines changed

8 files changed

+557
-196
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace KaririCode\Sanitizer\Processor\Domain\HtmlPurifier\Cleaner;
6+
7+
use KaririCode\Sanitizer\Processor\Domain\HtmlPurifier\Configuration;
8+
9+
/**
 * Strips attributes that are not allow-listed for the element they sit on.
 *
 * The allow-list is read from Configuration::getAllowedAttributes() and is
 * treated as a map of attribute name => list of tag names that may carry it.
 * Elements whose tag is not in Configuration::getAllowedTags() are left
 * untouched by this class.
 */
final class AttributeCleaner
{
    /** Attributes whose value a browser interprets as a URL. */
    private const URL_ATTRIBUTES = [
        'href', 'src', 'action', 'formaction', 'poster', 'background', 'cite', 'data', 'xlink:href',
    ];

    /** URL schemes that run script when the link or resource is activated. */
    private const SCRIPT_SCHEMES = ['javascript:', 'vbscript:'];

    public function __construct(
        private readonly Configuration $config
    ) {
    }

    /**
     * Removes every attribute of $element that is not allowed for its tag, and
     * every URL attribute whose value uses a script-executing scheme.
     */
    public function cleanAttributes(\DOMElement $element): void
    {
        // Lower-case once; the configured names are compared in lower case.
        $tagName = strtolower($element->tagName);

        if (!in_array($tagName, $this->config->getAllowedTags(), true)) {
            return;
        }

        // Snapshot the attributes first: the live map shrinks while we remove from it.
        foreach (iterator_to_array($element->attributes) as $attr) {
            $attrName = strtolower($attr->name);

            if (
                !$this->isAttributeAllowed($attrName, $tagName)
                || $this->hasScriptUrl($attrName, $attr->value)
            ) {
                $element->removeAttribute($attr->name);
            }
        }
    }

    /**
     * Tells whether the (lower-case) attribute is allow-listed for the (lower-case) tag.
     */
    private function isAttributeAllowed(string $attrName, string $tagName): bool
    {
        $allowedAttributes = $this->config->getAllowedAttributes();

        return isset($allowedAttributes[$attrName])
            && in_array($tagName, $allowedAttributes[$attrName], true);
    }

    /**
     * Tells whether a URL attribute carries a script-executing scheme.
     *
     * Whitespace and control characters are removed before the comparison so
     * that values such as " java\tscript:..." are recognised as well.
     */
    private function hasScriptUrl(string $attrName, string $value): bool
    {
        if (!in_array($attrName, self::URL_ATTRIBUTES, true)) {
            return false;
        }

        $compact = preg_replace('/[\x00-\x20]+/', '', $value);
        if ($compact === null) {
            // The value could not be inspected; drop it rather than let it through.
            return true;
        }

        $compact = strtolower($compact);
        foreach (self::SCRIPT_SCHEMES as $scheme) {
            if (str_starts_with($compact, $scheme)) {
                return true;
            }
        }

        return false;
    }
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace KaririCode\Sanitizer\Processor\Domain\HtmlPurifier\Cleaner;
6+
7+
/**
 * Textual pre-filter applied to raw HTML before it is parsed into a DOM.
 *
 * Removes <script> blocks, inline "on*" event-handler attributes and HTML
 * comments using regular expressions.
 */
final class InputCleaner
{
    /** A <script> element with its content; the closing tag may contain whitespace. */
    private const SCRIPT_BLOCK = '/<script\b[^>]*>.*?<\/script\s*>/is';

    /** An on*= attribute with a double-quoted, single-quoted or unquoted value. */
    private const EVENT_HANDLER = '/\bon\w+\s*=\s*(?:"[^"]*"|\'[^\']*\'|[^\s>]+)/i';

    /** An HTML comment, possibly spanning several lines. */
    private const COMMENT = '/<!--.*?-->/s';

    /**
     * Returns $input with scripts, inline event handlers and comments removed.
     *
     * @throws \RuntimeException when the regex engine fails (e.g. backtrack limit reached)
     */
    public function clean(string $input): string
    {
        foreach ([self::SCRIPT_BLOCK, self::EVENT_HANDLER, self::COMMENT] as $pattern) {
            $input = $this->strip($pattern, $input);
        }

        return $input;
    }

    /**
     * Deletes every match of $pattern from $subject.
     *
     * preg_replace() returns null on engine errors; surface that as an explicit
     * exception instead of letting null travel on as if it were cleaned input.
     */
    private function strip(string $pattern, string $subject): string
    {
        $result = preg_replace($pattern, '', $subject);

        if ($result === null) {
            throw new \RuntimeException('Unable to clean HTML input: ' . preg_last_error_msg());
        }

        return $result;
    }
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace KaririCode\Sanitizer\Processor\Domain\HtmlPurifier\Cleaner;
6+
7+
/**
 * Textual post-filter applied to the serialized HTML.
 *
 * Removes remaining HTML comments and normalizes whitespace.
 */
final class OutputCleaner
{
    /**
     * Returns $output without comments, with every whitespace run collapsed to
     * a single space and with leading/trailing whitespace trimmed.
     *
     * @throws \RuntimeException when the regex engine fails
     */
    public function clean(string $output): string
    {
        // Remove any remaining HTML comments
        $output = $this->replace('/<!--.*?-->/s', '', $output);

        // Normalize whitespace. Collapsing runs and trimming the ends is enough;
        // a separate per-line trim would also eat the newlines around blank lines
        // and glue neighbouring content together ("a\n\nb" => "ab").
        $output = $this->replace('/\s+/', ' ', $output);

        return trim($output);
    }

    /**
     * preg_replace() wrapper that turns a null result (engine error) into an exception.
     */
    private function replace(string $pattern, string $replacement, string $subject): string
    {
        $result = preg_replace($pattern, $replacement, $subject);

        if ($result === null) {
            throw new \RuntimeException('Unable to clean HTML output: ' . preg_last_error_msg());
        }

        return $result;
    }
}
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace KaririCode\Sanitizer\Processor\Domain\HtmlPurifier;
6+
7+
/**
 * Holds the tag and attribute allow-lists.
 *
 * Names are stored in lower case so that they can be compared directly
 * against lower-cased DOM tag and attribute names.
 */
final class Configuration
{
    private const DEFAULT_ALLOWED_TAGS = [
        'p', 'br', 'strong', 'em', 'u', 'ol', 'ul', 'li',
        'a', 'img', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    ];
    private const DEFAULT_ALLOWED_ATTRIBUTES = [
        'href' => ['a'],
        'src' => ['img'],
        'alt' => ['img'],
    ];

    /** @var list<string> lower-case tag names */
    private array $allowedTags;

    /** @var array<string, list<string>> lower-case attribute name => lower-case tag names */
    private array $allowedAttributes;

    public function __construct()
    {
        $this->allowedTags = self::DEFAULT_ALLOWED_TAGS;
        $this->allowedAttributes = self::DEFAULT_ALLOWED_ATTRIBUTES;
    }

    /**
     * Overrides the allow-lists.
     *
     * Recognised keys: 'allowedTags' (list of tag names) and 'allowedAttributes'
     * (attribute name => list of tag names). A missing key keeps the current value.
     * Names are lower-cased; non-string entries are ignored.
     */
    public function configure(array $options): void
    {
        if (isset($options['allowedTags'])) {
            $this->allowedTags = self::normalizeNames($options['allowedTags']);
        }

        if (isset($options['allowedAttributes'])) {
            $this->allowedAttributes = self::normalizeAttributes($options['allowedAttributes']);
        }
    }

    /** @return list<string> */
    public function getAllowedTags(): array
    {
        return $this->allowedTags;
    }

    /** @return array<string, list<string>> */
    public function getAllowedAttributes(): array
    {
        return $this->allowedAttributes;
    }

    /**
     * Lower-cases and de-duplicates a list of names, preserving first-seen order.
     *
     * @return list<string>
     */
    private static function normalizeNames(array $names): array
    {
        $normalized = [];

        foreach ($names as $name) {
            if (!is_string($name) || $name === '') {
                continue;
            }

            $name = strtolower($name);
            if (!in_array($name, $normalized, true)) {
                $normalized[] = $name;
            }
        }

        return $normalized;
    }

    /**
     * Lower-cases attribute names and their tag lists; entries whose names differ
     * only by case are merged.
     *
     * @return array<string, list<string>>
     */
    private static function normalizeAttributes(array $attributes): array
    {
        $normalized = [];

        foreach ($attributes as $attribute => $tags) {
            if (!is_string($attribute) || !is_array($tags)) {
                continue;
            }

            $key = strtolower($attribute);
            $normalized[$key] = self::normalizeNames(
                array_merge($normalized[$key] ?? [], array_values($tags))
            );
        }

        return $normalized;
    }
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace KaririCode\Sanitizer\Processor\Domain\HtmlPurifier;
6+
7+
/**
 * Owns the DOMDocument used for sanitizing and wraps loading/serializing.
 *
 * Input is parsed inside a temporary <div id="temp-root"> wrapper; getRoot()
 * returns that wrapper so callers can work on its children.
 */
final class DomHandler
{
    private const ROOT_ID = 'temp-root';

    private \DOMDocument $dom;
    private ?\DOMElement $root = null;

    public function __construct()
    {
        $this->dom = new \DOMDocument('1.0', 'UTF-8');
    }

    /**
     * Parses an HTML fragment (expected to be UTF-8) into the internal document.
     *
     * Parser diagnostics are suppressed and discarded, and the caller's
     * libxml error-handling mode is restored afterwards.
     */
    public function loadHtml(string $input): void
    {
        $wrappedInput = '<div id="' . self::ROOT_ID . '">' . $input . '</div>';

        // Encode every non-ASCII character as a numeric entity so the parser
        // does not have to guess the encoding. Replaces the deprecated
        // mb_convert_encoding(..., 'HTML-ENTITIES', ...).
        $encoded = mb_encode_numericentity($wrappedInput, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');

        $previous = libxml_use_internal_errors(true);
        try {
            $this->dom->loadHTML($encoded, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
        } finally {
            // Drop buffered parser errors so they do not pile up across calls,
            // then hand the global libxml setting back as we found it.
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        $this->root = $this->dom->getElementById(self::ROOT_ID);
    }

    /**
     * Returns the temporary wrapper element of the last loadHtml() call, or
     * null if nothing has been loaded or the wrapper could not be found.
     */
    public function getRoot(): ?\DOMElement
    {
        return $this->root;
    }

    /**
     * Serializes $node to HTML; yields an empty string when serialization fails.
     */
    public function saveHtml(\DOMNode $node): string
    {
        return $this->dom->saveHTML($node) ?: '';
    }

    /**
     * Creates an empty fragment owned by the internal document.
     */
    public function createDocumentFragment(): \DOMDocumentFragment
    {
        return $this->dom->createDocumentFragment();
    }
}
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace KaririCode\Sanitizer\Processor\Domain\HtmlPurifier;
6+
7+
/**
 * Walks a DOM subtree and enforces the configured allow-lists.
 *
 * Elements whose tag is not allowed are unwrapped (their children take their
 * place); allowed elements keep only the attributes allow-listed for their tag.
 */
final class NodeSanitizer
{
    public function __construct(
        private readonly Configuration $config
    ) {
    }

    /**
     * Sanitizes all descendants of $node in place. $node itself is not checked.
     */
    public function sanitizeNode(\DOMNode $node): void
    {
        if (!$node->hasChildNodes()) {
            return;
        }

        $allowedTags = $this->config->getAllowedTags();

        // Snapshot the children: the live node list changes while we unwrap.
        foreach (iterator_to_array($node->childNodes) as $child) {
            if (!$child instanceof \DOMElement) {
                continue;
            }

            $tagName = strtolower($child->tagName);

            // Depth-first: descendants are clean before this element is judged,
            // so anything hoisted by unwrapping has already been processed.
            $this->sanitizeNode($child);

            if (in_array($tagName, $allowedTags, true)) {
                $this->removeDisallowedAttributes($child, $tagName);
            } else {
                $this->unwrapNode($node, $child);
            }
        }
    }

    /**
     * Removes every attribute that is not allow-listed for $tagName.
     *
     * The allow-list is read as a map of attribute name => list of tag names.
     */
    private function removeDisallowedAttributes(\DOMElement $element, string $tagName): void
    {
        $allowedAttributes = $this->config->getAllowedAttributes();

        foreach (iterator_to_array($element->attributes) as $attr) {
            $tagsForAttribute = $allowedAttributes[strtolower($attr->name)] ?? null;

            if (!is_array($tagsForAttribute) || !in_array($tagName, $tagsForAttribute, true)) {
                $element->removeAttribute($attr->name);
            }
        }
    }

    /**
     * Replaces $element with its own children, preserving their order.
     *
     * Children are moved one by one in front of the element, which avoids
     * relying on $parent->ownerDocument (null when the parent is the document)
     * and on inserting a possibly empty document fragment.
     */
    private function unwrapNode(\DOMNode $parent, \DOMElement $element): void
    {
        while ($element->firstChild !== null) {
            $parent->insertBefore($element->firstChild, $element);
        }

        $parent->removeChild($element);
    }
}

src/Processor/Domain/HtmlPurifierSanitizer.php

Lines changed: 26 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -6,146 +6,53 @@
66

77
use KaririCode\Contract\Processor\ConfigurableProcessor;
88
use KaririCode\Sanitizer\Processor\AbstractSanitizerProcessor;
9+
use KaririCode\Sanitizer\Processor\Domain\HtmlPurifier\Cleaner\InputCleaner;
10+
use KaririCode\Sanitizer\Processor\Domain\HtmlPurifier\Cleaner\OutputCleaner;
11+
use KaririCode\Sanitizer\Processor\Domain\HtmlPurifier\Configuration;
12+
use KaririCode\Sanitizer\Processor\Domain\HtmlPurifier\DomHandler;
13+
use KaririCode\Sanitizer\Processor\Domain\HtmlPurifier\NodeSanitizer;
914

1015
final class HtmlPurifierSanitizer extends AbstractSanitizerProcessor implements ConfigurableProcessor
1116
{
12-
private const DEFAULT_ALLOWED_TAGS = ['p', 'br', 'strong', 'em', 'u', 'ol', 'ul', 'li', 'a', 'img'];
13-
private const DEFAULT_ALLOWED_ATTRIBUTES = ['href' => ['a'], 'src' => ['img'], 'alt' => ['img']];
14-
private const DEFAULT_ENCODING = 'UTF-8';
15-
16-
private array $allowedTags;
17-
private array $allowedAttributes;
18-
private string $encoding;
17+
private Configuration $config;
18+
private DomHandler $domHandler;
19+
private NodeSanitizer $nodeSanitizer;
20+
private InputCleaner $inputCleaner;
21+
private OutputCleaner $outputCleaner;
1922

2023
public function __construct()
2124
{
22-
$this->allowedTags = self::DEFAULT_ALLOWED_TAGS;
23-
$this->allowedAttributes = self::DEFAULT_ALLOWED_ATTRIBUTES;
24-
$this->encoding = self::DEFAULT_ENCODING;
25+
$this->config = new Configuration();
26+
$this->domHandler = new DomHandler();
27+
$this->nodeSanitizer = new NodeSanitizer($this->config);
28+
$this->inputCleaner = new InputCleaner();
29+
$this->outputCleaner = new OutputCleaner();
2530
}
2631

2732
public function configure(array $options): void
2833
{
29-
$this->allowedTags = $options['allowedTags'] ?? $this->allowedTags;
30-
$this->allowedAttributes = $options['allowedAttributes'] ?? $this->allowedAttributes;
31-
$this->encoding = $options['encoding'] ?? $this->encoding;
34+
$this->config->configure($options);
3235
}
3336

3437
public function process(mixed $input): string
3538
{
3639
$input = $this->guardAgainstNonString($input);
37-
$input = $this->sanitizeHtml($input);
38-
39-
$dom = new \DOMDocument('1.0', $this->encoding);
40-
$this->loadHtmlToDom($dom, $input);
41-
$this->filterNodes($dom->getElementsByTagName('*'));
40+
$input = $this->inputCleaner->clean($input);
4241

43-
return $this->cleanHtmlOutput($dom->saveHTML());
44-
}
42+
$this->domHandler->loadHtml($input);
4543

46-
private function filterNodes(\DOMNodeList $nodes): void
47-
{
48-
for ($i = $nodes->length - 1; $i >= 0; --$i) {
49-
$node = $nodes->item($i);
50-
if (!$this->isAllowedTag($node->nodeName)) {
51-
$this->unwrapNode($node);
52-
} else {
53-
$this->filterAttributes($node);
54-
}
44+
$root = $this->domHandler->getRoot();
45+
if (!$root) {
46+
return '';
5547
}
56-
}
57-
58-
private function filterAttributes(\DOMElement $element): void
59-
{
60-
for ($i = $element->attributes->length - 1; $i >= 0; --$i) {
61-
/** @var DOMNode */
62-
$attr = $element->attributes->item($i);
63-
if (!$this->isAllowedAttribute($element->nodeName, $attr->name)) {
64-
$element->removeAttribute($attr->name);
65-
}
66-
}
67-
}
68-
69-
private function sanitizeHtml(string $html): string
70-
{
71-
$html = $this->removeScripts($html);
7248

73-
return $this->removeComments($html);
74-
}
75-
76-
private function loadHtmlToDom(\DOMDocument $dom, string $html): void
77-
{
78-
libxml_use_internal_errors(true);
79-
$isLoaded = $dom->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', $this->encoding), LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
80-
$errors = libxml_get_errors();
81-
libxml_clear_errors();
82-
}
83-
84-
private function cleanHtmlOutput(string $output): string
85-
{
86-
$output = preg_replace('/^<!DOCTYPE.+?>/', '', $output);
87-
$output = str_replace(['<html>', '</html>', '<body>', '</body>'], '', $output);
88-
89-
return trim($output);
90-
}
91-
92-
private function isAllowedTag(string $tagName): bool
93-
{
94-
return in_array(strtolower($tagName), $this->allowedTags, true);
95-
}
96-
97-
private function isAllowedAttribute(string $elementName, string $attributeName): bool
98-
{
99-
return isset($this->allowedAttributes[strtolower($attributeName)])
100-
&& in_array(strtolower($elementName), $this->allowedAttributes[strtolower($attributeName)], true);
101-
}
102-
103-
private function unwrapNode(\DOMNode $node): void
104-
{
105-
$parent = $node->parentNode;
106-
while ($node->firstChild) {
107-
$parent->insertBefore($node->firstChild, $node);
108-
}
109-
$parent->removeChild($node);
110-
}
111-
112-
private function removeScripts(string $html): string
113-
{
114-
return $this->removeElementsByTagName('script', $html);
115-
}
116-
117-
private function removeComments(string $html): string
118-
{
119-
return $this->removeElementsByXPath('//comment()', $html);
120-
}
121-
122-
private function removeElementsByTagName(string $tagName, string $html): string
123-
{
124-
$dom = new \DOMDocument();
125-
libxml_use_internal_errors(true);
126-
$dom->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', $this->encoding), LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
127-
libxml_clear_errors();
128-
129-
$elements = $dom->getElementsByTagName($tagName);
130-
while ($elements->length > 0) {
131-
$elements->item(0)->parentNode->removeChild($elements->item(0));
132-
}
133-
134-
return $dom->saveHTML();
135-
}
136-
137-
private function removeElementsByXPath(string $query, string $html): string
138-
{
139-
$dom = new \DOMDocument();
140-
libxml_use_internal_errors(true);
141-
$dom->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', $this->encoding), LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
142-
libxml_clear_errors();
49+
$this->nodeSanitizer->sanitizeNode($root);
14350

144-
$xpath = new \DOMXPath($dom);
145-
foreach ($xpath->query($query) as $element) {
146-
$element->parentNode->removeChild($element);
51+
$output = '';
52+
foreach ($root->childNodes as $child) {
53+
$output .= $this->domHandler->saveHtml($child);
14754
}
14855

149-
return $dom->saveHTML();
56+
return $this->outputCleaner->clean($output);
15057
}
15158
}

0 commit comments

Comments
 (0)