Skip to content

Commit e80dd99

Browse files
committed
refactor(processor): restructure sanitizers and add new domain, input, and security modules
- Removed legacy cleaner classes: EmailAddressCleaner, NumericValueCleaner, UrlAddressCleaner - Removed HtmlTagRemover, WhitespaceRemover, and HtmlSpecialCharsEncoder - Added new domain-level sanitizers: HtmlPurifierSanitizer, JsonSanitizer, and MarkdownSanitizer - Reorganized and added input sanitizers: HtmlSpecialCharsSanitizer, NormalizeLineBreaksSanitizer, StripTagsSanitizer (renamed from HtmlPurifier), TrimSanitizer - Added new security sanitizers: FilenameSanitizer, SqlInjectionSanitizer, XssSanitizer (moved to security module) - Updated and added corresponding test cases for all new and renamed sanitizers - Removed outdated test files for the deleted processors - Modified tests/application.php for compatibility with the new processor structure
1 parent 1776fa2 commit e80dd99

34 files changed

+870
-498
lines changed

src/Processor/Cleaner/EmailAddressCleaner.php

Lines changed: 0 additions & 18 deletions
This file was deleted.

src/Processor/Cleaner/NumericValueCleaner.php

Lines changed: 0 additions & 19 deletions
This file was deleted.

src/Processor/Cleaner/UrlAddressCleaner.php

Lines changed: 0 additions & 18 deletions
This file was deleted.
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace KaririCode\Sanitizer\Processor\Domain;
6+
7+
use KaririCode\Contract\Processor\ConfigurableProcessor;
8+
use KaririCode\Sanitizer\Processor\AbstractSanitizerProcessor;
9+
10+
/**
 * Sanitizes an HTML fragment against an allow-list of tags and attributes.
 *
 * The input is parsed with DOMDocument. <script> elements and comments are
 * removed together with their content, elements whose tag is not allow-listed
 * are unwrapped (their children are kept), and attributes that are not
 * allow-listed for their element are dropped.
 *
 * Supported configure() options: 'allowedTags', 'allowedAttributes', 'encoding'.
 */
final class HtmlPurifierSanitizer extends AbstractSanitizerProcessor implements ConfigurableProcessor
{
    private const DEFAULT_ALLOWED_TAGS = ['p', 'br', 'strong', 'em', 'u', 'ol', 'ul', 'li', 'a', 'img'];
    private const DEFAULT_ALLOWED_ATTRIBUTES = ['href' => ['a'], 'src' => ['img'], 'alt' => ['img']];
    private const DEFAULT_ENCODING = 'UTF-8';

    /** Attributes whose value is a URL and therefore needs a scheme check. */
    private const URL_ATTRIBUTES = ['href', 'src'];

    /** URL scheme prefixes that execute script when followed or loaded. */
    private const FORBIDDEN_URL_SCHEMES = ['javascript:', 'vbscript:'];

    /** Parse as a fragment: no implied <html>/<body> wrappers and no default doctype. */
    private const LIBXML_OPTIONS = LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD;

    /** @var list<string> Lower-case tag names that are kept. */
    private array $allowedTags;

    /** @var array<string, list<string>> Attribute name => tag names it is allowed on. */
    private array $allowedAttributes;

    private string $encoding;

    public function __construct()
    {
        $this->allowedTags = self::DEFAULT_ALLOWED_TAGS;
        $this->allowedAttributes = self::DEFAULT_ALLOWED_ATTRIBUTES;
        $this->encoding = self::DEFAULT_ENCODING;
    }

    /**
     * Overrides the allow-lists and/or the input encoding.
     *
     * Keys that are absent from $options keep their current value.
     *
     * @param array{allowedTags?: list<string>, allowedAttributes?: array<string, list<string>>, encoding?: string} $options
     */
    public function configure(array $options): void
    {
        $this->allowedTags = $options['allowedTags'] ?? $this->allowedTags;
        $this->allowedAttributes = $options['allowedAttributes'] ?? $this->allowedAttributes;
        $this->encoding = $options['encoding'] ?? $this->encoding;
    }

    /**
     * Returns $input with scripts, comments, disallowed tags and disallowed
     * attributes removed.
     *
     * @param mixed $input Value to sanitize; it is first passed through guardAgainstNonString()
     *
     * @return string The sanitized HTML fragment, trimmed
     */
    public function process(mixed $input): string
    {
        $input = $this->guardAgainstNonString($input);

        // DOMDocument::loadHTML() throws a ValueError on an empty source in PHP 8,
        // and blank input has nothing to sanitize anyway.
        if ('' === trim($input)) {
            return '';
        }

        $input = $this->sanitizeHtml($input);

        $dom = new \DOMDocument('1.0', $this->encoding);
        $this->loadHtmlToDom($dom, $input);
        $this->filterNodes($dom->getElementsByTagName('*'));

        // saveHTML() returns false on failure; treat that as empty output.
        return $this->cleanHtmlOutput((string) $dom->saveHTML());
    }

    /**
     * Unwraps disallowed elements and filters the attributes of allowed ones.
     *
     * The list returned by getElementsByTagName() is live, so it is walked
     * backwards: removing the element at index $i never shifts an index that
     * is still to be visited.
     */
    private function filterNodes(\DOMNodeList $nodes): void
    {
        for ($i = $nodes->length - 1; $i >= 0; --$i) {
            $node = $nodes->item($i);
            if (!$node instanceof \DOMElement) {
                continue;
            }

            if ($this->isAllowedTag($node->nodeName)) {
                $this->filterAttributes($node);
            } else {
                $this->unwrapNode($node);
            }
        }
    }

    /**
     * Removes every attribute that is not allow-listed for the element, and
     * every URL attribute whose value uses a script-executing scheme.
     */
    private function filterAttributes(\DOMElement $element): void
    {
        // Walk backwards because removeAttribute() shrinks the attribute map.
        for ($i = $element->attributes->length - 1; $i >= 0; --$i) {
            $attr = $element->attributes->item($i);
            if (!$attr instanceof \DOMAttr) {
                continue;
            }

            $isRejected = !$this->isAllowedAttribute($element->nodeName, $attr->name)
                || $this->hasForbiddenUrlScheme($attr->name, $attr->value);

            if ($isRejected) {
                $element->removeAttribute($attr->name);
            }
        }
    }

    /**
     * Tells whether a URL attribute value starts with a forbidden scheme.
     *
     * Bytes 0x00-0x20 are stripped before the comparison so that values such
     * as "java\tscript:" or " JavaScript:" are still recognised.
     */
    private function hasForbiddenUrlScheme(string $attributeName, string $value): bool
    {
        if (!in_array(strtolower($attributeName), self::URL_ATTRIBUTES, true)) {
            return false;
        }

        $normalized = strtolower(preg_replace('/[\x00-\x20]+/', '', $value) ?? $value);

        foreach (self::FORBIDDEN_URL_SCHEMES as $scheme) {
            if (str_starts_with($normalized, $scheme)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Drops <script> elements and comments, including their content.
     */
    private function sanitizeHtml(string $html): string
    {
        $html = $this->removeScripts($html);

        return $this->removeComments($html);
    }

    /**
     * Parses $html into $dom as a fragment, collecting libxml errors silently.
     *
     * The previous libxml error-handling mode is restored afterwards so the
     * sanitizer does not change global state for the rest of the application.
     */
    private function loadHtmlToDom(\DOMDocument $dom, string $html): void
    {
        // loadHTML() rejects an empty source with a ValueError; leave $dom empty instead.
        if ('' === $html) {
            return;
        }

        $previousMode = libxml_use_internal_errors(true);

        try {
            $dom->loadHTML($this->encodeNonAscii($html), self::LIBXML_OPTIONS);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousMode);
        }
    }

    /**
     * Converts every non-ASCII character to a numeric entity so that libxml
     * parses the markup correctly regardless of its own default charset.
     *
     * Replaces mb_convert_encoding(..., 'HTML-ENTITIES', ...), which is
     * deprecated as of PHP 8.2.
     */
    private function encodeNonAscii(string $html): string
    {
        return mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], $this->encoding);
    }

    /**
     * Strips a leading doctype and any <html>/<body> wrapper tags from the
     * serialized output, then trims it.
     */
    private function cleanHtmlOutput(string $output): string
    {
        // preg_replace() returns null on a PCRE error; fall back to the unmodified output.
        $output = preg_replace('/^<!DOCTYPE.+?>/', '', $output) ?? $output;
        $output = str_replace(['<html>', '</html>', '<body>', '</body>'], '', $output);

        return trim($output);
    }

    private function isAllowedTag(string $tagName): bool
    {
        return in_array(strtolower($tagName), $this->allowedTags, true);
    }

    private function isAllowedAttribute(string $elementName, string $attributeName): bool
    {
        $allowedOn = $this->allowedAttributes[strtolower($attributeName)] ?? null;

        return null !== $allowedOn && in_array(strtolower($elementName), $allowedOn, true);
    }

    /**
     * Replaces $node with its own children, keeping their order.
     */
    private function unwrapNode(\DOMNode $node): void
    {
        $parent = $node->parentNode;
        if (null === $parent) {
            return;
        }

        while ($node->firstChild) {
            $parent->insertBefore($node->firstChild, $node);
        }
        $parent->removeChild($node);
    }

    private function removeScripts(string $html): string
    {
        return $this->removeElementsByTagName('script', $html);
    }

    private function removeComments(string $html): string
    {
        return $this->removeElementsByXPath('//comment()', $html);
    }

    /**
     * Removes every element named $tagName (with its subtree) and returns the
     * re-serialized markup.
     */
    private function removeElementsByTagName(string $tagName, string $html): string
    {
        $dom = new \DOMDocument();
        $this->loadHtmlToDom($dom, $html);

        // The list is live: it shrinks as elements are removed, so always take item 0.
        $elements = $dom->getElementsByTagName($tagName);
        while ($elements->length > 0) {
            $element = $elements->item(0);
            $element->parentNode->removeChild($element);
        }

        return (string) $dom->saveHTML();
    }

    /**
     * Removes every node matched by the XPath $query and returns the
     * re-serialized markup.
     */
    private function removeElementsByXPath(string $query, string $html): string
    {
        $dom = new \DOMDocument();
        $this->loadHtmlToDom($dom, $html);

        $matches = (new \DOMXPath($dom))->query($query);
        if (false !== $matches) {
            foreach ($matches as $match) {
                $match->parentNode?->removeChild($match);
            }
        }

        return (string) $dom->saveHTML();
    }
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace KaririCode\Sanitizer\Processor\Domain;
6+
7+
use KaririCode\Sanitizer\Processor\AbstractSanitizerProcessor;
8+
9+
/**
 * Validates a JSON document and returns it re-encoded in a normalized,
 * pretty-printed form with unescaped slashes.
 */
class JsonSanitizer extends AbstractSanitizerProcessor
{
    private const ENCODE_FLAGS = JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_THROW_ON_ERROR;

    /**
     * @param mixed $input Value to sanitize; it is first passed through guardAgainstNonString()
     *
     * @return string The pretty-printed JSON document
     *
     * @throws \InvalidArgumentException When the input is not valid JSON or cannot be re-encoded
     */
    public function process(mixed $input): string
    {
        $input = $this->guardAgainstNonString($input);

        try {
            // Decode objects as stdClass rather than arrays: with associative
            // decoding "{}" would be re-encoded as "[]" and {"0":"a"} as ["a"],
            // silently changing the structure of the document.
            // NOTE(review): object decoding rejects keys that start with a NUL byte.
            $decoded = json_decode($input, false, 512, JSON_THROW_ON_ERROR);

            // JSON_THROW_ON_ERROR keeps json_encode() from returning false
            // (e.g. for an INF produced by decoding 1e999), which would
            // otherwise violate the string return type.
            return json_encode($decoded, self::ENCODE_FLAGS);
        } catch (\JsonException $exception) {
            throw new \InvalidArgumentException('Invalid JSON input', 0, $exception);
        }
    }
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace KaririCode\Sanitizer\Processor\Domain;
6+
7+
use KaririCode\Sanitizer\Processor\AbstractSanitizerProcessor;
8+
9+
/**
 * Strips HTML/PHP tags from a text and backslash-escapes the Markdown
 * control characters `*`, `_`, `` ` `` and `#`.
 */
class MarkdownSanitizer extends AbstractSanitizerProcessor
{
    /**
     * @param mixed $input Value to sanitize; it is first passed through guardAgainstNonString()
     *
     * @return string The tag-free text with Markdown control characters escaped
     */
    public function process(mixed $input): string
    {
        $withoutTags = strip_tags($this->guardAgainstNonString($input));

        // Prefix each matched control character with a single literal backslash.
        return preg_replace('/([*_`#])/', '\\\\$1', $withoutTags);
    }
}

src/Processor/Encoder/HtmlSpecialCharsEncoder.php

Lines changed: 0 additions & 19 deletions
This file was deleted.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace KaririCode\Sanitizer\Processor\Input;
6+
7+
use KaririCode\Sanitizer\Processor\AbstractSanitizerProcessor;
8+
9+
/**
 * Escapes HTML special characters (including both quote styles) so the value
 * can be embedded in HTML5 markup as plain text.
 */
class HtmlSpecialCharsSanitizer extends AbstractSanitizerProcessor
{
    /**
     * @param mixed $input Value to sanitize; it is first passed through guardAgainstNonString()
     *
     * @return string The escaped text
     */
    public function process(mixed $input): string
    {
        $input = $this->guardAgainstNonString($input);

        // ENT_SUBSTITUTE replaces invalid UTF-8 sequences with U+FFFD. Without it
        // htmlspecialchars() returns an empty string for such input, silently
        // discarding the whole value.
        return htmlspecialchars($input, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8');
    }
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace KaririCode\Sanitizer\Processor\Input;
6+
7+
use KaririCode\Sanitizer\Processor\AbstractSanitizerProcessor;
8+
9+
/**
 * Converts Windows (CRLF) and bare carriage-return (CR) line endings to LF.
 */
class NormalizeLineBreaksSanitizer extends AbstractSanitizerProcessor
{
    /**
     * @param mixed $input Value to sanitize; it is first passed through guardAgainstNonString()
     *
     * @return string The text with every line break expressed as "\n"
     */
    public function process(mixed $input): string
    {
        // CRLF must be listed before CR so a pair collapses into a single LF.
        $foreignLineBreaks = ["\r\n", "\r"];
        $text = $this->guardAgainstNonString($input);

        return str_replace($foreignLineBreaks, "\n", $text);
    }
}

src/Processor/HtmlPurifier.php renamed to src/Processor/Input/StripTagsSanitizer.php

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,14 @@
22

33
declare(strict_types=1);
44

5-
namespace KaririCode\Sanitizer\Processor;
5+
namespace KaririCode\Sanitizer\Processor\Input;
66

77
use KaririCode\Contract\Processor\ConfigurableProcessor;
8+
use KaririCode\Sanitizer\Processor\AbstractSanitizerProcessor;
89

9-
class HtmlPurifier extends AbstractSanitizerProcessor implements ConfigurableProcessor
10+
class StripTagsSanitizer extends AbstractSanitizerProcessor implements ConfigurableProcessor
1011
{
11-
private array $allowedTags = ['p', 'br', 'strong', 'em'];
12+
private array $allowedTags = [];
1213

1314
public function configure(array $options): void
1415
{

0 commit comments

Comments
 (0)