Skip to content

Commit 837005e

Browse files
committed
Moved code that parses CSS selectors into its own class.
Updated htmldoc::find() to use the new class. Updated htmldoc::minify() to move the code that sorts attributes into its own method. Updated htmldoc::open(), htmldoc::getCharsetFromHtml(), and htmldoc::minify() to reduce cognitive complexity.
1 parent 6cb2403 commit 837005e

File tree

3 files changed

+196
-146
lines changed

3 files changed

+196
-146
lines changed

src/autoload.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
'hexydec\\html\\pre' => __DIR__.'/tokens/pre.php',
1111
'hexydec\\html\\custom' => __DIR__.'/tokens/custom.php',
1212
'hexydec\\html\\tag' => __DIR__.'/tokens/tag.php',
13-
'hexydec\\html\\text' => __DIR__.'/tokens/text.php'
13+
'hexydec\\html\\text' => __DIR__.'/tokens/text.php',
14+
'hexydec\\html\\selector' => __DIR__.'/helpers/selector.php'
1415
];
1516
if (isset($classes[$class])) {
1617
return (bool) require($classes[$class]);

src/helpers/selector.php

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
<?php
2+
declare(strict_types = 1);
3+
namespace hexydec\html;
4+
use \hexydec\tokens\tokenise;
5+
6+
class selector {

	/**
	 * @var array $tokens Regexp components keyed by their corresponding codename for tokenising CSS selectors
	 */
	protected static array $tokens = [
		'quotes' => '(?<!\\\\)"(?:[^"\\\\]++|\\\\.)*+"',
		'join' => '\\s*[>+~]\\s*',
		'comparison' => '[\\^*$<>]?=', // comparison operators for media queries or attribute selectors
		'squareopen' => '\\[',
		'squareclose' => '\\]',
		'bracketopen' => '\\(',
		'bracketclose' => '\\)',
		'comma' => ',',
		'pseudo' => ':[A-Za-z-]++',
		'id' => '#[^ +>\.#{\\[,]++',
		'class' => '\.[^ +>\.#{\\[\\(\\),]++',
		'string' => '\\*|[^\\[\\]{}\\(\\):;,>+=~\\^$!" #\\.*]++',
		'whitespace' => '\s++',
	];

	/**
	 * Tokenises a CSS selector string and parses it into selector components
	 *
	 * @param string $selector The CSS selector string to parse, surrounding whitespace is trimmed before tokenising
	 * @return array|bool An array of selectors, each an array of selector components, or false if there were no tokens to parse
	 */
	public function get(string $selector) {
		$tokens = new tokenise(self::$tokens, \trim($selector));
		return $this->parse($tokens);
	}

	/**
	 * Parses a tokenised CSS selector
	 *
	 * Each returned selector is a list of components. Every component carries a 'join' key (null, ' ', or a
	 * trimmed combinator such as '>') plus one of 'id', 'class', 'tag', 'pseudo' (with 'sub'), or the
	 * attribute keys produced by parseAttributes()
	 *
	 * @param tokenise $tokens A tokenise object positioned at the start of the selector to parse
	 * @return array|bool An array of selectors, each an array of selector components, or false if there were no tokens to parse
	 */
	public function parse(tokenise $tokens) {
		return $this->parseGroup($tokens, false);
	}

	/**
	 * Parses a comma separated group of selectors from the token stream
	 *
	 * @param tokenise $tokens A tokenise object positioned at the start of the group to parse
	 * @param bool $nested Whether the group is the argument of a functional pseudo-class, in which case parsing stops at the closing bracket
	 * @return array|bool An array of selectors, each an array of selector components, or false if there were no tokens to parse
	 */
	protected function parseGroup(tokenise $tokens, bool $nested) {
		if (($token = $tokens->next()) === null) {
			return false;
		}
		$selectors = $parts = [];
		$join = null;
		do {
			switch ($token['type']) {
				case 'id':
					$parts[] = [
						'id' => \mb_substr($token['value'], 1), // drop the leading '#'
						'join' => $join
					];
					$join = null;
					break;

				case 'class':
					$parts[] = [
						'class' => \mb_substr($token['value'], 1), // drop the leading '.'
						'join' => $join
					];
					$join = null;
					break;

				case 'string':
					$parts[] = [
						'tag' => $token['value'],
						'join' => $join
					];
					$join = null;
					break;

				case 'squareopen':
					$parts[] = $this->parseAttributes($tokens, $join);
					$join = null;
					break;

				case 'pseudo':
					$sub = null;

					// a bracket directly after the pseudo-class means it has a selector argument
					if (($bracket = $tokens->next()) !== null && $bracket['type'] === 'bracketopen') {
						$sub = $this->parseGroup($tokens, true);

					// not a bracket, rewind so the main loop processes the token
					} elseif ($bracket !== null) {
						$tokens->prev();
					}
					$parts[] = [
						'pseudo' => \mb_substr($token['value'], 1), // drop the leading ':'
						'sub' => $sub,
						'join' => $join
					];
					$join = null;
					break;

				case 'join':
					$join = \trim($token['value']);
					break;

				case 'whitespace':

					// whitespace is only a descendant combinator when something precedes it
					if ($parts) {
						$join = ' ';
					}
					break;

				case 'comma':
					$selectors[] = $parts;
					$parts = [];
					break;

				case 'bracketclose':
					$selectors[] = $parts;
					$parts = [];

					// the closing bracket ends the pseudo-class argument, hand the remaining tokens back to the
					// caller so that what follows is attached to the outer selector instead of the argument
					if ($nested) {
						return $selectors;
					}

					// an unmatched bracket at the top level just ends the current selector
					break;
			}
		} while (($token = $tokens->next()) !== null);

		// store the selector that was in progress when the tokens ran out
		if ($parts) {
			$selectors[] = $parts;
		}
		return $selectors;
	}

	/**
	 * Parses the contents of an attribute selector, consuming tokens up to and including the closing square bracket
	 *
	 * @param tokenise $tokens A tokenise object positioned just after the opening square bracket
	 * @param string|null $join The combinator that joins this component to the previous one, or null if there isn't one
	 * @return array An array containing 'join' and 'sensitive', plus 'attribute', 'comparison', and 'value' where they were specified
	 */
	protected function parseAttributes(tokenise $tokens, ?string $join = null) {
		$item = ['join' => $join, 'sensitive' => true];
		while (($token = $tokens->next()) !== null && $token['type'] !== 'squareclose') {

			// record comparison
			if ($token['type'] === 'comparison') {
				$item['comparison'] = $token['value'];

			// handle string or quotes
			} elseif (\in_array($token['type'], ['string', 'quotes'], true)) {

				// strip the surrounding quotes and unescape the contents
				if ($token['type'] === 'quotes') {
					$token['value'] = \stripslashes(\mb_substr($token['value'], 1, -1));
				}

				// the first value is the attribute name
				if (!isset($item['attribute'])) {
					$item['attribute'] = $token['value'];

				// the second is the value to compare against
				} elseif (!isset($item['value'])) {
					$item['value'] = $token['value'];

				// a trailing 'i' flag makes the comparison case-insensitive
				} elseif ($token['value'] === 'i') {
					$item['sensitive'] = false;
				}
			}
		}
		return $item;
	}
}

src/htmldoc.php

Lines changed: 45 additions & 145 deletions
Original file line numberDiff line numberDiff line change
@@ -197,12 +197,10 @@ public function open(string $url, mixed $context = null, string &$error = null)
197197
// find charset in headers
198198
$charset = null;
199199
$meta = \stream_get_meta_data($handle);
200-
if (!empty($meta['wrapper_data'])) {
201-
foreach ($meta['wrapper_data'] AS $item) {
202-
if (\mb_stripos($item, 'Content-Type:') === 0 && ($value = \mb_stristr($item, 'charset=')) !== false) {
203-
$charset = \mb_substr($value, 8);
204-
break;
205-
}
200+
foreach ($meta['wrapper_data'] ?? [] AS $item) {
201+
if (\mb_stripos($item, 'Content-Type:') === 0 && ($value = \mb_stristr($item, 'charset=')) !== false) {
202+
$charset = \mb_substr($value, 8);
203+
break;
206204
}
207205
}
208206

@@ -251,20 +249,19 @@ public function load(string $html, string $charset = null, &$error = null) : boo
251249
* @return string The defined or detected charset or null if the charset is not defined
252250
*/
253251
protected function getCharsetFromHtml(string $html) : ?string {
254-
if (\preg_match('/<meta[^>]+charset[^>]+>/i', $html, $match)) {
255-
$obj = new htmldoc($this->config);
256-
if ($obj->load($match[0], \mb_internal_encoding())) {
257-
258-
// <meta charset="xxx" />
259-
if (($charset = $obj->attr('charset')) !== null && $this->isEncodingValid($charset)) {
252+
$obj = new htmldoc($this->config);
253+
$pat = '/<meta[^>]+charset[^>]+>/i';
254+
if (\preg_match($pat, $html, $match) && $obj->load($match[0], \mb_internal_encoding())) {
255+
256+
// <meta charset="xxx" />
257+
if (($charset = $obj->attr('charset')) !== null && $this->isEncodingValid($charset)) {
258+
return $charset;
259+
260+
// <meta http-equiv="Content-Type" content="text/html; charset=xxx" />
261+
} elseif (($value = $obj->eq(0)->attr('content')) !== null && ($charset = \mb_stristr($value, 'charset=')) !== false) {
262+
$charset = \mb_substr($charset, 8);
263+
if ($this->isEncodingValid($charset)) {
260264
return $charset;
261-
262-
// <meta http-equiv="Content-Type" content="text/html; charset=xxx" />
263-
} elseif (($value = $obj->eq(0)->attr('content')) !== null && ($charset = \mb_stristr($value, 'charset=')) !== false) {
264-
$charset = \mb_substr($charset, 8);
265-
if ($this->isEncodingValid($charset)) {
266-
return $charset;
267-
}
268265
}
269266
}
270267
}
@@ -307,110 +304,6 @@ protected function parse($html) {
307304
return false;
308305
}
309306

310-
/**
311-
* Parses a CSS selector string
312-
*
313-
* @param string $selector The CSS selector string to parse
314-
* @return array|bool An array of selector components
315-
*/
316-
protected function parseSelector(tokenise $tokens) {
317-
if (($token = $tokens->next()) !== null) {
318-
$selectors = $parts = [];
319-
$join = null;
320-
do {
321-
switch ($token['type']) {
322-
case 'id':
323-
$parts[] = [
324-
'id' => \mb_substr($token['value'], 1),
325-
'join' => $join
326-
];
327-
$join = null;
328-
break;
329-
330-
case 'class':
331-
$parts[] = [
332-
'class' => \mb_substr($token['value'], 1),
333-
'join' => $join
334-
];
335-
$join = null;
336-
break;
337-
338-
case 'string':
339-
$parts[] = [
340-
'tag' => $token['value'],
341-
'join' => $join
342-
];
343-
$join = null;
344-
break;
345-
346-
case 'squareopen':
347-
$item = ['join' => $join, 'sensitive' => true];
348-
while (($token = $tokens->next()) !== null) {
349-
if ($token['type'] === 'squareclose') {
350-
break;
351-
} elseif (\in_array($token['type'], ['string', 'quotes'], true)) {
352-
if ($token['type'] === 'quotes') {
353-
$token['value'] = \stripslashes(\mb_substr($token['value'], 1, -1));
354-
}
355-
if (!isset($item['attribute'])) {
356-
$item['attribute'] = $token['value'];
357-
} elseif (!isset($item['value'])) {
358-
$item['value'] = $token['value'];
359-
} elseif ($token['value'] === 'i') {
360-
$item['sensitive'] = false;
361-
}
362-
} elseif ($token['type'] === 'comparison') {
363-
$item['comparison'] = $token['value'];
364-
}
365-
}
366-
$parts[] = $item;
367-
$join = null;
368-
break;
369-
370-
case 'pseudo':
371-
$sub = null;
372-
if (($bracket = $tokens->next()) !== null && $bracket['type'] === 'bracketopen') {
373-
$sub = $this->parseSelector($tokens);
374-
} elseif ($bracket) {
375-
$tokens->prev();
376-
}
377-
$parts[] = [
378-
'pseudo' => \mb_substr($token['value'], 1),
379-
'sub' => $sub,
380-
'join' => $join
381-
];
382-
$join = null;
383-
break;
384-
385-
case 'join':
386-
$join = \trim($token['value']);
387-
break;
388-
389-
case 'whitespace':
390-
if ($parts) {
391-
$join = ' ';
392-
}
393-
break;
394-
395-
case 'comma':
396-
$selectors[] = $parts;
397-
$parts = [];
398-
break;
399-
400-
case 'bracketclose':
401-
$selectors[] = $parts;
402-
$parts = [];
403-
break;
404-
}
405-
} while (($token = $tokens->next()) !== null);
406-
if ($parts) {
407-
$selectors[] = $parts;
408-
}
409-
return $selectors;
410-
}
411-
return false;
412-
}
413-
414307
/**
415308
* Caches the input values and records the number of occurrences
416309
*
@@ -475,14 +368,14 @@ public function get(int $index = null) {
475368
* @return htmldoc A new htmldoc object containing the found tag items
476369
*/
477370
public function find(string $selector) : htmldoc {
478-
$tokens = new tokenise(self::$selectors, \trim($selector));
371+
$obj = new selector();
479372

480373
// parse selector and find tags
481374
$found = [];
482-
if (($parsed = $this->parseSelector($tokens)) !== false) {
375+
if (($tokens = $obj->get($selector)) !== false) {
483376
foreach ($this->children AS $item) {
484377
if (\get_class($item) === 'hexydec\\html\\tag') {
485-
foreach ($parsed AS $value) {
378+
foreach ($tokens AS $value) {
486379
if (($items = $item->find($value)) !== false) {
487380
$found = \array_merge($found, $items);
488381
}
@@ -628,25 +521,8 @@ public function minify(array $minify = []) : void {
628521
}
629522

630523
// sort classes by occurrence, then by string
631-
if (\is_array($minify['attributes'])) {
632-
633-
// sort attribute values by most frequent
634-
if ($minify['attributes']['sort'] && !empty($this->cache['attr'])) {
635-
\arsort($this->cache['attr'], SORT_NUMERIC);
636-
\arsort($this->cache['attrvalues'], SORT_NUMERIC);
637-
$attr = [];
638-
foreach ($this->cache['attrvalues'] AS $item => $occurences) {
639-
if ($occurences > 5) {
640-
$item = \mb_strstr($item, '=', true);
641-
if (!\in_array($item, $attr, true)) {
642-
$attr[] = $item;
643-
}
644-
} else {
645-
break;
646-
}
647-
}
648-
$minify['attributes']['sort'] = \array_unique(\array_merge($attr, \array_keys($this->cache['attr'])));
649-
}
524+
if (!empty($minify['attributes']['sort']) && !empty($this->cache['attr'])) {
525+
$minify['attributes']['sort'] = $this->sortAttributes($this->cache['attr'], $this->cache['attrvalues']);
650526
}
651527

652528
// minify children
@@ -655,6 +531,30 @@ public function minify(array $minify = []) : void {
655531
}
656532
}
657533

534+
/**
 * Builds the attribute sort order from the recorded usage counts
 *
 * Attribute names taken from the attribute=value pairs that occur more than 5 times are listed first, in
 * descending order of occurrence, followed by the remaining attribute names in descending order of occurrence
 *
 * @param array $attr An array of occurrence counts, keyed by attribute name
 * @param array $values An array of occurrence counts, keyed by attribute=value pair
 * @return array An array of attribute names ordered by frequency, duplicates are removed but the keys are not reindexed
 */
protected function sortAttributes(array $attr, array $values) : array {

	// most frequent first
	\arsort($attr, SORT_NUMERIC);
	\arsort($values, SORT_NUMERIC);

	// collect the attribute names of the frequently repeated pairs
	$frequent = [];
	foreach ($values AS $pair => $count) {

		// the list is sorted, so nothing after this point can qualify
		if ($count <= 5) {
			break;
		}
		$name = \mb_strstr($pair, '=', true);
		if (!\in_array($name, $frequent, true)) {
			$frequent[] = $name;
		}
	}

	// append every attribute name, keeping only the first mention of each
	$ordered = \array_merge($frequent, \array_keys($attr));
	return \array_unique($ordered);
}
557+
658558
/**
659559
* Compile the document as an HTML string
660560
*

0 commit comments

Comments
 (0)