@@ -197,12 +197,10 @@ public function open(string $url, mixed $context = null, string &$error = null)
197197 // find charset in headers
198198 $ charset = null ;
199199 $ meta = \stream_get_meta_data ($ handle );
200- if (!empty ($ meta ['wrapper_data ' ])) {
201- foreach ($ meta ['wrapper_data ' ] AS $ item ) {
202- if (\mb_stripos ($ item , 'Content-Type: ' ) === 0 && ($ value = \mb_stristr ($ item , 'charset= ' )) !== false ) {
203- $ charset = \mb_substr ($ value , 8 );
204- break ;
205- }
200+ foreach ($ meta ['wrapper_data ' ] ?? [] AS $ item ) {
201+ if (\mb_stripos ($ item , 'Content-Type: ' ) === 0 && ($ value = \mb_stristr ($ item , 'charset= ' )) !== false ) {
202+ $ charset = \mb_substr ($ value , 8 );
203+ break ;
206204 }
207205 }
208206
@@ -251,20 +249,19 @@ public function load(string $html, string $charset = null, &$error = null) : boo
251249 * @return string The defined or detected charset or null if the charset is not defined
252250 */
253251 protected function getCharsetFromHtml (string $ html ) : ?string {
254- if (\preg_match ('/<meta[^>]+charset[^>]+>/i ' , $ html , $ match )) {
255- $ obj = new htmldoc ($ this ->config );
256- if ($ obj ->load ($ match [0 ], \mb_internal_encoding ())) {
257-
258- // <meta charset="xxx" />
259- if (($ charset = $ obj ->attr ('charset ' )) !== null && $ this ->isEncodingValid ($ charset )) {
252+ $ obj = new htmldoc ($ this ->config );
253+ $ pat = '/<meta[^>]+charset[^>]+>/i ' ;
254+ if (\preg_match ($ pat , $ html , $ match ) && $ obj ->load ($ match [0 ], \mb_internal_encoding ())) {
255+
256+ // <meta charset="xxx" />
257+ if (($ charset = $ obj ->attr ('charset ' )) !== null && $ this ->isEncodingValid ($ charset )) {
258+ return $ charset ;
259+
260+ // <meta http-equiv="Content-Type" content="text/html; charset=xxx" />
261+ } elseif (($ value = $ obj ->eq (0 )->attr ('content ' )) !== null && ($ charset = \mb_stristr ($ value , 'charset= ' )) !== false ) {
262+ $ charset = \mb_substr ($ charset , 8 );
263+ if ($ this ->isEncodingValid ($ charset )) {
260264 return $ charset ;
261-
262- // <meta http-equiv="Content-Type" content="text/html; charset=xxx" />
263- } elseif (($ value = $ obj ->eq (0 )->attr ('content ' )) !== null && ($ charset = \mb_stristr ($ value , 'charset= ' )) !== false ) {
264- $ charset = \mb_substr ($ charset , 8 );
265- if ($ this ->isEncodingValid ($ charset )) {
266- return $ charset ;
267- }
268265 }
269266 }
270267 }
@@ -307,110 +304,6 @@ protected function parse($html) {
307304 return false ;
308305 }
309306
310- /**
311- * Parses a stream of CSS selector tokens into a nested array of selector components
312- *
313- * @param tokenise $tokens A tokeniser object from which the selector tokens are read, one per call to next()
314- * @return array|bool An array of selectors, each an array of component arrays (every component carries a 'join' key), or false if there are no tokens to read
315- */
316- protected function parseSelector (tokenise $ tokens ) {
317- if (($ token = $ tokens ->next ()) !== null ) {
// $selectors collects the finished selectors, $parts the components of the selector currently being built
318- $ selectors = $ parts = [];
// $join holds the combinator seen since the last component, it is attached to the next component and then reset
319- $ join = null ;
320- do {
321- switch ($ token ['type ' ]) {
322- case 'id ' :
323- $ parts [] = [
// drop the first character of the token (presumably the leading "#" - confirm against the tokeniser patterns)
324- 'id ' => \mb_substr ($ token ['value ' ], 1 ),
325- 'join ' => $ join
326- ];
327- $ join = null ;
328- break ;
329-
330- case 'class ' :
331- $ parts [] = [
// drop the first character of the token (presumably the leading "." - confirm against the tokeniser patterns)
332- 'class ' => \mb_substr ($ token ['value ' ], 1 ),
333- 'join ' => $ join
334- ];
335- $ join = null ;
336- break ;
337-
338- case 'string ' :
339- $ parts [] = [
340- 'tag ' => $ token ['value ' ],
341- 'join ' => $ join
342- ];
343- $ join = null ;
344- break ;
345-
346- case 'squareopen ' :
// attribute selector: consume tokens up to the closing square bracket, matching is flagged case sensitive unless an "i" flag is found
347- $ item = ['join ' => $ join , 'sensitive ' => true ];
348- while (($ token = $ tokens ->next ()) !== null ) {
349- if ($ token ['type ' ] === 'squareclose ' ) {
350- break ;
351- } elseif (\in_array ($ token ['type ' ], ['string ' , 'quotes ' ], true )) {
352- if ($ token ['type ' ] === 'quotes ' ) {
// remove the first and last characters of the quoted token, then unescape it
353- $ token ['value ' ] = \stripslashes (\mb_substr ($ token ['value ' ], 1 , -1 ));
354- }
// the first string/quotes token is the attribute name, the second is its value, any later one is only checked for the "i" flag
355- if (!isset ($ item ['attribute ' ])) {
356- $ item ['attribute ' ] = $ token ['value ' ];
357- } elseif (!isset ($ item ['value ' ])) {
358- $ item ['value ' ] = $ token ['value ' ];
359- } elseif ($ token ['value ' ] === 'i ' ) {
360- $ item ['sensitive ' ] = false ;
361- }
362- } elseif ($ token ['type ' ] === 'comparison ' ) {
363- $ item ['comparison ' ] = $ token ['value ' ];
364- }
365- }
366- $ parts [] = $ item ;
367- $ join = null ;
368- break ;
369-
370- case 'pseudo ' :
371- $ sub = null ;
// look ahead one token: an opening bracket means the pseudo selector has an argument, which is parsed recursively
372- if (($ bracket = $ tokens ->next ()) !== null && $ bracket ['type ' ] === 'bracketopen ' ) {
373- $ sub = $ this ->parseSelector ($ tokens );
// otherwise step back so that the token that was read ahead is handled by the main loop
374- } elseif ($ bracket ) {
375- $ tokens ->prev ();
376- }
377- $ parts [] = [
378- 'pseudo ' => \mb_substr ($ token ['value ' ], 1 ),
379- 'sub ' => $ sub ,
380- 'join ' => $ join
381- ];
382- $ join = null ;
383- break ;
384-
385- case 'join ' :
386- $ join = \trim ($ token ['value ' ]);
387- break ;
388-
389- case 'whitespace ' :
// whitespace only counts as a join once the current selector has at least one component
390- if ($ parts ) {
391- $ join = ' ' ;
392- }
393- break ;
394-
395- case 'comma ' :
// a comma finishes the current selector and starts a new one
396- $ selectors [] = $ parts ;
397- $ parts = [];
398- break ;
399-
400- case 'bracketclose ' :
// NOTE(review): this stores the current selector but does not leave the loop, so a recursive call made from the pseudo case appears to keep consuming the tokens that follow the closing bracket - confirm this is intended
401- $ selectors [] = $ parts ;
402- $ parts = [];
403- break ;
404- }
405- } while (($ token = $ tokens ->next ()) !== null );
// store the selector that was still being built when the tokens ran out
406- if ($ parts ) {
407- $ selectors [] = $ parts ;
408- }
409- return $ selectors ;
410- }
411- return false ;
412- }
413-
414307 /**
415308 * Caches the input values and records the number of occurrences
416309 *
@@ -475,14 +368,14 @@ public function get(int $index = null) {
475368 * @return htmldoc A new htmldoc object containing the found tag items
476369 */
477370 public function find (string $ selector ) : htmldoc {
478- $ tokens = new tokenise ( self :: $ selectors , \trim ( $ selector) );
371+ $ obj = new selector ( );
479372
480373 // parse selector and find tags
481374 $ found = [];
482- if (($ parsed = $ this -> parseSelector ( $ tokens )) !== false ) {
375+ if (($ tokens = $ obj -> get ( $ selector )) !== false ) {
483376 foreach ($ this ->children AS $ item ) {
484377 if (\get_class ($ item ) === 'hexydec \\html \\tag ' ) {
485- foreach ($ parsed AS $ value ) {
378+ foreach ($ tokens AS $ value ) {
486379 if (($ items = $ item ->find ($ value )) !== false ) {
487380 $ found = \array_merge ($ found , $ items );
488381 }
@@ -628,25 +521,8 @@ public function minify(array $minify = []) : void {
628521 }
629522
630523 // sort classes by occurrence, then by string
631- if (\is_array ($ minify ['attributes ' ])) {
632-
633- // sort attribute values by most frequent
634- if ($ minify ['attributes ' ]['sort ' ] && !empty ($ this ->cache ['attr ' ])) {
635- \arsort ($ this ->cache ['attr ' ], SORT_NUMERIC );
636- \arsort ($ this ->cache ['attrvalues ' ], SORT_NUMERIC );
637- $ attr = [];
638- foreach ($ this ->cache ['attrvalues ' ] AS $ item => $ occurences ) {
639- if ($ occurences > 5 ) {
640- $ item = \mb_strstr ($ item , '= ' , true );
641- if (!\in_array ($ item , $ attr , true )) {
642- $ attr [] = $ item ;
643- }
644- } else {
645- break ;
646- }
647- }
648- $ minify ['attributes ' ]['sort ' ] = \array_unique (\array_merge ($ attr , \array_keys ($ this ->cache ['attr ' ])));
649- }
524+ if (!empty ($ minify ['attributes ' ]['sort ' ]) && !empty ($ this ->cache ['attr ' ])) {
525+ $ minify ['attributes ' ]['sort ' ] = $ this ->sortAttributes ($ this ->cache ['attr ' ], $ this ->cache ['attrvalues ' ]);
650526 }
651527
652528 // minify children
@@ -655,6 +531,30 @@ public function minify(array $minify = []) : void {
655531 }
656532 }
657533
/**
 * Sort attributes in frequency order
 *
 * Attribute names taken from the $values keys that occur more than five times are placed first,
 * in descending order of occurrence, followed by the keys of $attr in descending order of
 * occurrence. Each name appears once in the result, at the first position it was seen.
 *
 * @param array $attr An array of occurrence counts, keyed by attribute name
 * @param array $values An array of occurrence counts, keyed by strings whose portion before the first "=" is the attribute name
 * @return array An array of attribute names ordered by frequency (the keys are not reindexed after duplicates are removed)
 */
protected function sortAttributes(array $attr, array $values) : array {
	\arsort($attr, SORT_NUMERIC);
	\arsort($values, SORT_NUMERIC);
	$items = [];
	foreach ($values AS $item => $occurrences) {

		// the counts are in descending order, so nothing after this point can qualify
		if ($occurrences <= 5) {
			break;
		}

		// mb_strstr() returns false when there is no "=", skip those keys rather than storing false as an attribute name
		// the cast is needed because PHP converts numeric string keys to integers
		$name = \mb_strstr((string) $item, '=', true);
		if ($name !== false && !\in_array($name, $items, true)) {
			$items[] = $name;
		}
	}

	// array_unique() keeps the first occurrence, so the frequently used name/value attributes stay at the front
	return \array_unique(\array_merge($items, \array_keys($attr)));
}
557+
658558 /**
659559 * Compile the document as an HTML string
660560 *
0 commit comments