@@ -443,96 +443,44 @@ private function resolveChildUrls(DOMElement $el) {
443443 }
444444 }
445445
/**
 * Returns the plain-text value of an element.
 *
 * This method and elementToString() together implement plain text parsing.
 * elementToString() produces the raw text; this method then normalises its
 * whitespace with a single regular expression that:
 *  - strips leading and trailing tab, LF, FF, CR and space characters,
 *  - removes spaces directly before or directly after a newline,
 *  - removes every space that is followed by another space, so a run of
 *    spaces collapses to one.
 *
 * @see https://wiki.zegnat.net/media/textparsing.html
 *
 * @param DOMElement $element Element whose text value is wanted.
 * @param bool $implied True when parsing an implied name for h-*; an img
 *                      without an alt attribute then contributes nothing
 *                      instead of its resolved src URL.
 * @return string Whitespace-normalised text.
 */
public function textContent(DOMElement $element, $implied = false)
{
    return preg_replace(
        '/(^[\t\n\f\r ]+| +(?=\n)|(?<=\n) +| +(?= )|[\t\n\f\r ]+$)/',
        '',
        $this->elementToString($element, $implied)
    );
}

/**
 * Recursively concatenates the text of an element's child nodes.
 *
 * Only text nodes and element nodes contribute; every other node type
 * (comments, processing instructions, ...) is skipped.
 *
 * @param DOMElement $input Element whose children are serialised.
 * @param bool $implied When true, an img without alt is not replaced by
 *                      its src URL.
 * @return string Raw text; whitespace is normalised by textContent().
 */
private function elementToString(DOMElement $input, $implied = false)
{
    $output = '';

    foreach ($input->childNodes as $child) {
        if ($child->nodeType === XML_TEXT_NODE) {
            // Tabs and line breaks inside text are plain spacing; real line
            // breaks only come from BR and P elements below.
            $output .= str_replace(array("\t", "\n", "\r"), ' ', $child->textContent);
        } else if ($child->nodeType === XML_ELEMENT_NODE) {
            $tagName = strtoupper($child->tagName);

            if (in_array($tagName, array('SCRIPT', 'STYLE'), true)) {
                // Script and style content is never part of the visible text.
                continue;
            } else if ($tagName === 'IMG') {
                // An image is replaced by its alt text, or failing that by its
                // resolved src, padded with spaces so it cannot fuse with the
                // neighbouring words.
                if ($child->hasAttribute('alt')) {
                    $output .= ' ' . trim($child->getAttribute('alt'), "\t\n\f\r ") . ' ';
                } else if (!$implied && $child->hasAttribute('src')) {
                    $output .= ' ' . $this->resolveUrl(trim($child->getAttribute('src'), "\t\n\f\r ")) . ' ';
                }
            } else if ($tagName === 'BR') {
                $output .= "\n";
            } else if ($tagName === 'P') {
                // A paragraph starts on a new line.
                $output .= "\n" . $this->elementToString($child, $implied);
            } else {
                $output .= $this->elementToString($child, $implied);
            }
        }
    }

    return $output;
}
537485
538486 /**
@@ -648,7 +596,7 @@ public function parseP(\DOMElement $p) {
648596 } elseif (in_array ($ p ->tagName , array ('data ' , 'input ' )) and $ p ->hasAttribute ('value ' )) {
649597 $ pValue = $ p ->getAttribute ('value ' );
650598 } else {
651- $ pValue = unicodeTrim ( $ this ->innerText ($ p) );
599+ $ pValue = $ this ->textContent ($ p );
652600 }
653601
654602 return $ pValue ;
@@ -685,7 +633,7 @@ public function parseU(\DOMElement $u) {
685633 } elseif (in_array ($ u ->tagName , array ('data ' , 'input ' )) and $ u ->hasAttribute ('value ' )) {
686634 return $ u ->getAttribute ('value ' );
687635 } else {
688- return unicodeTrim ( $ this ->textContent ($ u) );
636+ return $ this ->textContent ($ u );
689637 }
690638 }
691639
@@ -916,7 +864,7 @@ public function parseE(\DOMElement $e) {
916864
917865 $ return = array (
918866 'html ' => unicodeTrim ($ html ),
919- 'value ' => unicodeTrim ( $ this ->innerText ($ e) ),
867+ 'value ' => $ this ->textContent ($ e ),
920868 );
921869
922870 if ($ this ->lang ) {
@@ -1123,7 +1071,7 @@ public function parseH(\DOMElement $e, $is_backcompat = false, $has_nested_mf =
11231071 }
11241072 }
11251073
1126- throw new Exception ($ this ->innerText ($ e , true ));
1074+ throw new Exception ($ this ->textContent ($ e , true ));
11271075 } catch (Exception $ exc ) {
11281076 $ return ['name ' ][] = unicodeTrim ($ exc ->getMessage ());
11291077 }
0 commit comments