vendor/wikimedia/html-formatter/src/HtmlFormatter.php

   1 <?php
   2 /**
   3  * Performs transformations of HTML by wrapping around libxml2 and working
   4  * around its countless bugs.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along
  17  * with this program; if not, write to the Free Software Foundation, Inc.,
  18  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  19  * http://www.gnu.org/copyleft/gpl.html
  20  *
  21  * @file
  22  */
  23
  24 namespace HtmlFormatter;
  25
  26 class HtmlFormatter {
  27         /**
  28          * @var DOMDocument
  29          */
  30         private $doc;
  31
  32         private $html;
  33         private $itemsToRemove = [];
  34         private $elementsToFlatten = [];
  35         protected $removeMedia = false;
  36
  37         /**
  38          * Constructor
  39          *
  40          * @param string $html Text to process
  41          */
  42         public function __construct( $html ) {
  43                 $this->html = $html;
  44         }
  45
  46         /**
  47          * Turns a chunk of HTML into a proper document
  48          * @param string $html
  49          * @return string
  50          */
  51         public static function wrapHTML( $html ) {
  52                 return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
  53         }
  54
  55         /**
  56          * Override this in descendant class to modify HTML after it has been converted from DOM tree
  57          * @param string $html HTML to process
  58          * @return string Processed HTML
  59          */
  60         protected function onHtmlReady( $html ) {
  61                 return $html;
  62         }
  63
  64         /**
  65          * @return DOMDocument DOM to manipulate
  66          */
  67         public function getDoc() {
  68                 if ( !$this->doc ) {
  69                         // DOMDocument::loadHTML apparently isn't very good with encodings, so
  70                         // convert input to ASCII by encoding everything above 128 as entities.
  71                         $html = \mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
  72
  73                         // Workaround for bug that caused spaces before references
  74                         // to disappear during processing: https://phabricator.wikimedia.org/T55086
  75                         $html = str_replace( ' <', '&#32;<', $html );
  76
  77                         \libxml_use_internal_errors( true );
  78                         $loader = \libxml_disable_entity_loader();
  79                         $this->doc = new \DOMDocument();
  80                         $this->doc->strictErrorChecking = false;
  81                         $this->doc->loadHTML( $html );
  82                         \libxml_disable_entity_loader( $loader );
  83                         \libxml_use_internal_errors( false );
  84                         $this->doc->encoding = 'UTF-8';
  85                 }
  86                 return $this->doc;
  87         }
  88
  89         /**
  90          * Sets whether images/videos/sounds should be removed from output
  91          * @param bool $flag
  92          */
  93         public function setRemoveMedia( $flag = true ) {
  94                 $this->removeMedia = $flag;
  95         }
  96
  97         /**
  98          * Adds one or more selector of content to remove. A subset of CSS selector
  99          * syntax is supported:
 100          *
 101          *   <tag>
 102          *   <tag>.class
 103          *   .<class>
 104          *   #<id>
 105          *
 106          * @param array|string $selectors Selector(s) of stuff to remove
 107          */
 108         public function remove( $selectors ) {
 109                 $this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
 110         }
 111
 112         /**
 113          * Adds one or more element name to the list to flatten (remove tag, but not its content)
 114          * Can accept undelimited regexes
 115          *
 116          * Note this interface may fail in surprising unexpected ways due to usage of regexes,
 117          * so should not be relied on for HTML markup security measures.
 118          *
 119          * @param array|string $elements Name(s) of tag(s) to flatten
 120          */
 121         public function flatten( $elements ) {
 122                 $this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
 123         }
 124
 125         /**
 126          * Instructs the formatter to flatten all tags
 127          */
 128         public function flattenAllTags() {
 129                 $this->flatten( '[?!]?[a-z0-9]+' );
 130         }
 131
 132         /**
 133          * Removes content we've chosen to remove.  The text of the removed elements can be
 134          * extracted with the getText method.
 135          * @return array Array of removed DOMElements
 136          */
 137         public function filterContent() {
 138                 $removals = $this->parseItemsToRemove();
 139
 140                 // Bail out early if nothing to do
 141                 if ( \array_reduce( $removals,
 142                         function ( $carry, $item ) {
 143                                 return $carry && !$item;
 144                         },
 145                         true
 146                 ) ) {
 147                         return [];
 148                 }
 149
 150                 $doc = $this->getDoc();
 151
 152                 // Remove tags
 153
 154                 // You can't remove DOMNodes from a DOMNodeList as you're iterating
 155                 // over them in a foreach loop. It will seemingly leave the internal
 156                 // iterator on the foreach out of wack and results will be quite
 157                 // strange. Though, making a queue of items to remove seems to work.
 158                 $domElemsToRemove = [];
 159                 foreach ( $removals['TAG'] as $tagToRemove ) {
 160                         $tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove );
 161                         foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
 162                                 if ( $tagToRemoveNode ) {
 163                                         $domElemsToRemove[] = $tagToRemoveNode;
 164                                 }
 165                         }
 166                 }
 167                 $removed = $this->removeElements( $domElemsToRemove );
 168
 169                 // Elements with named IDs
 170                 $domElemsToRemove = [];
 171                 foreach ( $removals['ID'] as $itemToRemove ) {
 172                         $itemToRemoveNode = $doc->getElementById( $itemToRemove );
 173                         if ( $itemToRemoveNode ) {
 174                                 $domElemsToRemove[] = $itemToRemoveNode;
 175                         }
 176                 }
 177                 $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
 178
 179                 // CSS Classes
 180                 $domElemsToRemove = [];
 181                 $xpath = new \DOMXPath( $doc );
 182                 foreach ( $removals['CLASS'] as $classToRemove ) {
 183                         $elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' );
 184
 185                         /** @var $element DOMElement */
 186                         foreach ( $elements as $element ) {
 187                                 $classes = $element->getAttribute( 'class' );
 188                                 if ( \preg_match( "/\b$classToRemove\b/", $classes ) && $element->parentNode ) {
 189                                         $domElemsToRemove[] = $element;
 190                                 }
 191                         }
 192                 }
 193                 $removed = \array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
 194
 195                 // Tags with CSS Classes
 196                 foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
 197                         $parts = explode( '.', $classToRemove );
 198
 199                         $elements = $xpath->query(
 200                                 '//' . $parts[0] . '[@class="' . $parts[1] . '"]'
 201                         );
 202                         $removed = array_merge( $removed, $this->removeElements( $elements ) );
 203                 }
 204
 205                 return $removed;
 206         }
 207
 208         /**
 209          * Removes a list of elelments from DOMDocument
 210          * @param array|DOMNodeList $elements
 211          * @return array Array of removed elements
 212          */
 213         private function removeElements( $elements ) {
 214                 $list = $elements;
 215                 if ( $elements instanceof \DOMNodeList ) {
 216                         $list = [];
 217                         foreach ( $elements as $element ) {
 218                                 $list[] = $element;
 219                         }
 220                 }
 221                 /** @var $element DOMElement */
 222                 foreach ( $list as $element ) {
 223                         if ( $element->parentNode ) {
 224                                 $element->parentNode->removeChild( $element );
 225                         }
 226                 }
 227                 return $list;
 228         }
 229
 230         /**
 231          * libxml in its usual pointlessness converts many chars to entities - this function
 232          * perfoms a reverse conversion
 233          * @param string $html
 234          * @return string
 235          */
 236         private function fixLibXML( $html ) {
 237                 // We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
 238                 // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
 239                 $replacements = [
 240                         '&quot;' => '&amp;quot;',
 241                         '&amp;' => '&amp;amp;',
 242                         '&lt;' => '&amp;lt;',
 243                         '&gt;' => '&amp;gt;',
 244                 ];
 245                 $html = strtr( $html, $replacements );
 246
 247                 if ( \function_exists( 'mb_convert_encoding' ) ) {
 248                         // Just in case the conversion in getDoc() above used named
 249                         // entities that aren't known to html_entity_decode().
 250                         $html = \mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
 251                 } else {
 252                         $html = \html_entity_decode( $html, ENT_COMPAT, 'utf-8' );
 253                 }
 254                 return $html;
 255         }
 256
 257         /**
 258          * Performs final transformations and returns resulting HTML.  Note that if you want to call this
 259          * both without an element and with an element you should call it without an element first.  If you
 260          * specify the $element in the method it'll change the underlying dom and you won't be able to get
 261          * it back.
 262          *
 263          * @param DOMElement|string|null $element ID of element to get HTML from or
 264          *   false to get it from the whole tree
 265          * @return string Processed HTML
 266          */
 267         public function getText( $element = null ) {
 268
 269                 if ( $this->doc ) {
 270                         if ( $element !== null && !( $element instanceof \DOMElement ) ) {
 271                                 $element = $this->doc->getElementById( $element );
 272                         }
 273                         if ( $element ) {
 274                                 $body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
 275                                 $nodesArray = [];
 276                                 foreach ( $body->childNodes as $node ) {
 277                                         $nodesArray[] = $node;
 278                                 }
 279                                 foreach ( $nodesArray as $nodeArray ) {
 280                                         $body->removeChild( $nodeArray );
 281                                 }
 282                                 $body->appendChild( $element );
 283                         }
 284                         $html = $this->doc->saveHTML();
 285
 286                         $html = $this->fixLibXml( $html );
 287                         if ( PHP_EOL === "\r\n" ) {
 288                                 // Cleanup for CRLF misprocessing of unknown origin on Windows.
 289                                 $html = str_replace( '&#13;', '', $html );
 290                         }
 291                 } else {
 292                         $html = $this->html;
 293                 }
 294                 // Remove stuff added by wrapHTML()
 295                 $html = \preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
 296                 $html = $this->onHtmlReady( $html );
 297
 298                 if ( $this->elementsToFlatten ) {
 299                         $elements = \implode( '|', $this->elementsToFlatten );
 300                         $html = \preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
 301                 }
 302
 303                 return $html;
 304         }
 305
 306         /**
 307          * Helper function for parseItemsToRemove(). This function extracts the selector type
 308          * and the raw name of a selector from a CSS-style selector string and assigns those
 309          * values to parameters passed by reference. For example, if given '#toc' as the
 310          * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName.
 311          * @param string $selector CSS selector to parse
 312          * @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
 313          * @param string $rawName The raw name of the selector
 314          * @return bool Whether the selector was successfully recognised
 315          * @throws MWException
 316          */
 317         protected function parseSelector( $selector, &$type, &$rawName ) {
 318                 if ( strpos( $selector, '.' ) === 0 ) {
 319                         $type = 'CLASS';
 320                         $rawName = substr( $selector, 1 );
 321                 } elseif ( strpos( $selector, '#' ) === 0 ) {
 322                         $type = 'ID';
 323                         $rawName = substr( $selector, 1 );
 324                 } elseif ( strpos( $selector, '.' ) !== 0 && strpos( $selector, '.' ) !== false ) {
 325                         $type = 'TAG_CLASS';
 326                         $rawName = $selector;
 327                 } elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) {
 328                         $type = 'TAG';
 329                         $rawName = $selector;
 330                 } else {
 331                         throw new \Exception( __METHOD__ . "(): unrecognized selector '$selector'" );
 332                 }
 333
 334                 return true;
 335         }
 336
 337         /**
 338          * Transforms CSS-style selectors into an internal representation suitable for
 339          * processing by filterContent()
 340          * @return array
 341          */
 342         protected function parseItemsToRemove() {
 343                 $removals = [
 344                         'ID' => [],
 345                         'TAG' => [],
 346                         'CLASS' => [],
 347                         'TAG_CLASS' => [],
 348                 ];
 349
 350                 foreach ( $this->itemsToRemove as $itemToRemove ) {
 351                         $type = '';
 352                         $rawName = '';
 353                         if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
 354                                 $removals[$type][] = $rawName;
 355                         }
 356                 }
 357
 358                 if ( $this->removeMedia ) {
 359                         $removals['TAG'][] = 'img';
 360                         $removals['TAG'][] = 'audio';
 361                         $removals['TAG'][] = 'video';
 362                 }
 363
 364                 return $removals;
 365         }
 366 }