X-Git-Url: https://scripts.mit.edu/gitweb/autoinstallsdev/mediawiki.git/blobdiff_plain/19e297c21b10b1b8a3acad5e73fc71dcb35db44a..6932310fd58ebef145fa01eb76edf7150284d8ea:/vendor/wikimedia/html-formatter/src/HtmlFormatter.php diff --git a/vendor/wikimedia/html-formatter/src/HtmlFormatter.php b/vendor/wikimedia/html-formatter/src/HtmlFormatter.php new file mode 100644 index 00000000..e93ab1f0 --- /dev/null +++ b/vendor/wikimedia/html-formatter/src/HtmlFormatter.php @@ -0,0 +1,366 @@ +html = $html; + } + + /** + * Turns a chunk of HTML into a proper document + * @param string $html + * @return string + */ + public static function wrapHTML( $html ) { + return '' . $html . ''; + } + + /** + * Override this in descendant class to modify HTML after it has been converted from DOM tree + * @param string $html HTML to process + * @return string Processed HTML + */ + protected function onHtmlReady( $html ) { + return $html; + } + + /** + * @return DOMDocument DOM to manipulate + */ + public function getDoc() { + if ( !$this->doc ) { + // DOMDocument::loadHTML apparently isn't very good with encodings, so + // convert input to ASCII by encoding everything above 128 as entities. + $html = \mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' ); + + // Workaround for bug that caused spaces before references + // to disappear during processing: https://phabricator.wikimedia.org/T55086 + $html = str_replace( ' <', ' <', $html ); + + \libxml_use_internal_errors( true ); + $loader = \libxml_disable_entity_loader(); + $this->doc = new \DOMDocument(); + $this->doc->strictErrorChecking = false; + $this->doc->loadHTML( $html ); + \libxml_disable_entity_loader( $loader ); + \libxml_use_internal_errors( false ); + $this->doc->encoding = 'UTF-8'; + } + return $this->doc; + } + + /** + * Sets whether images/videos/sounds should be removed from output + * @param bool $flag + */ + public function setRemoveMedia( $flag = true ) { + $this->removeMedia = $flag; + } + + /** + * Adds one or more selector of content to remove. A subset of CSS selector + * syntax is supported: + * + * + * .class + * . + * # + * + * @param array|string $selectors Selector(s) of stuff to remove + */ + public function remove( $selectors ) { + $this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors ); + } + + /** + * Adds one or more element name to the list to flatten (remove tag, but not its content) + * Can accept undelimited regexes + * + * Note this interface may fail in surprising unexpected ways due to usage of regexes, + * so should not be relied on for HTML markup security measures. + * + * @param array|string $elements Name(s) of tag(s) to flatten + */ + public function flatten( $elements ) { + $this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements ); + } + + /** + * Instructs the formatter to flatten all tags + */ + public function flattenAllTags() { + $this->flatten( '[?!]?[a-z0-9]+' ); + } + + /** + * Removes content we've chosen to remove. The text of the removed elements can be + * extracted with the getText method. + * @return array Array of removed DOMElements + */ + public function filterContent() { + $removals = $this->parseItemsToRemove(); + + // Bail out early if nothing to do + if ( \array_reduce( $removals, + function ( $carry, $item ) { + return $carry && !$item; + }, + true + ) ) { + return []; + } + + $doc = $this->getDoc(); + + // Remove tags + + // You can't remove DOMNodes from a DOMNodeList as you're iterating + // over them in a foreach loop. It will seemingly leave the internal + // iterator on the foreach out of wack and results will be quite + // strange. Though, making a queue of items to remove seems to work. + $domElemsToRemove = []; + foreach ( $removals['TAG'] as $tagToRemove ) { + $tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove ); + foreach ( $tagToRemoveNodes as $tagToRemoveNode ) { + if ( $tagToRemoveNode ) { + $domElemsToRemove[] = $tagToRemoveNode; + } + } + } + $removed = $this->removeElements( $domElemsToRemove ); + + // Elements with named IDs + $domElemsToRemove = []; + foreach ( $removals['ID'] as $itemToRemove ) { + $itemToRemoveNode = $doc->getElementById( $itemToRemove ); + if ( $itemToRemoveNode ) { + $domElemsToRemove[] = $itemToRemoveNode; + } + } + $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) ); + + // CSS Classes + $domElemsToRemove = []; + $xpath = new \DOMXPath( $doc ); + foreach ( $removals['CLASS'] as $classToRemove ) { + $elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' ); + + /** @var $element DOMElement */ + foreach ( $elements as $element ) { + $classes = $element->getAttribute( 'class' ); + if ( \preg_match( "/\b$classToRemove\b/", $classes ) && $element->parentNode ) { + $domElemsToRemove[] = $element; + } + } + } + $removed = \array_merge( $removed, $this->removeElements( $domElemsToRemove ) ); + + // Tags with CSS Classes + foreach ( $removals['TAG_CLASS'] as $classToRemove ) { + $parts = explode( '.', $classToRemove ); + + $elements = $xpath->query( + '//' . $parts[0] . '[@class="' . $parts[1] . '"]' + ); + $removed = array_merge( $removed, $this->removeElements( $elements ) ); + } + + return $removed; + } + + /** + * Removes a list of elelments from DOMDocument + * @param array|DOMNodeList $elements + * @return array Array of removed elements + */ + private function removeElements( $elements ) { + $list = $elements; + if ( $elements instanceof \DOMNodeList ) { + $list = []; + foreach ( $elements as $element ) { + $list[] = $element; + } + } + /** @var $element DOMElement */ + foreach ( $list as $element ) { + if ( $element->parentNode ) { + $element->parentNode->removeChild( $element ); + } + } + return $list; + } + + /** + * libxml in its usual pointlessness converts many chars to entities - this function + * perfoms a reverse conversion + * @param string $html + * @return string + */ + private function fixLibXML( $html ) { + // We don't include rules like '"' => '"' because entities had already been + // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE! + $replacements = [ + '"' => '"', + '&' => '&', + '<' => '<', + '>' => '>', + ]; + $html = strtr( $html, $replacements ); + + if ( \function_exists( 'mb_convert_encoding' ) ) { + // Just in case the conversion in getDoc() above used named + // entities that aren't known to html_entity_decode(). + $html = \mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' ); + } else { + $html = \html_entity_decode( $html, ENT_COMPAT, 'utf-8' ); + } + return $html; + } + + /** + * Performs final transformations and returns resulting HTML. Note that if you want to call this + * both without an element and with an element you should call it without an element first. If you + * specify the $element in the method it'll change the underlying dom and you won't be able to get + * it back. + * + * @param DOMElement|string|null $element ID of element to get HTML from or + * false to get it from the whole tree + * @return string Processed HTML + */ + public function getText( $element = null ) { + + if ( $this->doc ) { + if ( $element !== null && !( $element instanceof \DOMElement ) ) { + $element = $this->doc->getElementById( $element ); + } + if ( $element ) { + $body = $this->doc->getElementsByTagName( 'body' )->item( 0 ); + $nodesArray = []; + foreach ( $body->childNodes as $node ) { + $nodesArray[] = $node; + } + foreach ( $nodesArray as $nodeArray ) { + $body->removeChild( $nodeArray ); + } + $body->appendChild( $element ); + } + $html = $this->doc->saveHTML(); + + $html = $this->fixLibXml( $html ); + if ( PHP_EOL === "\r\n" ) { + // Cleanup for CRLF misprocessing of unknown origin on Windows. + $html = str_replace( ' ', '', $html ); + } + } else { + $html = $this->html; + } + // Remove stuff added by wrapHTML() + $html = \preg_replace( '/|^.*?|<\/body>.*$/s', '', $html ); + $html = $this->onHtmlReady( $html ); + + if ( $this->elementsToFlatten ) { + $elements = \implode( '|', $this->elementsToFlatten ); + $html = \preg_replace( "#]*>#is", '', $html ); + } + + return $html; + } + + /** + * Helper function for parseItemsToRemove(). This function extracts the selector type + * and the raw name of a selector from a CSS-style selector string and assigns those + * values to parameters passed by reference. For example, if given '#toc' as the + * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName. + * @param string $selector CSS selector to parse + * @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG) + * @param string $rawName The raw name of the selector + * @return bool Whether the selector was successfully recognised + * @throws MWException + */ + protected function parseSelector( $selector, &$type, &$rawName ) { + if ( strpos( $selector, '.' ) === 0 ) { + $type = 'CLASS'; + $rawName = substr( $selector, 1 ); + } elseif ( strpos( $selector, '#' ) === 0 ) { + $type = 'ID'; + $rawName = substr( $selector, 1 ); + } elseif ( strpos( $selector, '.' ) !== 0 && strpos( $selector, '.' ) !== false ) { + $type = 'TAG_CLASS'; + $rawName = $selector; + } elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) { + $type = 'TAG'; + $rawName = $selector; + } else { + throw new \Exception( __METHOD__ . "(): unrecognized selector '$selector'" ); + } + + return true; + } + + /** + * Transforms CSS-style selectors into an internal representation suitable for + * processing by filterContent() + * @return array + */ + protected function parseItemsToRemove() { + $removals = [ + 'ID' => [], + 'TAG' => [], + 'CLASS' => [], + 'TAG_CLASS' => [], + ]; + + foreach ( $this->itemsToRemove as $itemToRemove ) { + $type = ''; + $rawName = ''; + if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) { + $removals[$type][] = $rawName; + } + } + + if ( $this->removeMedia ) { + $removals['TAG'][] = 'img'; + $removals['TAG'][] = 'audio'; + $removals['TAG'][] = 'video'; + } + + return $removals; + } +}