X-Git-Url: https://scripts.mit.edu/gitweb/autoinstallsdev/mediawiki.git/blobdiff_plain/19e297c21b10b1b8a3acad5e73fc71dcb35db44a..6932310fd58ebef145fa01eb76edf7150284d8ea:/vendor/wikimedia/remex-html/RemexHtml/GenerateDataFiles.php diff --git a/vendor/wikimedia/remex-html/RemexHtml/GenerateDataFiles.php b/vendor/wikimedia/remex-html/RemexHtml/GenerateDataFiles.php new file mode 100644 index 00000000..13430ffd --- /dev/null +++ b/vendor/wikimedia/remex-html/RemexHtml/GenerateDataFiles.php @@ -0,0 +1,324 @@ +execute(); + } + + /** + * This is the character entity mapping table copied from + * https://www.w3.org/TR/2014/REC-html5-20141028/syntax.html#tokenizing-character-references + */ + private static $legacyNumericEntityData = << 'address, applet, area, article, aside, base, + basefont, bgsound, blockquote, body, br, button, caption, center, + col, colgroup, dd, details, dir, div, dl, dt, embed, fieldset, + figcaption, figure, footer, form, frame, frameset, h1, h2, h3, h4, + h5, h6, head, header, hr, html, iframe, img, input, li, link, + listing, main, marquee, menu, menuitem, meta, nav, noembed, + noframes, noscript, object, ol, p, param, plaintext, pre, script, + section, select, source, style, summary, table, tbody, td, template, + textarea, tfoot, th, thead, title, tr, track, ul, wbr, xmp', + self::NS_MATHML => 'mi, mo, mn, ms, mtext, annotation-xml', + self::NS_SVG => 'foreignObject, desc, title', + ]; + + // @codingStandardsIgnoreStart + /** + * The NameStartChar production from XML 1.0, but with colon excluded since + * there's a lot of ways to break namespace validation, and we actually need + * this for local names + */ + private static $nameStartChar = '[A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]'; + + /** The NameChar production from XML 1.0 */ + private static $nameChar = 'NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]'; + // @codingStandardsIgnoreEnd + + private function makeRegexAlternation( $array ) { + $regex = ''; + foreach ( $array as $value ) { + if ( $regex !== '' ) { + $regex .= '|'; + } + $regex .= "\n\t\t" . preg_quote( substr( $value, 1 ), '~' ); + } + return $regex; + } + + private function getCharRanges( $input, $nonterminals = [] ) { + $ranges = []; + + foreach ( preg_split( '/\s*\|\s*/', $input ) as $case ) { + if ( preg_match( '/^"(.)"$/', $case, $m ) ) { + // Single ASCII character + $ranges[] = [ ord( $m[1] ), ord( $m[1] ) ]; + } elseif ( preg_match( '/^\[(.)-(.)\]$/', $case, $m ) ) { + // ASCII range + $ranges[] = [ ord( $m[1] ), ord( $m[2] ) ]; + } elseif ( preg_match( '/^#x([0-9A-F]+)$/', $case, $m ) ) { + // Single encoded character + $codepoint = intval( $m[1], 16 ); + $ranges[] = [ $codepoint, $codepoint ]; + } elseif ( preg_match( '/^\[#x([0-9A-F]+)-#x([0-9A-F]+)\]$/', $case, $m ) ) { + // Encoded range + $ranges[] = [ intval( $m[1], 16 ), intval( $m[2], 16 ) ]; + } elseif ( isset( $nonterminals[$case] ) ) { + $ranges = array_merge( $ranges, $this->getCharRanges( $nonterminals[$case] ) ); + } else { + throw new \Exception( "Invalid XML char case \"$case\"" ); + } + } + usort( $ranges, function ( $a, $b ) { + return $a[0] - $b[0]; + } ); + return $ranges; + } + + private function makeConvTable( $input, $nonterminals = [] ) { + $ranges = $this->getCharRanges( $input, $nonterminals ); + + // Invert the ranges, produce a set complement + $lastEndPlusOne = 0; + $table = []; + for ( $i = 0; $i < count( $ranges ); $i++ ) { + $start = $ranges[$i][0]; + $end = $ranges[$i][1]; + // Merge consecutive ranges + for ( $j = $i + 1; $j < count( $ranges ); $j++ ) { + if ( $ranges[$j][0] === $end + 1 ) { + $end = $ranges[$j][1]; + $i = $j; + } else { + break; + } + } + + $table[] = $lastEndPlusOne; + $table[] = $start - 1; + $table[] = 0; + $table[] = 0xffffff; + + $lastEndPlusOne = $end + 1; + } + + // Last range + $table[] = $lastEndPlusOne; + $table[] = 0x10ffff; + $table[] = 0; + $table[] = 0xffffff; + + return $table; + } + + private function encodeConvTable( $table ) { + return "[\n\t\t" . implode( ",\n\t\t", array_map( + function ( $a ) { + return implode( ', ', $a ); + }, + array_chunk( $table, 4 ) ) ) . ' ]'; + } + + private function execute() { + $entitiesJson = file_get_contents( __DIR__ . '/entities.json' ); + + if ( $entitiesJson === false ) { + throw new \Exception( "Please download entities.json from " . + "https://www.w3.org/TR/2016/REC-html51-20161101/entities.json" ); + } + + $entities = (array)json_decode( $entitiesJson ); + + $entityTranslations = []; + foreach ( $entities as $entity => $info ) { + $entityTranslations[substr( $entity, 1 )] = $info->characters; + } + + // Sort descending by length + uksort( $entities, function ( $a, $b ) { + if ( strlen( $a ) > strlen( $b ) ) { + return -1; + } elseif ( strlen( $a ) < strlen( $b ) ) { + return 1; + } else { + return strcmp( $a, $b ); + } + } ); + + $entityRegex = $this->makeRegexAlternation( array_keys( $entities ) ); + + $matches = []; + preg_match_all( '/^0x([0-9A-F]+)\s+U\+([0-9A-F]+)/m', + self::$legacyNumericEntityData, $matches, PREG_SET_ORDER ); + + $legacyNumericEntities = []; + foreach ( $matches as $match ) { + $legacyNumericEntities[ intval( $match[1], 16 ) ] = + \UtfNormal\Utils::codepointToUtf8( intval( $match[2], 16 ) ); + } + + $quirkyRegex = + '~' . + $this->makeRegexAlternation( self::$quirkyPublicPrefixes ) . + '~xAi'; + + $nameStartCharConvTable = $this->makeConvTable( self::$nameStartChar ); + $nameCharConvTable = $this->makeConvTable( self::$nameChar, + [ 'NameStartChar' => self::$nameStartChar ] ); + + $encEntityRegex = var_export( $entityRegex, true ); + $encTranslations = var_export( $entityTranslations, true ); + $encLegacy = var_export( $legacyNumericEntities, true ); + $encQuirkyRegex = var_export( $quirkyRegex, true ); + $encNameStartCharConvTable = $this->encodeConvTable( $nameStartCharConvTable ); + $encNameCharConvTable = $this->encodeConvTable( $nameCharConvTable ); + + $special = []; + foreach ( self::$special as $ns => $str ) { + foreach ( explode( ',', $str ) as $name ) { + $special[$ns][trim( $name )] = true; + } + } + $encSpecial = var_export( $special, true ); + + $fileContents = '<' . <<