--- /dev/null
+<?php
+
+namespace RemexHtml;
+
+/**
+ * Generate HTMLData.php. This can be executed e.g. with
+ *
+ * echo 'RemexHtml\GenerateDataFiles::run()' | hhvm bin/test.php
+ */
+class GenerateDataFiles {
+ // Namespace URIs. NS_HTML, NS_MATHML and NS_SVG key the $special table below;
+ // execute() writes constants with these same six names and values into the
+ // generated HTMLData class.
+ const NS_HTML = 'http://www.w3.org/1999/xhtml';
+ const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
+ const NS_SVG = 'http://www.w3.org/2000/svg';
+ const NS_XLINK = 'http://www.w3.org/1999/xlink';
+ const NS_XML = 'http://www.w3.org/XML/1998/namespace';
+ const NS_XMLNS = 'http://www.w3.org/2000/xmlns/';
+
+ /**
+  * Sole public entry point: build a generator instance and let it
+  * produce the data file.
+  */
+ public static function run() {
+     ( new self )->execute();
+ }
+
+ /**
+ * Replacement table for numeric character references, copied from
+ * https://www.w3.org/TR/2014/REC-html5-20141028/syntax.html#tokenizing-character-references
+ *
+ * Each row is: the number given in the reference (hex), the code point to
+ * use instead, and a human-readable name. execute() reads only the first
+ * two columns (the "0x.." and "U+...." values); the rest of each row is
+ * ignored.
+ */
+ private static $legacyNumericEntityData = <<<EOT
+0x00 U+FFFD REPLACEMENT CHARACTER
+0x80 U+20AC EURO SIGN (€)
+0x82 U+201A SINGLE LOW-9 QUOTATION MARK (‚)
+0x83 U+0192 LATIN SMALL LETTER F WITH HOOK (ƒ)
+0x84 U+201E DOUBLE LOW-9 QUOTATION MARK („)
+0x85 U+2026 HORIZONTAL ELLIPSIS (…)
+0x86 U+2020 DAGGER (†)
+0x87 U+2021 DOUBLE DAGGER (‡)
+0x88 U+02C6 MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
+0x89 U+2030 PER MILLE SIGN (‰)
+0x8A U+0160 LATIN CAPITAL LETTER S WITH CARON (Š)
+0x8B U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
+0x8C U+0152 LATIN CAPITAL LIGATURE OE (Œ)
+0x8E U+017D LATIN CAPITAL LETTER Z WITH CARON (Ž)
+0x91 U+2018 LEFT SINGLE QUOTATION MARK (‘)
+0x92 U+2019 RIGHT SINGLE QUOTATION MARK (’)
+0x93 U+201C LEFT DOUBLE QUOTATION MARK (“)
+0x94 U+201D RIGHT DOUBLE QUOTATION MARK (”)
+0x95 U+2022 BULLET (•)
+0x96 U+2013 EN DASH (–)
+0x97 U+2014 EM DASH (—)
+0x98 U+02DC SMALL TILDE (˜)
+0x99 U+2122 TRADE MARK SIGN (™)
+0x9A U+0161 LATIN SMALL LETTER S WITH CARON (š)
+0x9B U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
+0x9C U+0153 LATIN SMALL LIGATURE OE (œ)
+0x9E U+017E LATIN SMALL LETTER Z WITH CARON (ž)
+0x9F U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
+EOT;
+
+ /**
+ * This is the list of public identifier prefixes that cause quirks mode
+ * to be set, from § 8.2.5.4.1
+ *
+ * execute() combines these into a single alternation which is written to
+ * the generated file as HTMLData::$quirkyPrefixRegex.
+ */
+ private static $quirkyPublicPrefixes = [
+ "+//Silmaril//dtd html Pro v0r11 19970101//",
+ "-//AS//DTD HTML 3.0 asWedit + extensions//",
+ "-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//",
+ "-//IETF//DTD HTML 2.0 Level 1//",
+ "-//IETF//DTD HTML 2.0 Level 2//",
+ "-//IETF//DTD HTML 2.0 Strict Level 1//",
+ "-//IETF//DTD HTML 2.0 Strict Level 2//",
+ "-//IETF//DTD HTML 2.0 Strict//",
+ "-//IETF//DTD HTML 2.0//",
+ "-//IETF//DTD HTML 2.1E//",
+ "-//IETF//DTD HTML 3.0//",
+ "-//IETF//DTD HTML 3.2 Final//",
+ "-//IETF//DTD HTML 3.2//",
+ "-//IETF//DTD HTML 3//",
+ "-//IETF//DTD HTML Level 0//",
+ "-//IETF//DTD HTML Level 1//",
+ "-//IETF//DTD HTML Level 2//",
+ "-//IETF//DTD HTML Level 3//",
+ "-//IETF//DTD HTML Strict Level 0//",
+ "-//IETF//DTD HTML Strict Level 1//",
+ "-//IETF//DTD HTML Strict Level 2//",
+ "-//IETF//DTD HTML Strict Level 3//",
+ "-//IETF//DTD HTML Strict//",
+ "-//IETF//DTD HTML//",
+ "-//Metrius//DTD Metrius Presentational//",
+ "-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//",
+ "-//Microsoft//DTD Internet Explorer 2.0 HTML//",
+ "-//Microsoft//DTD Internet Explorer 2.0 Tables//",
+ "-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//",
+ "-//Microsoft//DTD Internet Explorer 3.0 HTML//",
+ "-//Microsoft//DTD Internet Explorer 3.0 Tables//",
+ "-//Netscape Comm. Corp.//DTD HTML//",
+ "-//Netscape Comm. Corp.//DTD Strict HTML//",
+ "-//O'Reilly and Associates//DTD HTML 2.0//",
+ "-//O'Reilly and Associates//DTD HTML Extended 1.0//",
+ "-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//",
+ "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//",
+ "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//",
+ "-//Spyglass//DTD HTML 2.0 Extended//",
+ "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//",
+ "-//Sun Microsystems Corp.//DTD HotJava HTML//",
+ "-//Sun Microsystems Corp.//DTD HotJava Strict HTML//",
+ "-//W3C//DTD HTML 3 1995-03-24//",
+ "-//W3C//DTD HTML 3.2 Draft//",
+ "-//W3C//DTD HTML 3.2 Final//",
+ "-//W3C//DTD HTML 3.2//",
+ "-//W3C//DTD HTML 3.2S Draft//",
+ "-//W3C//DTD HTML 4.0 Frameset//",
+ "-//W3C//DTD HTML 4.0 Transitional//",
+ "-//W3C//DTD HTML Experimental 19960712//",
+ "-//W3C//DTD HTML Experimental 970421//",
+ "-//W3C//DTD W3 HTML//",
+ "-//W3O//DTD W3 HTML 3.0//",
+ "-//WebTechs//DTD Mozilla HTML 2.0//",
+ "-//WebTechs//DTD Mozilla HTML//",
+ ];
+
+ /**
+ * Element names grouped by namespace URI, written as comma-separated
+ * strings for readability. execute() splits each string on commas, trims
+ * the surrounding whitespace (including the line breaks below) and emits
+ * HTMLData::$special as [ namespace => [ name => true ] ].
+ *
+ * NOTE(review): presumably this is the "special" element category of the
+ * HTML parsing specification - confirm against the spec before editing.
+ */
+ private static $special = [
+ self::NS_HTML => 'address, applet, area, article, aside, base,
+ basefont, bgsound, blockquote, body, br, button, caption, center,
+ col, colgroup, dd, details, dir, div, dl, dt, embed, fieldset,
+ figcaption, figure, footer, form, frame, frameset, h1, h2, h3, h4,
+ h5, h6, head, header, hr, html, iframe, img, input, li, link,
+ listing, main, marquee, menu, menuitem, meta, nav, noembed,
+ noframes, noscript, object, ol, p, param, plaintext, pre, script,
+ section, select, source, style, summary, table, tbody, td, template,
+ textarea, tfoot, th, thead, title, tr, track, ul, wbr, xmp',
+ self::NS_MATHML => 'mi, mo, mn, ms, mtext, annotation-xml',
+ self::NS_SVG => 'foreignObject, desc, title',
+ ];
+
+ // @codingStandardsIgnoreStart
+ /**
+ * The NameStartChar production from XML 1.0, but with the colon excluded,
+ * since there are a lot of ways to break namespace validation, and we
+ * actually need this for local names.
+ *
+ * The text uses the notation that getCharRanges() parses: alternatives
+ * separated by "|", each one a quoted character, a literal range such as
+ * [A-Z], a hex code point such as #xB7, or a hex range such as [#xC0-#xD6].
+ */
+ private static $nameStartChar = '[A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]';
+
+ /**
+ * The NameChar production from XML 1.0. The "NameStartChar" alternative is
+ * a reference to the production above; execute() supplies it as a
+ * nonterminal when this string is expanded.
+ */
+ private static $nameChar = 'NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]';
+ // @codingStandardsIgnoreEnd
+
+ /**
+  * Turn a list of strings into the body of a regex alternation.
+  *
+  * The first byte of every value is dropped before quoting, and each
+  * branch is placed on its own line ("\n\t\t" prefix). Branches are quoted
+  * for use with "~" as the pattern delimiter.
+  *
+  * @param array $array Strings to match
+  * @return string Branches joined with "|", or "" for an empty list
+  */
+ private function makeRegexAlternation( $array ) {
+     $branches = [];
+     foreach ( $array as $value ) {
+         $withoutFirstByte = substr( $value, 1 );
+         $branches[] = "\n\t\t" . preg_quote( $withoutFirstByte, '~' );
+     }
+     return implode( '|', $branches );
+ }
+
+ /**
+  * Parse a character-class production into a sorted list of code point ranges.
+  *
+  * The input is a list of alternatives separated by "|". Each alternative
+  * must be a quoted single character ("_"), a literal range ([A-Z]), a hex
+  * code point (#xB7), a hex range ([#xC0-#xD6]), or a key of $nonterminals.
+  * A nonterminal is expanded by parsing its text; the nonterminal map is
+  * not passed on to that nested parse.
+  *
+  * @param string $input Production text
+  * @param array $nonterminals Map of production name => production text
+  * @return array List of [ first, last ] integer pairs, ordered by first
+  * @throws \Exception If an alternative has none of the accepted forms
+  */
+ private function getCharRanges( $input, $nonterminals = [] ) {
+     $found = [];
+     $alternatives = preg_split( '/\s*\|\s*/', $input );
+
+     foreach ( $alternatives as $case ) {
+         // Quoted single character
+         if ( preg_match( '/^"(.)"$/', $case, $m ) ) {
+             $found[] = [ ord( $m[1] ), ord( $m[1] ) ];
+             continue;
+         }
+         // Literal range such as [A-Z]
+         if ( preg_match( '/^\[(.)-(.)\]$/', $case, $m ) ) {
+             $found[] = [ ord( $m[1] ), ord( $m[2] ) ];
+             continue;
+         }
+         // Single hex code point
+         if ( preg_match( '/^#x([0-9A-F]+)$/', $case, $m ) ) {
+             $single = intval( $m[1], 16 );
+             $found[] = [ $single, $single ];
+             continue;
+         }
+         // Hex range
+         if ( preg_match( '/^\[#x([0-9A-F]+)-#x([0-9A-F]+)\]$/', $case, $m ) ) {
+             $found[] = [ intval( $m[1], 16 ), intval( $m[2], 16 ) ];
+             continue;
+         }
+         // Anything else has to be a known production name
+         if ( !isset( $nonterminals[$case] ) ) {
+             throw new \Exception( "Invalid XML char case \"$case\"" );
+         }
+         $expanded = $this->getCharRanges( $nonterminals[$case] );
+         $found = array_merge( $found, $expanded );
+     }
+
+     usort( $found, function ( $left, $right ) {
+         return $left[0] - $right[0];
+     } );
+     return $found;
+ }
+
+ /**
+  * Build a flat table describing every code point NOT matched by a
+  * character-class production (the set complement within 0..0x10FFFF).
+  *
+  * The production is parsed with getCharRanges(), which returns ranges
+  * ordered by their first code point. Each gap between ranges becomes four
+  * consecutive integers in the result: gap start, gap end, 0, 0xffffff.
+  * NOTE(review): the trailing 0 / 0xffffff look like the offset and mask of
+  * an mbstring conversion map row - confirm against the code that consumes
+  * HTMLData.
+  *
+  * Adjacent, overlapping and nested ranges are all coalesced, and no row is
+  * emitted for an empty gap (for example when a range starts at 0 or ends
+  * at 0x10FFFF).
+  *
+  * @param string $input Production text
+  * @param array $nonterminals Map of production name => production text
+  * @return array Flat list of integers, four per gap
+  */
+ private function makeConvTable( $input, $nonterminals = [] ) {
+     $ranges = $this->getCharRanges( $input, $nonterminals );
+     $maxCodepoint = 0x10ffff;
+
+     // First code point not yet known to be covered by a range
+     $gapStart = 0;
+     $table = [];
+     foreach ( $ranges as $range ) {
+         $start = $range[0];
+         $end = $range[1];
+
+         // Emit the gap in front of this range, if there is one. When the
+         // range touches or overlaps what was already covered there is no
+         // gap and nothing is emitted.
+         if ( $start > $gapStart ) {
+             $table[] = $gapStart;
+             $table[] = $start - 1;
+             $table[] = 0;
+             $table[] = 0xffffff;
+         }
+
+         // Never move backwards: a nested range must not re-expose code
+         // points that an earlier, longer range already covered.
+         if ( $end + 1 > $gapStart ) {
+             $gapStart = $end + 1;
+         }
+     }
+
+     // Everything after the last range, up to the top of the code space
+     if ( $gapStart <= $maxCodepoint ) {
+         $table[] = $gapStart;
+         $table[] = $maxCodepoint;
+         $table[] = 0;
+         $table[] = 0xffffff;
+     }
+
+     return $table;
+ }
+
+ /**
+  * Render a flat conversion table as PHP array source, four numbers per line.
+  *
+  * @param array $table Flat list of integers
+  * @return string Text of the form "[\n\t\ta, b, c, d,\n\t\te, f, g, h ]"
+  */
+ private function encodeConvTable( $table ) {
+     $rows = [];
+     foreach ( array_chunk( $table, 4 ) as $quad ) {
+         $rows[] = implode( ', ', $quad );
+     }
+     return "[\n\t\t" . implode( ",\n\t\t", $rows ) . ' ]';
+ }
+
+ /**
+  * Build the alternation body for the quirks-mode public identifier regex.
+  *
+  * Unlike makeRegexAlternation(), every prefix is used in full. The pattern
+  * is compiled with the "x" modifier, under which unescaped whitespace is
+  * skipped and an unescaped "#" starts a comment, so those characters are
+  * escaped explicitly; everything else goes through preg_quote().
+  *
+  * @param array $prefixes Public identifier prefixes
+  * @return string Branches joined with "|", each on its own line
+  */
+ private function makeQuirkyPrefixAlternation( $prefixes ) {
+     $branches = [];
+     foreach ( $prefixes as $prefix ) {
+         // Odd-indexed pieces are the captured whitespace / "#" delimiters
+         $pieces = preg_split( '/([\s#])/', $prefix, -1, PREG_SPLIT_DELIM_CAPTURE );
+         $quoted = '';
+         foreach ( $pieces as $index => $piece ) {
+             $quoted .= ( $index % 2 === 1 )
+                 ? '\\' . $piece
+                 : preg_quote( $piece, '~' );
+         }
+         $branches[] = "\n\t\t" . $quoted;
+     }
+     return implode( '|', $branches );
+ }
+
+ /**
+  * Build all lookup tables and write the generated HTMLData.php beside
+  * this file.
+  *
+  * Reads entities.json from this directory and combines it with the static
+  * tables on this class: the named entity regex and translations, the
+  * legacy numeric entity map, the quirks-mode prefix regex, the two name
+  * character tables and the element table.
+  *
+  * @throws \Exception If entities.json cannot be read or is not a JSON
+  *   object, or if HTMLData.php cannot be written
+  */
+ private function execute() {
+     $entitiesJson = file_get_contents( __DIR__ . '/entities.json' );
+
+     if ( $entitiesJson === false ) {
+         throw new \Exception( "Please download entities.json from " .
+             "https://www.w3.org/TR/2016/REC-html51-20161101/entities.json" );
+     }
+
+     // A parse failure yields null, which would otherwise be cast to an
+     // empty array and silently produce empty entity tables.
+     $decoded = json_decode( $entitiesJson );
+     if ( !is_object( $decoded ) ) {
+         throw new \Exception( "entities.json does not contain a valid JSON object" );
+     }
+     $entities = (array)$decoded;
+
+     // Keys look like "&amp;"; the translation table is keyed without the "&"
+     $entityTranslations = [];
+     foreach ( $entities as $entity => $info ) {
+         $entityTranslations[substr( $entity, 1 )] = $info->characters;
+     }
+
+     // Sort descending by length, then by name, so that the alternation
+     // lists longer names before any shorter name that is a prefix of them
+     uksort( $entities, function ( $a, $b ) {
+         if ( strlen( $a ) > strlen( $b ) ) {
+             return -1;
+         } elseif ( strlen( $a ) < strlen( $b ) ) {
+             return 1;
+         } else {
+             return strcmp( $a, $b );
+         }
+     } );
+
+     // makeRegexAlternation() drops the leading "&" of every entity name
+     $entityRegex = $this->makeRegexAlternation( array_keys( $entities ) );
+
+     // Only the "0x.." and "U+...." columns of the legacy table are used
+     $matches = [];
+     preg_match_all( '/^0x([0-9A-F]+)\s+U\+([0-9A-F]+)/m',
+         self::$legacyNumericEntityData, $matches, PREG_SET_ORDER );
+
+     $legacyNumericEntities = [];
+     foreach ( $matches as $match ) {
+         $legacyNumericEntities[ intval( $match[1], 16 ) ] =
+             \UtfNormal\Utils::codepointToUtf8( intval( $match[2], 16 ) );
+     }
+
+     // x: layout whitespace in the pattern is ignored, A: anchored at the
+     // start of the subject, i: case-insensitive
+     $quirkyRegex =
+         '~' .
+         $this->makeQuirkyPrefixAlternation( self::$quirkyPublicPrefixes ) .
+         '~xAi';
+
+     $nameStartCharConvTable = $this->makeConvTable( self::$nameStartChar );
+     $nameCharConvTable = $this->makeConvTable( self::$nameChar,
+         [ 'NameStartChar' => self::$nameStartChar ] );
+
+     $encEntityRegex = var_export( $entityRegex, true );
+     $encTranslations = var_export( $entityTranslations, true );
+     $encLegacy = var_export( $legacyNumericEntities, true );
+     $encQuirkyRegex = var_export( $quirkyRegex, true );
+     $encNameStartCharConvTable = $this->encodeConvTable( $nameStartCharConvTable );
+     $encNameCharConvTable = $this->encodeConvTable( $nameCharConvTable );
+
+     // Expand the comma-separated name lists into [ namespace => [ name => true ] ]
+     $special = [];
+     foreach ( self::$special as $ns => $str ) {
+         foreach ( explode( ',', $str ) as $name ) {
+             $special[$ns][trim( $name )] = true;
+         }
+     }
+     $encSpecial = var_export( $special, true );
+
+     // The opening "<" is kept out of the heredoc so that this file does
+     // not itself contain a literal PHP open tag in the template.
+     $fileContents = '<' . <<<PHP
+?php
+
+/**
+ * This data file is machine generated, see GenerateDataFiles.php
+ */
+
+namespace RemexHtml;
+
+class HTMLData {
+ const NS_HTML = 'http://www.w3.org/1999/xhtml';
+ const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
+ const NS_SVG = 'http://www.w3.org/2000/svg';
+ const NS_XLINK = 'http://www.w3.org/1999/xlink';
+ const NS_XML = 'http://www.w3.org/XML/1998/namespace';
+ const NS_XMLNS = 'http://www.w3.org/2000/xmlns/';
+
+ static public \$special = $encSpecial;
+ static public \$namedEntityRegex = $encEntityRegex;
+ static public \$namedEntityTranslations = $encTranslations;
+ static public \$legacyNumericEntities = $encLegacy;
+ static public \$quirkyPrefixRegex = $encQuirkyRegex;
+ static public \$nameStartCharConvTable = $encNameStartCharConvTable;
+ static public \$nameCharConvTable = $encNameCharConvTable;
+}
+PHP;
+
+     if ( file_put_contents( __DIR__ . '/HTMLData.php', $fileContents ) === false ) {
+         throw new \Exception( "Unable to write " . __DIR__ . '/HTMLData.php' );
+     }
+ }
+}