]> scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blobdiff - vendor/wikimedia/remex-html/RemexHtml/Tokenizer/Tokenizer.php
MediaWiki 1.30.2
[autoinstallsdev/mediawiki.git] / vendor / wikimedia / remex-html / RemexHtml / Tokenizer / Tokenizer.php
diff --git a/vendor/wikimedia/remex-html/RemexHtml/Tokenizer/Tokenizer.php b/vendor/wikimedia/remex-html/RemexHtml/Tokenizer/Tokenizer.php
new file mode 100644 (file)
index 0000000..3cf3540
--- /dev/null
@@ -0,0 +1,1485 @@
+<?php
+
+namespace RemexHtml\Tokenizer;
+use RemexHtml\HTMLData;
+use RemexHtml\PropGuard;
+
+/**
+ * HTML 5 tokenizer
+ *
+ * Based on the W3C recommendation as published 01 November 2016:
+ * https://www.w3.org/TR/2016/REC-html51-20161101/
+ */
+class Tokenizer {
+       // States
+       const STATE_START = 1;
+       const STATE_DATA = 2;
+       const STATE_RCDATA = 3;
+       const STATE_RAWTEXT = 4;
+       const STATE_SCRIPT_DATA = 5;
+       const STATE_PLAINTEXT = 6;
+       const STATE_EOF = 7;
+       const STATE_CURRENT = 8;
+
+       // Match indices for the data state regex
+       const MD_END_TAG_OPEN = 1;
+       const MD_TAG_NAME = 2;
+       const MD_COMMENT = 3;
+       const MD_COMMENT_INNER = 4;
+       const MD_COMMENT_END = 5;
+       const MD_DOCTYPE = 6;
+       const MD_DT_NAME_WS = 7;
+       const MD_DT_NAME = 8;
+       const MD_DT_PUBLIC_WS = 9;
+       const MD_DT_PUBLIC_DQ = 10;
+       const MD_DT_PUBLIC_SQ = 11;
+       const MD_DT_PUBSYS_WS = 12;
+       const MD_DT_PUBSYS_DQ = 13;
+       const MD_DT_PUBSYS_SQ = 14;
+       const MD_DT_SYSTEM_WS = 15;
+       const MD_DT_SYSTEM_DQ = 16;
+       const MD_DT_SYSTEM_SQ = 17;
+       const MD_DT_BOGUS = 18;
+       const MD_DT_END = 19;
+       const MD_CDATA = 20;
+       const MD_BOGUS_COMMENT = 21;
+
+       // Match indices for the character reference regex
+       const MC_PREFIX = 1;
+       const MC_DECIMAL = 2;
+       const MC_HEXDEC = 3;
+       const MC_SEMICOLON = 4;
+       const MC_HASH = 5;
+       const MC_NAMED = 6;
+       const MC_SUFFIX = 7;
+       const MC_INVALID = 8;
+
+       // Match indices for the attribute regex
+       const MA_SLASH = 1;
+       const MA_NAME = 2;
+       const MA_DQUOTED = 3;
+       const MA_SQUOTED = 4;
+       const MA_UNQUOTED = 5;
+
+       // Characters
+       const REPLACEMENT_CHAR = "\xef\xbf\xbd";
+       const BYTE_ORDER_MARK = "\xef\xbb\xbf";
+
+       protected $ignoreErrors;
+       protected $ignoreCharRefs;
+       protected $ignoreNulls;
+       protected $skipPreprocess;
+       protected $appropriateEndTag;
+       protected $listener;
+       protected $state;
+       protected $preprocessed;
+       protected $text;
+       protected $pos;
+       protected $length;
+       protected $enableCdataCallback;
+       protected $fragmentNamespace;
+       protected $fragmentName;
+
+       /**
+        * Constructor
+        *
+        * @param TokenHandler $listener The object which receives token events
+        * @param string $text The text to tokenize
+        * @param array $options Associative array of options, including:
+        *   - ignoreErrors: True to improve performance by ignoring errors. The
+        *     token stream should still be the same, except that error() won't be
+        *     called.
+        *   - ignoreCharRefs: True to ignore character references. Character tokens
+        *     will contain the unexpanded character references, and no errors
+        *     related to invalid character references will be raised. Performance
+        *     will be improved. This is not compliant behaviour.
+        *   - ignoreNulls: True to ignore NULL bytes in the input stream, instead
+        *     of raising errors and converting them to U+FFFD as is usually
+        *     required by the spec.
+        *   - skipPreprocess: True to skip the "preprocessing the input stream"
+        *     stage, which normalizes line endings and raises errors on certain
+        *     control characters. Advisable if the input stream is already
+        *     appropriately normalized.
+        */
+       public function __construct( TokenHandler $listener, $text, $options ) {
+               // Input stream and read cursor
+               $this->text = $text;
+               $this->length = strlen( $text );
+               $this->pos = 0;
+               $this->preprocessed = false;
+
+               // Receiver of token events
+               $this->listener = $listener;
+
+               // Boolean option flags: a missing or falsy entry leaves the flag off
+               $flagNames = [ 'ignoreErrors', 'ignoreCharRefs', 'ignoreNulls', 'skipPreprocess' ];
+               foreach ( $flagNames as $flagName ) {
+                       $this->$flagName = !empty( $options[$flagName] );
+               }
+       }
+
+       public function __set( $name, $value ) {
+               // PHP calls this magic method for writes to properties that are
+               // undeclared or inaccessible from the calling scope. Such writes are
+               // not stored here; they are handed to PropGuard::set().
+               // NOTE(review): presumably PropGuard rejects the write to catch
+               // misspelled property names -- confirm against PropGuard.
+               PropGuard::set( $this, $name, $value );
+       }
+
+       public function setEnableCdataCallback( $cb ) {
+               // Store the callable which the data state invokes (with no arguments,
+               // via call_user_func) when it encounters "<![CDATA[". A truthy result
+               // makes the tokenizer treat the text as a CDATA section; otherwise,
+               // or if no callback is set, it is treated as a bogus comment.
+               $this->enableCdataCallback = $cb;
+       }
+
+       /**
+        * Run the tokenizer on the whole input stream. This is the normal entry point.
+        *
+        * @param array $options An associative array of options:
+        *   - state : One of the STATE_* constants, a state in which to start.
+        *   - appropriateEndTag : The "appropriate end tag", which needs to be set
+        *     if entering one of the raw text states.
+        *   - fragmentNamespace : The fragment namespace
+        *   - fragmentName : The fragment tag name
+        */
+       public function execute( $options = [] ) {
+               // Start in the requested state, or in the normal start state
+               $this->state = isset( $options['state'] ) ? $options['state'] : self::STATE_START;
+
+               if ( isset( $options['fragmentNamespace'] ) ) {
+                       // fragmentName is optional: fall back to null rather than reading
+                       // an undefined index when only the namespace was supplied.
+                       $fragmentName = isset( $options['fragmentName'] ) ? $options['fragmentName'] : null;
+                       // This may override the state chosen above, depending on the
+                       // context element.
+                       $this->setFragmentContext( $options['fragmentNamespace'], $fragmentName );
+               } else {
+                       $this->fragmentNamespace = null;
+                       $this->fragmentName = null;
+               }
+
+               $this->appropriateEndTag = isset( $options['appropriateEndTag'] ) ?
+                       $options['appropriateEndTag'] : null;
+
+               // Normalize the input before any events are emitted for the document
+               $this->preprocess();
+               $this->listener->startDocument( $this, $this->fragmentNamespace, $this->fragmentName );
+
+               // Loop until EOF
+               $this->executeInternal( true );
+       }
+
+       /**
+        * Get the preprocessed input text. Source offsets in event parameters are
+        * relative to this string. If skipPreprocess was specified, this will be
+        * the same as the input string.
+        */
+       public function getPreprocessedText() {
+               // Make sure line endings have been normalized before handing the text
+               // out. preprocess() returns immediately when skipPreprocess was
+               // requested, in which case the text is returned as given.
+               $this->preprocess();
+               return $this->text;
+       }
+
+       /**
+        * Change the state of the tokenizer during parsing. This is for use by the
+        * tree builder to switch the tokenizer into one of the raw text states.
+        *
+        * @param integer $state One of the STATE_* constants
+        * @param string $appropriateEndTag The appropriate end tag
+        */
+       public function switchState( $state, $appropriateEndTag ) {
+               // Record the end tag name first, then the state itself; the two
+               // assignments are independent of each other.
+               $this->appropriateEndTag = $appropriateEndTag;
+               $this->state = $state;
+       }
+
+       /**
+        * Initialize the tokenizer for fragment parsing
+        *
+        * @param string $namespace The namespace of the context element
+        * @param string $tagName The name of the context element
+        */
+       public function setFragmentContext( $namespace, $tagName ) {
+               $this->fragmentNamespace = $namespace;
+               $this->fragmentName = $tagName;
+
+               // Only context elements in the HTML namespace (or with no namespace
+               // given) influence the initial tokenizer state.
+               if ( strval( $namespace ) !== '' && $namespace !== HTMLData::NS_HTML ) {
+                       return;
+               }
+
+               switch ( $tagName ) {
+               case 'title':
+               case 'textarea':
+                       $this->state = self::STATE_RCDATA;
+                       break;
+
+               case 'style':
+               case 'xmp':
+               case 'iframe':
+               case 'noembed':
+               case 'noframes':
+                       $this->state = self::STATE_RAWTEXT;
+                       break;
+
+               case 'script':
+                       $this->state = self::STATE_SCRIPT_DATA;
+                       break;
+
+               case 'noscript':
+                       // scriptingFlag is not among the properties declared at the top of
+                       // this class, so a plain read would raise an undefined property
+                       // notice. Test it with empty() so that an absent flag simply
+                       // means "scripting disabled" and the state is left unchanged.
+                       if ( !empty( $this->scriptingFlag ) ) {
+                               $this->state = self::STATE_RAWTEXT;
+                       }
+                       break;
+
+               case 'plaintext':
+                       $this->state = self::STATE_PLAINTEXT;
+                       break;
+               }
+       }
+
+       /**
+        * Notify the tokenizer that the document will be tokenized by repeated step()
+        * calls. This must be called once only, before the first call to step().
+        */
+       public function beginStepping() {
+               // Stepping always starts from the start state; a non-null state is
+               // also what step() checks to verify that this method was called.
+               $this->state = self::STATE_START;
+               $this->preprocess();
+               // No fragment context is passed in stepping mode
+               $this->listener->startDocument( $this, null, null );
+       }
+
+       /**
+        * Tokenize a minimum amount of text from the input stream, and emit the
+        * resulting events.
+        *
+        * @return bool True if the input continues and step() should be called
+        *   again, false on EOF
+        */
+       public function step() {
+               // The state is only null if beginStepping() (or execute()) has not
+               // initialized it yet.
+               $initialized = $this->state !== null;
+               if ( !$initialized ) {
+                       $this->fatal( "beginStepping() must be called before step()" );
+               }
+
+               // Run a single iteration of the state machine
+               $more = $this->executeInternal( false );
+               return $more;
+       }
+
+       /**
+        * Preprocess the input text, if it hasn't been done already.
+        */
+       protected function preprocess() {
+               if ( $this->preprocessed || $this->skipPreprocess ) {
+                       return;
+               }
+
+               // Normalize line endings: CRLF and lone CR both become LF
+               $this->text = strtr( $this->text, [
+                       "\r\n" => "\n",
+                       "\r" => "\n" ] );
+               $this->length = strlen( $this->text );
+
+               // Raise parse errors for any control characters
+               if ( !$this->ignoreErrors ) {
+                       $pos = 0;
+                       $re = '/[' .
+                               '\x{0001}-\x{0008}' .
+                               '\x{000E}-\x{001F}' .
+                               '\x{007F}-\x{009F}' .
+                               '\x{FDD0}-\x{FDEF}' .
+                               '\x{000B}' .
+                               '\x{FFFE}\x{FFFF}' .
+                               '\x{1FFFE}\x{1FFFF}' .
+                               '\x{2FFFE}\x{2FFFF}' .
+                               '\x{3FFFE}\x{3FFFF}' .
+                               '\x{4FFFE}\x{4FFFF}' .
+                               '\x{5FFFE}\x{5FFFF}' .
+                               '\x{6FFFE}\x{6FFFF}' .
+                               '\x{7FFFE}\x{7FFFF}' .
+                               '\x{8FFFE}\x{8FFFF}' .
+                               '\x{9FFFE}\x{9FFFF}' .
+                               '\x{AFFFE}\x{AFFFF}' .
+                               '\x{BFFFE}\x{BFFFF}' .
+                               '\x{CFFFE}\x{CFFFF}' .
+                               '\x{DFFFE}\x{DFFFF}' .
+                               '\x{EFFFE}\x{EFFFF}' .
+                               '\x{FFFFE}\x{FFFFF}' .
+                               '\x{10FFFE}\x{10FFFF}]/u';
+                       while ( $pos < $this->length ) {
+                               $count = preg_match( $re, $this->text, $m, PREG_OFFSET_CAPTURE, $pos );
+                               if ( $count === false ) {
+                                       // With the /u modifier, preg_match() fails on malformed UTF-8
+                                       $this->fatal( "Invalid UTF-8 sequence given to Tokenizer" );
+                               } elseif ( !$count ) {
+                                       break;
+                               }
+                               // Report the error at the byte offset of the match, then
+                               // continue searching just after the offending character.
+                               $pos = $m[0][1];
+                               $this->error( "disallowed control character", $pos );
+                               $pos += strlen( $m[0][0] );
+                       }
+               }
+
+               // Remember that preprocessing is complete. Without this, a call to
+               // getPreprocessedText() followed by execute() or beginStepping() would
+               // scan the text again and report every control character error twice.
+               $this->preprocessed = true;
+       }
+
+       /**
+        * The main state machine, the common implementation of step() and execute().
+        * @param bool $loop Set to true to loop until finished, false to step once.
+        * @return bool True if the input continues, false on EOF
+        */
+       protected function executeInternal( $loop ) {
+               // Becomes true once the EOF state has been processed
+               $finished = false;
+
+               do {
+                       switch ( $this->state ) {
+                       case self::STATE_DATA:
+                               $next = $this->dataState( $loop );
+                               $this->state = $next;
+                               break;
+
+                       case self::STATE_RCDATA:
+                               // RCDATA and RAWTEXT share an implementation, selected by flag
+                               $next = $this->textElementState( false );
+                               $this->state = $next;
+                               break;
+
+                       case self::STATE_RAWTEXT:
+                               $next = $this->textElementState( true );
+                               $this->state = $next;
+                               break;
+
+                       case self::STATE_SCRIPT_DATA:
+                               $next = $this->scriptDataState();
+                               $this->state = $next;
+                               break;
+
+                       case self::STATE_PLAINTEXT:
+                               $next = $this->plaintextState();
+                               $this->state = $next;
+                               break;
+
+                       case self::STATE_START:
+                               // The start state consumes nothing and simply enters the data state
+                               $this->state = self::STATE_DATA;
+                               break;
+
+                       case self::STATE_EOF:
+                               $this->listener->endDocument( $this->length );
+                               $finished = true;
+                               break;
+
+                       default:
+                               $this->fatal( 'invalid state' );
+                       }
+                       // Stop after one iteration when stepping, or as soon as EOF was handled
+               } while ( $loop && !$finished );
+
+               return !$finished;
+       }
+
+       /**
+        * Consume input text starting from the "data state".
+        *
+        * @param bool $loop True to loop while still in the data state, false to
+        *   process a single less-than sign.
+        * @return integer The next state index
+        */
+       protected function dataState( $loop ) {
+               $re = "~ <
+                       (?:
+                               ( /? )                        # 1. End tag open
+
+                               (                             # 2. Tag name
+                                       # Try to match the ASCII letter required for the start of a start
+                                       # or end tag. If this fails, a slash matched above can be
+                                       # backtracked and then fed into the bogus comment alternative below.
+                                       [a-zA-Z]
+
+                                       # Then capture the rest of the tag name
+                                       [^\t\n\f />]*
+                               ) |
+
+                               # Comment
+                               !--
+                               (                             # 3. Comment match detector
+                                       > | -> | # Invalid short close
+                                       (                         # 4. Comment contents
+                                               (?:
+                                                       (?! --> )
+                                                       (?! --!> )
+                                                       (?! --! \\z )
+                                                       (?! -- \\z )
+                                                       (?! - \\z )
+                                                       .
+                                               )*+
+                                       )
+                                       (                         # 5. Comment close
+                                               --> |   # Normal close
+                                               --!> |  # Comment end bang
+                                               --! |   # EOF in comment end bang state
+                                               -- |    # EOF in comment end state
+                                               -  |    # EOF in comment end dash state
+                                                       # EOF in comment state
+                                       )
+                               ) |
+                               ( (?i)                        # 6. Doctype
+                                       ! DOCTYPE
+
+                                       # There must be at least one whitespace character to suppress
+                                       # a parse error, but if there isn't one, this is still a
+                                       # DOCTYPE. There is no way for the DOCTYPE string to end up
+                                       # as a character node, the DOCTYPE subexpression must always
+                                       # wholly match if we matched up to this point.
+
+                                       ( [\t\n\f ]*+ )           # 7. Required whitespace
+                                       ( [^\t\n\f >]*+ )         # 8. DOCTYPE name
+                                       [\t\n\f ]*+
+                                       (?:
+                                               # After DOCTYPE name state
+                                               PUBLIC
+                                               ( [\t\n\f ]* )            # 9. Required whitespace
+                                               (?:
+                                                       \" ( [^\">]* ) \"? |  # 10. Double-quoted identifier
+                                                       ' ( [^'>]* ) '? |     # 11. Single-quoted identifier
+                                                       # Non-match: bogus
+                                               )
+                                               (?:
+                                                       # After DOCTYPE public identifier state
+                                                       # Assert quoted identifier before here
+                                                       (?<= \" | ' )
+                                                       ( [\t\n\f ]* )            # 12. Required whitespace
+                                                       (?:
+                                                               \" ( [^\">]* ) \"? |  # 13. Double-quoted identifier
+                                                               ' ( [^'>]* ) '? |     # 14. Single-quoted identifier
+                                                               # Non-match: no system ID
+                                                       )
+                                               )?
+                                               |
+                                               SYSTEM
+                                               ( [\t\n\f ]* )            # 15. Required whitespace
+                                               (?:
+                                                       \" ( [^\">]* ) \"? |  # 16. Double-quoted identifier
+                                                       ' ( [^'>]* ) '? |     # 17. Single-quoted identifier
+                                                       # Non-match: bogus
+                                               )
+                                               |  # No keyword is OK
+                                       )
+                                       [\t\n\f ]*
+                                       ( [^>]*+ )                # 18. Bogus DOCTYPE
+                                       ( >? )                    # 19. End of DOCTYPE
+                               ) |
+                               ( ! \[CDATA\[ ) |             # 20. CDATA section
+                               ( [!?/] [^>]*+ ) >?           # 21. Bogus comment
+
+                               # Anything else: parse error and emit literal less-than sign.
+                               # We will let the match fail at this position and later check
+                               # for less-than signs in the resulting text node.
+                       )
+                       ~xs";
+
+               $nextState = self::STATE_DATA;
+               do {
+                       $count = preg_match( $re, $this->text, $m, PREG_OFFSET_CAPTURE, $this->pos );
+                       if ( $count === false ) {
+                               $this->throwPregError();
+                       } elseif ( !$count ) {
+                               // Text runs to end
+                               $this->emitDataRange( $this->pos, $this->length - $this->pos );
+                               $this->pos = $this->length;
+                               $nextState = self::STATE_EOF;
+                               break;
+                       }
+
+                       $startPos = $m[0][1];
+                       $tagName = isset( $m[self::MD_TAG_NAME] ) ? $m[self::MD_TAG_NAME][0] : '';
+
+                       // Everything between the previous position and the "<" is text
+                       $this->emitDataRange( $this->pos, $startPos - $this->pos );
+                       $this->pos = $startPos;
+                       $nextPos = $m[0][1] + strlen( $m[0][0] );
+
+                       // With PREG_OFFSET_CAPTURE, a group which did not participate in
+                       // the match is either absent (trailing groups) or has offset -1,
+                       // hence the isset() plus offset tests below.
+                       if ( strlen( $tagName ) ) {
+                               // Tag
+                               $isEndTag = (bool)strlen( $m[self::MD_END_TAG_OPEN][0] );
+                               if ( !$this->ignoreNulls ) {
+                                       $tagName = $this->handleNulls( $tagName, $m[self::MD_TAG_NAME][1] );
+                               }
+                               $tagName = strtolower( $tagName );
+                               $this->pos = $nextPos;
+                               $nextState = $this->handleAttribsAndClose( self::STATE_DATA,
+                                       $tagName, $isEndTag, $startPos );
+                               $nextPos = $this->pos;
+                               if ( $nextState === self::STATE_EOF ) {
+                                       break;
+                               }
+
+                               // Respect any state switch imposed by the parser
+                               $nextState = $this->state;
+
+                       } elseif ( isset( $m[self::MD_COMMENT] ) && $m[self::MD_COMMENT][1] >= 0 ) {
+                               // Comment
+                               $this->interpretCommentMatches( $m );
+                       } elseif ( isset( $m[self::MD_DOCTYPE] ) && $m[self::MD_DOCTYPE][1] >= 0 ) {
+                               // DOCTYPE
+                               $this->interpretDoctypeMatches( $m );
+                       } elseif ( isset( $m[self::MD_CDATA] ) && $m[self::MD_CDATA][1] >= 0 ) {
+                               // CDATA. The callback is consulted exactly once per "<![CDATA["
+                               // to decide whether CDATA sections are recognized here.
+                               if ( $this->enableCdataCallback
+                                       && call_user_func( $this->enableCdataCallback )
+                               ) {
+                                       // Skip "<" plus the matched "![CDATA["
+                                       $this->pos += strlen( $m[self::MD_CDATA][0] ) + 1;
+                                       $endPos = strpos( $this->text, ']]>', $this->pos );
+                                       if ( $endPos === false ) {
+                                               // Unterminated section: it extends to the end of input
+                                               $this->emitCdataRange( $this->pos, $this->length - $this->pos,
+                                                       $startPos, $this->length - $startPos );
+                                               $this->pos = $this->length;
+                                               $nextState = self::STATE_EOF;
+                                               break;
+                                       } else {
+                                               $outerEndPos = $endPos + strlen( ']]>' );
+                                               $this->emitCdataRange( $this->pos, $endPos - $this->pos,
+                                                       $startPos, $outerEndPos - $startPos );
+                                               $nextPos = $outerEndPos;
+                                       }
+                               } else {
+                                       // Bogus comment
+                                       $this->error( "unexpected CDATA interpreted as bogus comment" );
+                                       $endPos = strpos( $this->text, '>', $this->pos );
+                                       // The comment contents start after "<!"
+                                       $bogusPos = $this->pos + 2;
+                                       if ( $endPos === false ) {
+                                               $nextPos = $this->length;
+                                               $contents = substr( $this->text, $bogusPos );
+                                       } else {
+                                               $nextPos = $endPos + 1;
+                                               $contents = substr( $this->text, $bogusPos, $endPos - $bogusPos );
+                                       }
+                                       $contents = $this->handleNulls( $contents, $bogusPos );
+                                       // The source length is measured up to $nextPos, not $endPos:
+                                       // $endPos is false when there is no ">", and otherwise
+                                       // excludes the ">" which is part of the comment source, as
+                                       // in the generic bogus comment branch below.
+                                       $this->listener->comment( $contents, $this->pos, $nextPos - $this->pos );
+                               }
+                       } elseif ( isset( $m[self::MD_BOGUS_COMMENT] ) && $m[self::MD_BOGUS_COMMENT][1] >= 0 ) {
+                               // Bogus comment
+                               $contents = $m[self::MD_BOGUS_COMMENT][0];
+                               $bogusPos = $m[self::MD_BOGUS_COMMENT][1];
+                               if ( $m[0][0] === '</>' ) {
+                                       $this->error( "empty end tag" );
+                                       // No token emitted
+                               } elseif ( $m[0][0] === '</' ) {
+                                       $this->error( 'EOF in end tag' );
+                                       $this->listener->characters( '</', 0, 2, $m[0][1], 2 );
+                               } else {
+                                       $this->error( "unexpected <{$contents[0]} interpreted as bogus comment" );
+                                       if ( $contents[0] !== '?' ) {
+                                               // For starting types other than <?, the initial character is
+                                               // not in the tag contents
+                                               $contents = substr( $contents, 1 );
+                                               $bogusPos++;
+                                       }
+
+                                       $contents = $this->handleNulls( $contents, $bogusPos );
+                                       $this->listener->comment( $contents, $startPos, $nextPos - $startPos );
+                               }
+                       } else {
+                               $this->fatal( 'unexpected data state match' );
+                       }
+                       $this->pos = $nextPos;
+               } while ( $loop && $nextState === self::STATE_DATA );
+
+               return $nextState;
+       }
+
+       /**
+        * Interpret the data state match results for a detected comment, and emit
+        * events as appropriate.
+        *
+        * @param array $m The match array
+        */
+       protected function interpretCommentMatches( $m ) {
+               $outerStart = $m[0][1];
+               $outerLength = strlen( $m[0][0] );
+               $innerStart = $outerStart + strlen( '<!--' );
+               $innerLength = isset( $m[self::MD_COMMENT_INNER] ) ? strlen( $m[self::MD_COMMENT_INNER][0] ) : 0;
+
+               // The comment body exactly as it appears in the source. Byte offsets
+               // into this string map directly onto source positions, which is not
+               // necessarily true of $contents once NULs have been handled.
+               $rawContents = $innerLength ? $m[self::MD_COMMENT_INNER][0] : '';
+               $contents = $rawContents;
+
+               if ( $m[0][0] === '<!-->' || $m[0][0] === '<!--->' ) {
+                       // These are special cases in the comment start state
+                       $this->error( 'not enough dashes in empty comment', $outerStart );
+                       $this->listener->comment( '', $outerStart, $outerLength );
+                       return;
+               }
+
+               if ( !$this->ignoreNulls ) {
+                       $contents = $this->handleNulls( $contents, $innerStart );
+               }
+               $close = $m[self::MD_COMMENT_END][0];
+               $closePos = $m[self::MD_COMMENT_END][1];
+
+               if ( !$this->ignoreErrors ) {
+                       if ( $close === '--!>' ) {
+                               $this->error( 'invalid comment end bang', $closePos );
+                       } elseif ( $close === '-' || $close === '--' || $close === '--!' ) {
+                               $this->error( 'EOF part way through comment close', $closePos );
+                       } elseif ( $close === '' ) {
+                               $this->error( 'EOF in comment', $closePos );
+                       }
+
+                       // Report each dash which directly precedes the comment close. The
+                       // scan runs over the raw text so that $innerLength and the
+                       // reported positions stay consistent with the source.
+                       $dashSearchLength = $innerLength;
+                       while ( $dashSearchLength > 0 && $rawContents[$dashSearchLength - 1] === '-' ) {
+                               $this->error( 'invalid extra dash at comment end',
+                                       $innerStart + $dashSearchLength - 1 );
+                               $dashSearchLength--;
+                       }
+
+                       // Report "--" sequences in the remainder of the body. A match that
+                       // extends into the trailing dashes is ignored, since those dashes
+                       // were already reported by the loop above.
+                       $offset = 0;
+                       while ( $offset < $dashSearchLength ) {
+                               $offset = strpos( $rawContents, '--', $offset );
+                               if ( $offset === false || $offset + 2 > $dashSearchLength ) {
+                                       break;
+                               }
+                               $this->error( 'bare "--" found in comment', $innerStart + $offset );
+                               $offset += 2;
+                       }
+               }
+
+               $this->listener->comment( $contents, $outerStart, $outerLength );
+       }
+
+       /**
+        * Interpret the data state match results for a detected DOCTYPE token,
+        * and emit events as appropriate.
+        *
+        * @param array $m The match array
+        */
+       protected function interpretDoctypeMatches( $m ) {
+               $igerr = $this->ignoreErrors;
+               $name = null;
+               $public = null;
+               $system = null;
+               $quirks = false;
+
+               // Missing ">" can only be caused by EOF
+               $eof = !strlen( $m[self::MD_DT_END][0] );
+
+               if ( strlen( $m[self::MD_DT_BOGUS][0] ) ) {
+                       // Bogus DOCTYPE state
+                       if ( !$igerr ) {
+                               $this->error( 'invalid DOCTYPE contents', $m[self::MD_DT_BOGUS][1] );
+                       }
+                       // Set quirks mode unless there was a quoted system identifier before
+                       // the junk. A system identifier can follow either the SYSTEM keyword
+                       // or a public identifier, so both pairs of capture groups are checked.
+                       $haveSystemId = false;
+                       $systemIdGroups = [
+                               self::MD_DT_SYSTEM_DQ, self::MD_DT_SYSTEM_SQ,
+                               self::MD_DT_PUBSYS_DQ, self::MD_DT_PUBSYS_SQ ];
+                       foreach ( $systemIdGroups as $index ) {
+                               if ( isset( $m[$index] ) && $m[$index][1] >= 0 ) {
+                                       $haveSystemId = true;
+                                       break;
+                               }
+                       }
+                       if ( !$haveSystemId ) {
+                               $quirks = true;
+                       }
+                       // EOF in the bogus state does not set quirks mode (but it is a parse error)
+                       if ( $eof && !$igerr ) {
+                               $this->error( 'unterminated DOCTYPE' );
+                       }
+               } elseif ( $eof ) {
+                       if ( !$igerr ) {
+                               $this->error( 'unterminated DOCTYPE' );
+                       }
+                       $quirks = true;
+               }
+
+               if ( !$igerr && !$eof && !strlen( $m[self::MD_DT_NAME_WS][0] ) ) {
+                       $this->error( 'missing whitespace', $m[self::MD_DT_NAME_WS][1] );
+               }
+
+               if ( strlen( $m[self::MD_DT_NAME][0] ) ) {
+                       // DOCTYPE name
+                       $name = $this->handleNulls( strtolower( $m[self::MD_DT_NAME][0] ), $m[self::MD_DT_NAME][1] );
+               } else {
+                       if ( !$eof && !$igerr ) {
+                               $this->error( 'missing DOCTYPE name',
+                                       $m[self::MD_DOCTYPE][1] + strlen( '!DOCTYPE' ) );
+                       }
+                       $quirks = true;
+               }
+
+               if ( isset( $m[self::MD_DT_PUBLIC_WS] ) && $m[self::MD_DT_PUBLIC_WS][1] >= 0 ) {
+                       // PUBLIC keyword found
+                       $public = $this->interpretDoctypeQuoted( $m,
+                               self::MD_DT_PUBLIC_DQ, self::MD_DT_PUBLIC_SQ, $quirks );
+                       if ( $public === null ) {
+                               $quirks = true;
+                               if ( !$eof && !$igerr ) {
+                                       $this->error( 'missing public identifier', $m[self::MD_DT_PUBLIC_WS][1] );
+                               }
+                       } elseif ( !$igerr && !$eof && !strlen( $m[self::MD_DT_PUBLIC_WS][0] ) ) {
+                               $this->error( 'missing whitespace', $m[self::MD_DT_PUBLIC_WS][1] );
+                       }
+
+                       // Check for a system ID after the public ID
+                       $haveDq = isset( $m[self::MD_DT_PUBSYS_DQ] ) && $m[self::MD_DT_PUBSYS_DQ][1] >= 0;
+                       $haveSq = isset( $m[self::MD_DT_PUBSYS_SQ] ) && $m[self::MD_DT_PUBSYS_SQ][1] >= 0;
+                       if ( $haveDq || $haveSq ) {
+                               if ( !$igerr && !strlen( $m[self::MD_DT_PUBSYS_WS][0] ) ) {
+                                       $this->error( 'missing whitespace', $m[self::MD_DT_PUBSYS_WS][1] );
+                               }
+                               $system = $this->interpretDoctypeQuoted( $m,
+                                       self::MD_DT_PUBSYS_DQ, self::MD_DT_PUBSYS_SQ, $quirks );
+                       }
+               } elseif ( isset( $m[self::MD_DT_SYSTEM_WS] ) && $m[self::MD_DT_SYSTEM_WS][1] >= 0 ) {
+                       // SYSTEM keyword found
+                       $system = $this->interpretDoctypeQuoted( $m,
+                               self::MD_DT_SYSTEM_DQ, self::MD_DT_SYSTEM_SQ, $quirks );
+                       if ( $system === null ) {
+                               $quirks = true;
+                               // As for the public identifier: at EOF the generic
+                               // "unterminated DOCTYPE" error has already been reported.
+                               if ( !$eof && !$igerr ) {
+                                       $this->error( 'missing system identifier', $m[self::MD_DT_SYSTEM_WS][1] );
+                               }
+                       } elseif ( !$igerr && !strlen( $m[self::MD_DT_SYSTEM_WS][0] ) ) {
+                               $this->error( 'missing whitespace', $m[self::MD_DT_SYSTEM_WS][1] );
+                       }
+
+               }
+               // Emit the token, covering the whole matched source range
+               $this->listener->doctype( $name, $public, $system, $quirks, $m[0][1], strlen( $m[0][0] ) );
+       }
+
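+       /**
+        * DOCTYPE helper which interprets a quoted string (or lack thereof)
+        *
+        * @param array $m The match array
+        * @param integer $dq The match index of the double-quoted form
+        * @param integer $sq The match index of the single-quoted form
+        * @param bool &$quirks Set to true if the value runs to the end of the
+        *   input or is directly followed by ">"
+        * @return string|null The quoted value, with nulls replaced, or null if
+        *   neither quoted form was matched.
+        */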
+       protected function interpretDoctypeQuoted( $m, $dq, $sq, &$quirks ) {
+               // Use whichever quoting style participated in the match, preferring
+               // the double-quoted form.
+               $captured = null;
+               foreach ( [ $dq, $sq ] as $index ) {
+                       if ( isset( $m[$index] ) && $m[$index][1] >= 0 ) {
+                               $captured = $m[$index];
+                               break;
+                       }
+               }
+               if ( $captured === null ) {
+                       return null;
+               }
+
+               list( $identifier, $begin ) = $captured;
+               $after = $begin + strlen( $identifier );
+
+               // Hitting EOF is a parse error too, but a generic EOF error was
+               // already emitted, so only the ">" case is reported here.
+               $hitEof = $after >= $this->length;
+               $hitBracket = !$hitEof && $this->text[$after] === '>';
+               if ( $hitBracket ) {
+                       $this->error( 'DOCTYPE identifier terminated by ">"', $after );
+               }
+               if ( $hitEof || $hitBracket ) {
+                       $quirks = true;
+               }
+
+               return $this->handleNulls( $identifier, $begin );
+       }
+
+       /**
+        * Generic helper for all those points in the spec where U+0000 needs to be
+        * replaced with U+FFFD with a parse error issued.
+        *
+        * @param string $text The text to be converted
+        * @param integer $sourcePos The input byte offset from which $text was
+        *   extracted, for error position reporting.
+        * @return string The converted text
+        */
+       protected function handleNulls( $text, $sourcePos ) {
+               if ( $this->ignoreNulls ) {
+                       return $text;
+               }
+               if ( !$this->ignoreErrors ) {
+                       // Report every null byte individually, at its input position
+                       $textLength = strlen( $text );
+                       $searchFrom = 0;
+                       while ( $searchFrom < $textLength ) {
+                               $found = strpos( $text, "\0", $searchFrom );
+                               if ( $found === false ) {
+                                       break;
+                               }
+                               $this->error( "replaced null character", $sourcePos + $found );
+                               $searchFrom = $found + 1;
+                       }
+               }
+               return str_replace( "\0", self::REPLACEMENT_CHAR, $text );
+       }
+
+       /**
+        * Generic helper for points in the spec which say that an error should
+        * be issued when certain ASCII characters are seen, with no other action
+        * taken.
+        *
+        * @param string $mask Mask for strcspn
+        * @param string $text The input text
+        * @param integer $offset The start of the range within $text to search
+        * @param integer $length The length of the range within $text to search
+        * @param integer $sourcePos The offset within the input text corresponding
+        *   to $text, for error position reporting.
+        */
+       protected function handleAsciiErrors( $mask, $text, $offset, $length, $sourcePos ) {
+               // $end is the first offset past the range being searched
+               $end = $offset + $length;
+               while ( $offset < $end ) {
+                       // Skip the run of characters which are not in the mask
+                       $offset += strcspn( $text, $mask, $offset, $end - $offset );
+                       if ( $offset >= $end ) {
+                               break;
+                       }
+                       $char = $text[$offset];
+                       $codepoint = ord( $char );
+                       $isPrintable = $codepoint >= 0x20 && $codepoint < 0x7f;
+                       if ( $isPrintable ) {
+                               $this->error( "unexpected \"$char\"", $offset + $sourcePos );
+                       } else {
+                               // Non-printable characters are reported by code point
+                               $this->error( sprintf( 'unexpected U+00%02X', $codepoint ), $offset + $sourcePos );
+                       }
+                       $offset++;
+               }
+       }
+
+       /**
+        * Expand character references in some text, and emit errors as appropriate.
+        * @param string $text The text to expand
+        * @param integer $sourcePos The input position of $text
+        * @param bool $inAttr True if the text is within an attribute value
+        * @param string $additionalAllowedChar An unused string which the spec
+        *   inexplicably spends a lot of space telling you how to derive. It
+        *   suppresses errors in a place where no errors are emitted anyway.
+        * @return string The expanded text
+        */
+       protected function handleCharRefs( $text, $sourcePos, $inAttr = false,
+               $additionalAllowedChar = ''
+       ) {
+               if ( $this->ignoreCharRefs ) {
+                       return $text;
+               }
+               // Note: there is deliberately no html_entity_decode() pre-pass here.
+               // Decoding the text once and then running the regex below over the
+               // result expands references twice, e.g. "&amp;lt;" would become "&lt;"
+               // and then "<".
+
+               static $re;
+               if ( $re === null ) {
+                       $knownNamed = HTMLData::$namedEntityRegex;
+                       $re = "~
+                               ( .*? )                      # 1. prefix
+                               &
+                               (?:
+                                       \# (?:
+                                               0*(\d+)           |  # 2. decimal
+                                               [xX]0*([0-9A-Fa-f]+) # 3. hexadecimal
+                                       )
+                                       ( ; ) ?                  # 4. semicolon
+                                       |
+                                       ( \# )                   # 5. bare hash
+                                       |
+                                       ($knownNamed)            # 6. known named
+                                       (?:
+                                               (?<! ; )             # Assert no semicolon prior
+                                               ( [=a-zA-Z0-9] )     # 7. attribute suffix
+                                       )?
+                                       |
+                                       ( [a-zA-Z0-9]+ ; )       # 8. invalid named
+                               )
+                               # S = study, for efficient knownNamed
+                               # A = anchor, to avoid unnecessary movement of the whole pattern on failure
+                               ~xAsS";
+               }
+               $out = '';
+               $pos = 0;
+               $length = strlen( $text );
+               $matches = [];
+               $count = preg_match_all( $re, $text, $matches, PREG_SET_ORDER );
+               if ( $count === false ) {
+                       $this->throwPregError();
+               }
+
+               // Because the pattern is anchored, the matches are contiguous chunks
+               // starting at offset zero, so $pos tracks the offset within $text.
+               foreach ( $matches as $m ) {
+                       $out .= $m[self::MC_PREFIX];
+                       // Errors are reported at the position of the "&"
+                       $errorPos = $sourcePos + $pos + strlen( $m[self::MC_PREFIX] );
+                       $pos += strlen( $m[0] );
+
+                       if ( isset( $m[self::MC_HASH] ) && strlen( $m[self::MC_HASH] ) ) {
+                               // Bare &#
+                               $this->error( 'Expected digits after &#', $errorPos );
+                               $out .= '&#';
+                               continue;
+                       }
+
+                       $knownNamed = isset( $m[self::MC_NAMED] ) ? $m[self::MC_NAMED] : '';
+                       $attributeSuffix = isset( $m[self::MC_SUFFIX] ) ? $m[self::MC_SUFFIX] : '';
+
+                       $haveSemicolon =
+                               ( isset( $m[self::MC_SEMICOLON] ) && strlen( $m[self::MC_SEMICOLON] ) )
+                               || ( strlen( $knownNamed ) && $knownNamed[ strlen( $knownNamed ) - 1 ] === ';' )
+                               || ( isset( $m[self::MC_INVALID] ) && strlen( $m[self::MC_INVALID] ) );
+
+                       if ( $inAttr && !$haveSemicolon && strlen( $attributeSuffix ) ) {
+                               // In an attribute, an unterminated named reference followed by
+                               // "=" or an alphanumeric character is left as literal text.
+                               if ( !$this->ignoreErrors && $attributeSuffix === '=' ) {
+                                       $this->error( 'invalid equals sign after named character reference',
+                                               $errorPos );
+                               }
+                               $out .= '&' . $knownNamed . $attributeSuffix;
+                               continue;
+                       }
+
+                       if ( !$this->ignoreErrors && !$haveSemicolon ) {
+                               $this->error( 'character reference missing semicolon', $errorPos );
+                       }
+
+                       if ( isset( $m[self::MC_DECIMAL] ) && strlen( $m[self::MC_DECIMAL] ) ) {
+                               // Decimal. Leading zeros are not captured, so more than 7
+                               // digits is always beyond U+10FFFF.
+                               if ( strlen( $m[self::MC_DECIMAL] ) > 7 ) {
+                                       $this->error( 'invalid numeric reference', $errorPos );
+                                       $out .= self::REPLACEMENT_CHAR;
+                                       continue;
+                               }
+                               $codepoint = intval( $m[self::MC_DECIMAL] );
+                       } elseif ( isset( $m[self::MC_HEXDEC] ) && strlen( $m[self::MC_HEXDEC] ) ) {
+                               // Hexadecimal. Similarly, more than 6 digits is out of range.
+                               if ( strlen( $m[self::MC_HEXDEC] ) > 6 ) {
+                                       $this->error( 'invalid numeric reference', $errorPos );
+                                       $out .= self::REPLACEMENT_CHAR;
+                                       continue;
+                               }
+                               $codepoint = intval( $m[self::MC_HEXDEC], 16 );
+                       } elseif ( $knownNamed !== '' ) {
+                               $out .= HTMLData::$namedEntityTranslations[$knownNamed] . $attributeSuffix;
+                               continue;
+                       } elseif ( isset( $m[self::MC_INVALID] ) && strlen( $m[self::MC_INVALID] ) ) {
+                               if ( !$this->ignoreErrors ) {
+                                       $this->error( 'invalid named reference', $errorPos );
+                               }
+                               $out .= '&' . $m[self::MC_INVALID];
+                               continue;
+                       } else {
+                               $this->fatal( 'unable to identify char ref submatch' );
+                       }
+
+                       // Interpret $codepoint
+                       if ( $codepoint === 0
+                               || ( $codepoint >= 0xD800 && $codepoint <= 0xDFFF )
+                               || $codepoint > 0x10FFFF
+                       ) {
+                               if ( !$this->ignoreErrors ) {
+                                       $this->error( 'invalid numeric reference', $errorPos );
+                               }
+                               $out .= self::REPLACEMENT_CHAR;
+                       } elseif ( isset( HTMLData::$legacyNumericEntities[$codepoint] ) ) {
+                               if ( !$this->ignoreErrors ) {
+                                       $this->error( 'invalid reference to non-ASCII control character', $errorPos );
+                               }
+                               $out .= HTMLData::$legacyNumericEntities[$codepoint];
+                       } else {
+                               if ( !$this->ignoreErrors ) {
+                                       // U+000B, plus the last two code points of every plane
+                                       // (U+FFFE, U+FFFF, U+1FFFE, ... U+10FFFF). The code point
+                                       // is already known to be at most 0x10FFFF here.
+                                       $isDisallowed = $codepoint === 0x0b
+                                               || ( $codepoint & 0xfffe ) === 0xfffe;
+                                       if (
+                                               ( $codepoint >= 1 && $codepoint <= 8 ) ||
+                                               ( $codepoint >= 0x0d && $codepoint <= 0x1f ) ||
+                                               ( $codepoint >= 0x7f && $codepoint <= 0x9f ) ||
+                                               ( $codepoint >= 0xfdd0 && $codepoint <= 0xfdef ) ||
+                                               $isDisallowed
+                                       ) {
+                                               $this->error( 'invalid numeric reference to control character',
+                                                       $errorPos );
+                                       }
+                               }
+
+                               $out .= \UtfNormal\Utils::codepointToUtf8( $codepoint );
+                       }
+               }
+               // Append whatever follows the last character reference
+               if ( $pos < $length ) {
+                       $out .= substr( $text, $pos );
+               }
+               return $out;
+       }
+
+       /**
+        * Emit a range of the input text as a character token, and emit related
+        * errors, with validity rules as per the data state.
+        *
+        * @param integer $pos Offset within the input text
+        * @param integer $length The length of the range
+        */
+       protected function emitDataRange( $pos, $length ) {
+               if ( $length === 0 ) {
+                       return;
+               }
+
+               // With nothing to expand and nothing to report, the range can be
+               // passed to the listener as a window into the input text.
+               $passThrough = $this->ignoreCharRefs && $this->ignoreNulls && $this->ignoreErrors;
+               if ( $passThrough ) {
+                       $this->listener->characters( $this->text, $pos, $length, $pos, $length );
+                       return;
+               }
+
+               if ( !$this->ignoreErrors ) {
+                       // A bare "<" or a null in a data state text node is a parse error.
+                       // Nulls are only flagged here, they are not replaced.
+                       $this->handleAsciiErrors( "<\0", $this->text, $pos, $length, 0 );
+               }
+
+               $expanded = $this->handleCharRefs( substr( $this->text, $pos, $length ), $pos );
+               $this->listener->characters( $expanded, 0, strlen( $expanded ), $pos, $length );
+       }
+
+       /**
+        * Emit a range of characters from the input text, with validity rules as
+        * per the CDATA section state.
+        *
+        * @param integer $innerPos The position after the <![CDATA[
+        * @param integer $innerLength The length of the string not including the terminating ]]>
+        * @param integer $outerPos The position of the start of the <![CDATA[
+        * @param integer $outerLength The length of the whole input region being emitted
+        */
+       protected function emitCdataRange( $innerPos, $innerLength, $outerPos, $outerLength ) {
+               // The section contents are handed to the listener verbatim, as a
+               // window into the input text.
+               $source = $this->text;
+               $this->listener->characters( $source, $innerPos, $innerLength, $outerPos, $outerLength );
+       }
+
+       /**
+        * Emit a range of characters from the input text, either from RCDATA,
+        * RAWTEXT, script data or PLAINTEXT. The only difference between these
+        * states is whether or not character references are expanded, so we take
+        * that as a parameter.
+        *
+        * @param bool $ignoreCharRefs
+        * @param integer $pos The input position
+        * @param integer $length The length of the range to be emitted
+        */
+       protected function emitRawTextRange( $ignoreCharRefs, $pos, $length ) {
+               if ( $length === 0 ) {
+                       return;
+               }
+
+               // Character references are skipped if either the caller or the
+               // configuration asks for that.
+               $skipCharRefs = $ignoreCharRefs || $this->ignoreCharRefs;
+
+               if ( $skipCharRefs && $this->ignoreNulls ) {
+                       // No conversion needed, emit a window into the input text
+                       $this->listener->characters( $this->text, $pos, $length, $pos, $length );
+                       return;
+               }
+
+               $chunk = substr( $this->text, $pos, $length );
+               if ( !$skipCharRefs ) {
+                       $chunk = $this->handleCharRefs( $chunk, $pos );
+               }
+               $chunk = $this->handleNulls( $chunk, $pos );
+               $this->listener->characters( $chunk, 0, strlen( $chunk ), $pos, $length );
+       }
+
+       /**
+        * The entry point for the RCDATA and RAWTEXT states.
+        * @param bool $ignoreCharRefs True to ignore character references regardless
+        *   of configuration, false to respect the configuration.
+        * @return integer The next state index
+        */
+       protected function textElementState( $ignoreCharRefs ) {
+               if ( $this->appropriateEndTag === null ) {
+                       // No end tag can terminate the text, so it runs to the end of the input
+                       $this->emitRawTextRange( $ignoreCharRefs, $this->pos, $this->length - $this->pos );
+                       $this->pos = $this->length;
+                       return self::STATE_EOF;
+               }
+
+               // The tag name is literal text, not a pattern. Quote it so that regex
+               // metacharacters (including "#", which starts a comment in extended
+               // mode) cannot change the meaning of the expression.
+               $quotedEndTag = preg_quote( $this->appropriateEndTag, '~' );
+               $re = "~</
+                       {$quotedEndTag}
+                       # Assert that the end tag name state is exited appropriately,
+                       # since the anything else case leads to the tag being treated as
+                       # a literal
+                       (?=[\t\n\f />])
+                       ~ix";
+
+               do {
+                       $count = preg_match( $re, $this->text, $m, PREG_OFFSET_CAPTURE, $this->pos );
+
+                       if ( $count === false ) {
+                               $this->throwPregError();
+                       } elseif ( !$count ) {
+                               // Text runs to end
+                               $this->emitRawTextRange( $ignoreCharRefs, $this->pos, $this->length - $this->pos );
+                               $this->pos = $this->length;
+                               return self::STATE_EOF;
+                       }
+                       $startPos = $m[0][1];
+
+                       // Emit text before tag
+                       $this->emitRawTextRange( $ignoreCharRefs, $this->pos, $startPos - $this->pos );
+
+                       $matchLength = strlen( $m[0][0] );
+                       $this->pos = $startPos + $matchLength;
+                       // STATE_RCDATA is passed for both RCDATA and RAWTEXT. Getting it
+                       // back as the next state means the search has to continue.
+                       $nextState = $this->handleAttribsAndClose( self::STATE_RCDATA,
+                               $this->appropriateEndTag, true, $startPos );
+               } while ( $nextState === self::STATE_RCDATA );
+               return $nextState;
+       }
+
+       /**
+        * Advance $this->pos, consuming all tag attributes found at the current
+        * position. The new position will be at the end of the tag or at the end
+        * of the input string.
+        *
+        * To improve performance of consumers which don't need to read the
+        * attribute array, interpretation of the PCRE match results is deferred.
+        *
+        * - @todo: Make deferral configurable.
+        * - @todo: Measure performance improvement, assess whether the LazyAttributes
+        *   feature is warranted.
+        *
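+        * @return LazyAttributes|PlainAttributes The attributes. Interpretation of
+        *   the match results is deferred when any attributes were found.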
+        */
+       protected function consumeAttribs() {
+               // One match of this pattern corresponds to one attribute (or to one
+               // bare slash).
+               $re = '~
+                       [\t\n\f ]*+  # Ignored whitespace before attribute name
+                       (?! /> )     # Do not consume self-closing end of tag
+                       (?! > )      # Do not consume normal closing bracket
+
+                       (?:
+                               # Before attribute name state
+                               # A bare slash at this point, not part of a self-closing end tag, is
+                               # consumed and ignored (with a parse error), returning to the before
+                               # attribute name state.
+                               ( / ) |    # 1. Bare slash
+
+                               # Attribute name state
+                               # Note that the first character can be an equals sign, this is a parse error
+                               # but still generates an attribute called "=". Thus the only way the match
+                               # could fail here is due to EOF.
+
+                               ( [^\t\n\f />] [^\t\n\f =/>]*+ )  # 2. Attribute name
+
+                               # After attribute name state
+                               [\t\n\f ]*
+
+                               (?:
+                                       =
+                                       # Before attribute value state
+                                       # Ignore whitespace
+                                       [\t\n\f ]*+
+                                       (?:
+                                               # If an end-quote is omitted, the attribute will run to the end of the
+                                               # string, leaving no closing bracket. So the caller will detect the
+                                               # unexpected EOF and will not emit the tag, which is correct.
+                                               " ( [^"]*+ ) "? |       # 3. Double-quoted attribute value
+                                               \' ( [^\']*+ ) \'? |    # 4. Single-quoted attribute value
+                                               ( [^\t\n\f >]*+ )       # 5. Unquoted attribute value
+                                       )
+                                       # Or nothing: an attribute with an empty value. The attribute name was
+                                       # terminated by a slash, closing bracket or EOF
+                                       |
+                               )
+                       )
+                       # The /A modifier causes preg_match_all to give contiguous chunks
+                       ~xA';
+               $flags = PREG_SET_ORDER | PREG_OFFSET_CAPTURE;
+               $count = preg_match_all( $re, $this->text, $sets, $flags, $this->pos );
+               if ( $count === false ) {
+                       $this->throwPregError();
+               } elseif ( $count ) {
+                       // Move to the end of the last attribute matched
+                       $lastWhole = $sets[$count - 1][0];
+                       $this->pos = $lastWhole[1] + strlen( $lastWhole[0] );
+                       // Defer interpretation of the matches until the attributes are read
+                       $attribs = new LazyAttributes( $sets, function ( $deferred ) {
+                               return $this->interpretAttribMatches( $deferred );
+                       } );
+               } else {
+                       $attribs = new PlainAttributes();
+               }
+
+               // Skip the whitespace which follows the last attribute. Strictly it
+               // belongs to the before attribute name state, but the pattern leaves it
+               // alone so that one match always equals one attribute.
+               $trailing = strspn( $this->text, "\t\n\f ", $this->pos );
+               $this->pos += $trailing;
+               return $attribs;
+       }
+
+       /**
+        * Interpret the results of the attribute preg_match_all(). Emit errors as
+        * appropriate and return an associative array.
+        *
+        * @param array $matches
+        * @return array
+        */
+       protected function interpretAttribMatches( $matches ) {
+               $reportErrors = !$this->ignoreErrors;
+               $attributes = [];
+               foreach ( $matches as $m ) {
+                       if ( strlen( $m[self::MA_SLASH][0] ) ) {
+                               // A bare slash produces no attribute
+                               $this->error( 'unexpected bare slash', $m[self::MA_SLASH][1] );
+                               continue;
+                       }
+
+                       // Attribute name: flag invalid characters, replace nulls, lowercase
+                       list( $rawName, $namePos ) = $m[self::MA_NAME];
+                       if ( $reportErrors ) {
+                               $this->handleAsciiErrors( "\"'<=", $rawName, 0, strlen( $rawName ), $namePos );
+                       }
+                       $name = $rawName;
+                       if ( !$this->ignoreNulls ) {
+                               $name = $this->handleNulls( $rawName, $namePos );
+                       }
+                       $name = strtolower( $name );
+
+                       // Attribute value: find out which of the three forms was matched
+                       $quote = '';
+                       $value = '';
+                       $valuePos = null;
+                       if ( isset( $m[self::MA_DQUOTED] ) && $m[self::MA_DQUOTED][1] >= 0 ) {
+                               // Double-quoted attribute value
+                               $quote = '"';
+                               list( $value, $valuePos ) = $m[self::MA_DQUOTED];
+                       } elseif ( isset( $m[self::MA_SQUOTED] ) && $m[self::MA_SQUOTED][1] >= 0 ) {
+                               // Single-quoted attribute value
+                               $quote = "'";
+                               list( $value, $valuePos ) = $m[self::MA_SQUOTED];
+                       } elseif ( isset( $m[self::MA_UNQUOTED] ) && $m[self::MA_UNQUOTED][1] >= 0 ) {
+                               // Unquoted attribute value
+                               list( $value, $valuePos ) = $m[self::MA_UNQUOTED];
+                               if ( $reportErrors ) {
+                                       if ( $value === '' ) {
+                                               // ">" in the before attribute value state is a parse error
+                                               $this->error( 'empty unquoted attribute', $valuePos );
+                                       }
+                                       $this->handleAsciiErrors( "\"'<=`", $value, 0, strlen( $value ), $valuePos );
+                               }
+                       }
+
+                       if ( $quote !== '' && $reportErrors ) {
+                               // After attribute value (quoted) state: the character after a
+                               // quoted value must be whitespace, "/" or ">"
+                               $followPos = $m[0][1] + strlen( $m[0][0] );
+                               if ( $followPos < $this->length
+                                       && !preg_match( '~^[\t\n\f />]~', $this->text[$followPos] )
+                               ) {
+                                       $this->error( 'missing space between attributes', $followPos );
+                               }
+                       }
+
+                       if ( $value !== '' ) {
+                               if ( !$this->ignoreNulls ) {
+                                       $value = $this->handleNulls( $value, $valuePos );
+                               }
+                               if ( !$this->ignoreCharRefs ) {
+                                       $value = $this->handleCharRefs( $value, $valuePos, true, $quote );
+                               }
+                       }
+
+                       // The first occurrence of an attribute wins
+                       if ( isset( $attributes[$name] ) ) {
+                               $this->error( "duplicate attribute", $m[0][1] );
+                               continue;
+                       }
+                       $attributes[$name] = $value;
+               }
+               return $attributes;
+       }
+
+       /**
+        * Consume attributes, and the closing bracket which follows attributes.
+        * Emit the appropriate tag event, or in the case of broken attributes in
+        * text states, emit characters.
+        *
+        * If the input ends before a closing bracket is found, a parse error is
+        * reported and STATE_EOF is returned. A successfully closed tag always
+        * returns STATE_DATA. When a non-data state sees neither attributes nor
+        * a closing bracket, the tag prefix is emitted as characters and $state
+        * is returned unchanged.
+        *
+        * @param integer $state The current state
+        * @param string $tagName The normalized tag name
+        * @param bool $isEndTag True if this is an end tag, false if it is a start tag
+        * @param integer $startPos The input position of the start of the current tag.
+        * @return integer The next state
+        * @throws TokenizerError If no closing bracket follows the attributes and
+        *   the tag cannot be treated as literal text
+        */
+       protected function handleAttribsAndClose( $state, $tagName, $isEndTag, $startPos ) {
+               $attribStartPos = $this->pos;
+               $attribs = $this->consumeAttribs();
+               $pos = $this->pos;
+
+               // Literal characters are emitted on EOF or "anything else" from the
+               // end tag substates of the text states.
+               // (spec ref 8.2.4 sections 11-19, 25-27)
+               // "Literal" means the position did not move while consuming
+               // attributes and we are not in the data state.
+               $isDataState = $state === self::STATE_DATA;
+               $isLiteral = $attribStartPos === $pos && !$isDataState;
+
+               if ( $pos >= $this->length ) {
+                       $this->error( 'unexpected end of file inside tag' );
+                       if ( $isLiteral ) {
+                               // Emit everything from the start of the tag to the end of input
+                               $this->listener->characters( $this->text,
+                                       $startPos, $this->length - $startPos,
+                                       $startPos, $this->length - $startPos );
+                       }
+                       return self::STATE_EOF;
+               }
+               // The ignoreErrors check is repeated here so that count() is not
+               // called on the attributes when the error would be discarded anyway.
+               if ( $isEndTag && !$this->ignoreErrors && $attribs->count() ) {
+                       $this->error( 'end tag has an attribute' );
+               }
+
+               // $pos is known to be inside the input here, but $pos + 1 may not be:
+               // check the bound before reading the second character so that a
+               // trailing "/" cannot trigger an uninitialized string offset.
+               if ( $this->text[$pos] === '/'
+                       && $pos + 1 < $this->length
+                       && $this->text[$pos + 1] === '>'
+               ) {
+                       $pos += 2;
+                       $selfClose = true;
+               } elseif ( $this->text[$pos] === '>' ) {
+                       $pos++;
+                       $selfClose = false;
+               } elseif ( $isLiteral ) {
+                       // Not a real tag: emit the tag prefix as text and stay in the
+                       // same state. $this->pos is left where it was.
+                       $this->listener->characters( $this->text,
+                               $startPos, $attribStartPos - $startPos,
+                               $startPos, $attribStartPos - $startPos );
+                       return $state;
+               } else {
+                       // fatal() throws, so $selfClose is always set below
+                       $this->fatal( 'failed to find an already-matched ">"' );
+               }
+               $this->pos = $pos;
+               if ( $isEndTag ) {
+                       if ( $selfClose ) {
+                               $this->error( 'self-closing end tag' );
+                       }
+                       $this->listener->endTag( $tagName, $startPos, $pos - $startPos );
+               } else {
+                       $this->listener->startTag( $tagName, $attribs, $selfClose,
+                               $startPos, $pos - $startPos );
+               }
+               return self::STATE_DATA;
+       }
+
+       /**
+        * Handle the PLAINTEXT state: everything from the current position to
+        * the end of the input is emitted as a single raw text range.
+        * @return integer The next state index, always STATE_EOF
+        */
+       protected function plaintextState() {
+               $start = $this->pos;
+               $remaining = $this->length - $start;
+               $this->emitRawTextRange( true, $start, $remaining );
+               return self::STATE_EOF;
+       }
+
+       /**
+        * Process input text in the script data state.
+        *
+        * If no appropriate end tag is set, the remaining input is skipped: the
+        * position is moved to the end of the input and nothing is emitted.
+        * Otherwise, text is matched up to an appropriate end tag or the end of
+        * the input and emitted as raw text. A matched end tag prefix is then
+        * handed to handleAttribsAndClose(), and matching is repeated for as long
+        * as that call returns STATE_SCRIPT_DATA.
+        *
+        * @return integer The next state index
+        */
+       protected function scriptDataState() {
+               if ( $this->appropriateEndTag === null ) {
+                       $this->pos = $this->length;
+                       return self::STATE_EOF;
+               }
+
+               // The pattern is anchored at the offset given to preg_match() (the "A"
+               // modifier) and is case-insensitive. Capture group 1, when present,
+               // holds the "</tagname" prefix of the appropriate end tag.
+               // NOTE(review): the tag name is interpolated without preg_quote();
+               // presumably it never contains pattern metacharacters -- confirm
+               // against the code which sets appropriateEndTag.
+               $re = <<<REGEX
+~
+                       (?: # Outer loop start
+                               # Script data state
+                               # Stop iteration if we previously matched an appropriate end tag.
+                               # This is a conditional subpattern: if capture 1 previously
+                               # matched, then run the pattern /$./ which always fails.
+                               (?(1) $. )
+                               .*?
+                               (?:
+                                       $ |
+                                       (
+                                               </ {$this->appropriateEndTag}
+                                               # If we hit the "anything else" case in the script data
+                                               # end tag name state, don't exit
+                                               (?= [\t\n\f />] )
+                                       ) | # 1. Appropriate end tag
+                                       <!--
+                                       # Script data escaped dash dash state
+                                       # Hyphens at this point are consumed without a state transition
+                                       # and so are not part of a comment-end.
+                                       -*+
+
+                                       (?: # Inner loop start
+                                               # Script data escaped state
+                                               .*?
+                                               (?:
+                                                       $ |
+                                                       # Stop at, but do not consume, comment-close or end tag.
+                                                       # This causes the inner loop to exit, since restarting the
+                                                       # inner loop at this input position will cause the loop
+                                                       # body to match zero characters. Repeating a zero-character
+                                                       # match causes the repeat to terminate.
+                                                       (?= --> ) |
+                                                       (?= </ {$this->appropriateEndTag} [\t\n\f />] ) |
+                                                       <script [\t\n\f />]
+                                                       # Script data double escaped state
+                                                       .*?
+                                                       (?:
+                                                               $ |
+                                                               # Stop at, but do not consume, comment-close
+                                                               (?= --> ) |
+                                                               </script [\t\n\f />]
+                                                       )
+                                               )
+                                       )*
+
+
+                                       # Consume the comment close which exited the inner loop, if any
+                                       (?: --> )?
+                               )
+                       )*+
+                       ~xsiA
+REGEX;
+
+               // The pattern is built once, before the loop, and reused on every pass.
+               do {
+                       $count = preg_match( $re, $this->text, $m, 0, $this->pos );
+                       if ( $count === false ) {
+                               // PCRE itself failed (e.g. a limit was hit)
+                               $this->throwPregError();
+                       } elseif ( !$count ) {
+                               $this->fatal( 'unexpected regex failure: this pattern can match zero characters' );
+                       }
+
+                       // Split the match into the leading script text and the trailing
+                       // end tag prefix. $m[1] is only set when an end tag was matched.
+                       $startPos = $this->pos;
+                       $matchLength = strlen( $m[0] );
+                       $endTagLength = isset( $m[1] ) ? strlen( $m[1] ) : 0;
+                       $textLength = $matchLength - $endTagLength;
+                       $this->emitRawTextRange( true, $startPos, $textLength );
+                       // Advance past the whole match before the attributes are consumed
+                       $this->pos = $startPos + $matchLength;
+                       $tagStartPos = $startPos + $textLength;
+
+                       if ( $endTagLength ) {
+                               $nextState = $this->handleAttribsAndClose( self::STATE_SCRIPT_DATA,
+                                       $this->appropriateEndTag, true, $tagStartPos );
+                       } else {
+                               // No end tag was matched: the match ran to the end of the input
+                               $nextState = self::STATE_EOF;
+                       }
+                       // STATE_SCRIPT_DATA comes back when the candidate end tag was
+                       // emitted as literal characters, so keep scanning.
+               } while ( $nextState === self::STATE_SCRIPT_DATA );
+               return $nextState;
+       }
+
+       /**
+        * Report a parse error to the listener, unless errors are being ignored.
+        * @param string $text The error message
+        * @param integer|null $pos The error position, or null to use the current position
+        */
+       protected function error( $text, $pos = null ) {
+               if ( $this->ignoreErrors ) {
+                       // Error reporting is switched off, nothing to do
+                       return;
+               }
+               $errorPos = $pos === null ? $this->pos : $pos;
+               $this->listener->error( $text, $errorPos );
+       }
+
+       /**
+        * Abort tokenization by throwing a TokenizerError. Intended for API
+        * misuse and for internal sanity checks which should never fail.
+        * @param string $text The error message, which is prefixed with the class name
+        * @throws TokenizerError Always
+        */
+       protected function fatal( $text ) {
+               $message = __CLASS__ . ": " . $text;
+               throw new TokenizerError( $message );
+       }
+
+       /**
+        * Translate preg_last_error() into a message and throw it as a
+        * TokenizerError. Call this when preg_match() or a related function
+        * has returned false.
+        *
+        * Notes for users:
+        *
+        * - PCRE internal error: may be due to JIT stack space exhaustion prior
+        *   to PHP 7, due to excessive recursion. Increase stack space.
+        *
+        * - pcre.backtrack_limit exhausted: The backtrack limit should be at least
+        *   double the input size, the defaults are way too small. Increase it in
+        *   configuration.
+        *
+        * @throws TokenizerError Always
+        */
+       protected function throwPregError() {
+               // Error code => message. Codes which are not listed here, including
+               // the bad UTF-8 errors, get the generic message below.
+               $messages = [
+                       PREG_NO_ERROR => "PCRE returned false but gave PREG_NO_ERROR",
+                       PREG_INTERNAL_ERROR => "PCRE internal error",
+                       PREG_BACKTRACK_LIMIT_ERROR => "pcre.backtrack_limit exhausted",
+                       PREG_RECURSION_LIMIT_ERROR => "pcre.recursion_limit exhausted",
+               ];
+               // This constant is only looked up when the running PHP defines it
+               if ( defined( 'PREG_JIT_STACKLIMIT_ERROR' ) ) {
+                       $messages[PREG_JIT_STACKLIMIT_ERROR] = "PCRE JIT stack space exhausted";
+               }
+
+               $code = preg_last_error();
+               $msg = isset( $messages[$code] ) ? $messages[$code] : "PCRE unexpected error";
+
+               throw new TokenizerError( __CLASS__.": $msg" );
+       }
+}
+