--- /dev/null
+<?php
+
+namespace RemexHtml\Tokenizer;
+use RemexHtml\HTMLData;
+use RemexHtml\PropGuard;
+
+/**
+ * HTML 5 tokenizer
+ *
+ * Based on the W3C recommendation as published 01 November 2016:
+ * https://www.w3.org/TR/2016/REC-html51-20161101/
+ */
+class Tokenizer {
+ // States. executeInternal() dispatches on $this->state, which holds one of
+ // these values. STATE_START is replaced by STATE_DATA on the first
+ // iteration, and STATE_EOF causes endDocument() to be sent to the listener.
+ const STATE_START = 1;
+ const STATE_DATA = 2;
+ const STATE_RCDATA = 3;
+ const STATE_RAWTEXT = 4;
+ const STATE_SCRIPT_DATA = 5;
+ const STATE_PLAINTEXT = 6;
+ const STATE_EOF = 7;
+ // NOTE(review): STATE_CURRENT is not handled by executeInternal(); confirm
+ // its meaning against the code that uses it.
+ const STATE_CURRENT = 8;
+
+ // Match indices for the data state regex. Each value is the number of the
+ // capture group in the pattern built by dataState(), as numbered in that
+ // pattern's inline comments.
+ const MD_END_TAG_OPEN = 1;
+ const MD_TAG_NAME = 2;
+ const MD_COMMENT = 3;
+ const MD_COMMENT_INNER = 4;
+ const MD_COMMENT_END = 5;
+ const MD_DOCTYPE = 6;
+ const MD_DT_NAME_WS = 7;
+ const MD_DT_NAME = 8;
+ const MD_DT_PUBLIC_WS = 9;
+ const MD_DT_PUBLIC_DQ = 10;
+ const MD_DT_PUBLIC_SQ = 11;
+ const MD_DT_PUBSYS_WS = 12;
+ const MD_DT_PUBSYS_DQ = 13;
+ const MD_DT_PUBSYS_SQ = 14;
+ const MD_DT_SYSTEM_WS = 15;
+ const MD_DT_SYSTEM_DQ = 16;
+ const MD_DT_SYSTEM_SQ = 17;
+ const MD_DT_BOGUS = 18;
+ const MD_DT_END = 19;
+ const MD_CDATA = 20;
+ const MD_BOGUS_COMMENT = 21;
+
+ // Match indices for the character reference regex, i.e. the capture group
+ // numbers of the pattern built by handleCharRefs().
+ const MC_PREFIX = 1;
+ const MC_DECIMAL = 2;
+ const MC_HEXDEC = 3;
+ const MC_SEMICOLON = 4;
+ const MC_HASH = 5;
+ const MC_NAMED = 6;
+ const MC_SUFFIX = 7;
+ const MC_INVALID = 8;
+
+ // Match indices for the attribute regex
+ // NOTE(review): the attribute regex is defined outside the reviewed part of
+ // the class; confirm these numbers against it.
+ const MA_SLASH = 1;
+ const MA_NAME = 2;
+ const MA_DQUOTED = 3;
+ const MA_SQUOTED = 4;
+ const MA_UNQUOTED = 5;
+
+ // Characters
+ // UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
+ const REPLACEMENT_CHAR = "\xef\xbf\xbd";
+ // UTF-8 encoding of U+FEFF, the byte order mark
+ const BYTE_ORDER_MARK = "\xef\xbb\xbf";
+
+ protected $ignoreErrors;
+ protected $ignoreCharRefs;
+ protected $ignoreNulls;
+ protected $skipPreprocess;
+ // Read by setFragmentContext() to decide how a "noscript" context element
+ // is tokenized.
+ protected $scriptingFlag;
+ protected $appropriateEndTag;
+ protected $listener;
+ protected $state;
+ protected $preprocessed;
+ protected $text;
+ protected $pos;
+ protected $length;
+ protected $enableCdataCallback;
+ protected $fragmentNamespace;
+ protected $fragmentName;
+
+ /**
+ * Constructor
+ *
+ * @param TokenHandler $listener The object which receives token events
+ * @param string $text The text to tokenize
+ * @param array $options Associative array of options, all of which are
+ * optional and default to false:
+ * - ignoreErrors: True to improve performance by ignoring errors. The
+ * token stream should still be the same, except that error() won't be
+ * called.
+ * - ignoreCharRefs: True to ignore character references. Character tokens
+ * will contain the unexpanded character references, and no errors
+ * related to invalid character references will be raised. Performance
+ * will be improved. This is not compliant behaviour.
+ * - ignoreNulls: True to ignore NULL bytes in the input stream, instead
+ * of raising errors and converting them to U+FFFD as is usually
+ * required by the spec.
+ * - skipPreprocess: True to skip the "preprocessing the input stream"
+ * stage, which normalizes line endings and raises errors on certain
+ * control characters. Advisable if the input stream is already
+ * appropriately normalized.
+ * - scriptingFlag: True if scripting is to be treated as enabled, in
+ * which case a "noscript" fragment context element is tokenized as
+ * raw text by setFragmentContext().
+ */
+ public function __construct( TokenHandler $listener, $text, $options = [] ) {
+ $this->listener = $listener;
+ $this->text = $text;
+ $this->pos = 0;
+ $this->preprocessed = false;
+ $this->length = strlen( $text );
+ // empty() treats a missing key the same as a false value, so every
+ // option is off unless explicitly enabled.
+ $this->ignoreErrors = !empty( $options['ignoreErrors'] );
+ $this->ignoreCharRefs = !empty( $options['ignoreCharRefs'] );
+ $this->ignoreNulls = !empty( $options['ignoreNulls'] );
+ $this->skipPreprocess = !empty( $options['skipPreprocess'] );
+ $this->scriptingFlag = !empty( $options['scriptingFlag'] );
+ }
+
+ /**
+ * Magic setter. PHP calls this for writes to properties which are
+ * undeclared or not accessible from the calling scope; the write is handed
+ * to PropGuard::set().
+ * NOTE(review): PropGuard presumably rejects such writes -- confirm against
+ * the PropGuard class.
+ *
+ * @param string $name The property name
+ * @param mixed $value The value being assigned
+ */
+ public function __set( $name, $value ) {
+ PropGuard::set( $this, $name, $value );
+ }
+
+ /**
+ * Set the callback which decides whether "<![CDATA[" opens a CDATA section.
+ *
+ * dataState() invokes the callback with no arguments when it finds
+ * "<![CDATA[" in the input. A truthy return value makes the text up to
+ * "]]>" a CDATA section; a falsy value, or having no callback set, makes
+ * it a bogus comment.
+ *
+ * @param callable|null $cb The callback
+ */
+ public function setEnableCdataCallback( $cb ) {
+ $this->enableCdataCallback = $cb;
+ }
+
+ /**
+ * Run the tokenizer on the whole input stream. This is the normal entry point.
+ *
+ * The input is preprocessed if necessary, startDocument() is sent to the
+ * listener, and the state machine is then run until it reaches EOF.
+ *
+ * @param array $options An associative array of options:
+ * - state : One of the STATE_* constants, a state in which to start.
+ * Defaults to STATE_START.
+ * - appropriateEndTag : The "appropriate end tag", which needs to be set
+ * if entering one of the raw text states.
+ * - fragmentNamespace : The fragment namespace. When set, the fragment
+ * context is applied with setFragmentContext(), which may override
+ * the start state.
+ * - fragmentName : The fragment tag name. Only used when
+ * fragmentNamespace is set; treated as null if omitted.
+ */
+ public function execute( $options = [] ) {
+ $this->state = isset( $options['state'] ) ? $options['state'] : self::STATE_START;
+
+ if ( isset( $options['fragmentNamespace'] ) ) {
+ // Tolerate a namespace given without a tag name instead of reading
+ // an undefined array index.
+ $fragmentName = isset( $options['fragmentName'] ) ? $options['fragmentName'] : null;
+ $this->setFragmentContext( $options['fragmentNamespace'], $fragmentName );
+ } else {
+ $this->fragmentNamespace = null;
+ $this->fragmentName = null;
+ }
+ $this->appropriateEndTag = isset( $options['appropriateEndTag'] ) ?
+ $options['appropriateEndTag'] : null;
+ $this->preprocess();
+ $this->listener->startDocument( $this, $this->fragmentNamespace, $this->fragmentName );
+
+ $this->executeInternal( true );
+ }
+
+ /**
+ * Get the preprocessed input text. Source offsets in event parameters are
+ * relative to this string. If skipPreprocess was specified, this will be
+ * the same as the input string.
+ *
+ * preprocess() is called first, so this may be used before execute() or
+ * beginStepping().
+ *
+ * @return string
+ */
+ public function getPreprocessedText() {
+ $this->preprocess();
+ return $this->text;
+ }
+
+ /**
+ * Change the state of the tokenizer during parsing. This is for use by the
+ * tree builder to switch the tokenizer into one of the raw text states.
+ *
+ * Both values are stored unconditionally; no validation is done here.
+ *
+ * @param integer $state One of the STATE_* constants
+ * @param string $appropriateEndTag The appropriate end tag
+ */
+ public function switchState( $state, $appropriateEndTag ) {
+ $this->state = $state;
+ $this->appropriateEndTag = $appropriateEndTag;
+ }
+
+ /**
+ * Initialize the tokenizer for fragment parsing.
+ *
+ * The context is always recorded. If the context element is in the HTML
+ * namespace (or the namespace is empty), the tokenizer state is also
+ * switched to the state appropriate to that element; for any other
+ * element or namespace the state is left as it was.
+ *
+ * @param string $namespace The namespace of the context element
+ * @param string $tagName The name of the context element
+ */
+ public function setFragmentContext( $namespace, $tagName ) {
+ $this->fragmentNamespace = $namespace;
+ $this->fragmentName = $tagName;
+
+ // Only HTML elements select a special tokenizer state
+ if ( strval( $namespace ) !== '' && $namespace !== HTMLData::NS_HTML ) {
+ return;
+ }
+
+ switch ( $tagName ) {
+ case 'title':
+ case 'textarea':
+ $this->state = self::STATE_RCDATA;
+ break;
+
+ case 'style':
+ case 'xmp':
+ case 'iframe':
+ case 'noembed':
+ case 'noframes':
+ $this->state = self::STATE_RAWTEXT;
+ break;
+
+ case 'script':
+ $this->state = self::STATE_SCRIPT_DATA;
+ break;
+
+ case 'noscript':
+ // empty() is used so that an object on which no scriptingFlag
+ // property has been declared or set is treated as having scripting
+ // disabled, rather than triggering an undefined property notice.
+ if ( !empty( $this->scriptingFlag ) ) {
+ $this->state = self::STATE_RAWTEXT;
+ }
+ break;
+
+ case 'plaintext':
+ $this->state = self::STATE_PLAINTEXT;
+ break;
+ }
+ }
+
+ /**
+ * Notify the tokenizer that the document will be tokenized by repeated step()
+ * calls. This must be called once only, before the first call to step().
+ *
+ * The state is reset to STATE_START, the input is preprocessed if needed,
+ * and startDocument() is sent to the listener with no fragment context.
+ */
+ public function beginStepping() {
+ $this->state = self::STATE_START;
+ $this->preprocess();
+ $this->listener->startDocument( $this, null, null );
+ }
+
+ /**
+ * Tokenize a minimum amount of text from the input stream, and emit the
+ * resulting events.
+ *
+ * A null state means that neither beginStepping() nor execute() has
+ * initialized the tokenizer, which is reported through fatal().
+ *
+ * @return bool True if the input continues and step() should be called
+ * again, false on EOF
+ */
+ public function step() {
+ if ( $this->state === null ) {
+ $this->fatal( "beginStepping() must be called before step()" );
+ }
+ return $this->executeInternal( false );
+ }
+
+ /**
+ * Preprocess the input text, if it hasn't been done already.
+ *
+ * Line endings are normalized to "\n" and, unless errors are ignored, a
+ * parse error is raised for each disallowed control character or
+ * noncharacter. Nothing is done if the skipPreprocess option was given.
+ */
+ protected function preprocess() {
+ if ( $this->preprocessed || $this->skipPreprocess ) {
+ return;
+ }
+ // Record that preprocessing has happened, so that later calls (for
+ // example getPreprocessedText() after execute()) do not scan the text
+ // again and report every control character error a second time.
+ $this->preprocessed = true;
+
+ // Normalize line endings
+ $this->text = strtr( $this->text, [
+ "\r\n" => "\n",
+ "\r" => "\n" ] );
+ $this->length = strlen( $this->text );
+
+ // Raise parse errors for any control characters
+ if ( !$this->ignoreErrors ) {
+ $pos = 0;
+ $re = '/[' .
+ '\x{0001}-\x{0008}' .
+ '\x{000E}-\x{001F}' .
+ '\x{007F}-\x{009F}' .
+ '\x{FDD0}-\x{FDEF}' .
+ '\x{000B}' .
+ '\x{FFFE}\x{FFFF}' .
+ '\x{1FFFE}\x{1FFFF}' .
+ '\x{2FFFE}\x{2FFFF}' .
+ '\x{3FFFE}\x{3FFFF}' .
+ '\x{4FFFE}\x{4FFFF}' .
+ '\x{5FFFE}\x{5FFFF}' .
+ '\x{6FFFE}\x{6FFFF}' .
+ '\x{7FFFE}\x{7FFFF}' .
+ '\x{8FFFE}\x{8FFFF}' .
+ '\x{9FFFE}\x{9FFFF}' .
+ '\x{AFFFE}\x{AFFFF}' .
+ '\x{BFFFE}\x{BFFFF}' .
+ '\x{CFFFE}\x{CFFFF}' .
+ '\x{DFFFE}\x{DFFFF}' .
+ '\x{EFFFE}\x{EFFFF}' .
+ '\x{FFFFE}\x{FFFFF}' .
+ '\x{10FFFE}\x{10FFFF}]/u';
+ while ( $pos < $this->length ) {
+ $count = preg_match( $re, $this->text, $m, PREG_OFFSET_CAPTURE, $pos );
+ if ( $count === false ) {
+ // With the /u modifier, PCRE fails on malformed UTF-8 input
+ $this->fatal( "Invalid UTF-8 sequence given to Tokenizer" );
+ } elseif ( !$count ) {
+ break;
+ }
+ $pos = $m[0][1];
+ $this->error( "disallowed control character", $pos );
+ // Resume the search after the (possibly multi-byte) character
+ $pos += strlen( $m[0][0] );
+ }
+ }
+ }
+
+ /**
+ * Run the state machine shared by step() and execute().
+ *
+ * Each pass dispatches once on the current state and stores the state
+ * returned by the handler. Reaching STATE_EOF sends endDocument() to the
+ * listener and ends the run.
+ *
+ * @param bool $loop True to keep going until EOF, false to do a single pass.
+ * @return bool True if the input continues, false on EOF
+ */
+ protected function executeInternal( $loop ) {
+ $finished = false;
+
+ do {
+ switch ( $this->state ) {
+ case self::STATE_START:
+ // Nothing to consume yet, just enter the data state
+ $this->state = self::STATE_DATA;
+ break;
+
+ case self::STATE_DATA:
+ $this->state = $this->dataState( $loop );
+ break;
+
+ case self::STATE_RCDATA:
+ // RCDATA expands character references
+ $this->state = $this->textElementState( false );
+ break;
+
+ case self::STATE_RAWTEXT:
+ // RAWTEXT never expands character references
+ $this->state = $this->textElementState( true );
+ break;
+
+ case self::STATE_SCRIPT_DATA:
+ $this->state = $this->scriptDataState();
+ break;
+
+ case self::STATE_PLAINTEXT:
+ $this->state = $this->plaintextState();
+ break;
+
+ case self::STATE_EOF:
+ $this->listener->endDocument( $this->length );
+ $finished = true;
+ break;
+
+ default:
+ $this->fatal( 'invalid state' );
+ }
+ } while ( $loop && !$finished );
+
+ return !$finished;
+ }
+
+ /**
+ * Consume input text starting from the "data state".
+ *
+ * A single regex locates the next markup construct introduced by "<": a
+ * start or end tag, a comment, a DOCTYPE, a CDATA section or a bogus
+ * comment. The text before it is emitted as a character token, then the
+ * construct itself is interpreted and the matching listener event is sent.
+ * If nothing matches, the rest of the input is emitted as text.
+ *
+ * @param bool $loop True to loop while still in the data state, false to
+ * process a single less-than sign.
+ * @return integer The next state index
+ */
+ protected function dataState( $loop ) {
+ $re = "~ <
+ (?:
+ ( /? ) # 1. End tag open
+
+ ( # 2. Tag name
+ # Try to match the ASCII letter required for the start of a start
+ # or end tag. If this fails, a slash matched above can be
+ # backtracked and then fed into the bogus comment alternative below.
+ [a-zA-Z]
+
+ # Then capture the rest of the tag name
+ [^\t\n\f />]*
+ ) |
+
+ # Comment
+ !--
+ ( # 3. Comment match detector
+ > | -> | # Invalid short close
+ ( # 4. Comment contents
+ (?:
+ (?! --> )
+ (?! --!> )
+ (?! --! \\z )
+ (?! -- \\z )
+ (?! - \\z )
+ .
+ )*+
+ )
+ ( # 5. Comment close
+ --> | # Normal close
+ --!> | # Comment end bang
+ --! | # EOF in comment end bang state
+ -- | # EOF in comment end state
+ - | # EOF in comment end dash state
+ # EOF in comment state
+ )
+ ) |
+ ( (?i) # 6. Doctype
+ ! DOCTYPE
+
+ # There must be at least one whitespace character to suppress
+ # a parse error, but if there isn't one, this is still a
+ # DOCTYPE. There is no way for the DOCTYPE string to end up
+ # as a character node, the DOCTYPE subexpression must always
+ # wholly match if we matched up to this point.
+
+ ( [\t\n\f ]*+ ) # 7. Required whitespace
+ ( [^\t\n\f >]*+ ) # 8. DOCTYPE name
+ [\t\n\f ]*+
+ (?:
+ # After DOCTYPE name state
+ PUBLIC
+ ( [\t\n\f ]* ) # 9. Required whitespace
+ (?:
+ \" ( [^\">]* ) \"? | # 10. Double-quoted identifier
+ ' ( [^'>]* ) '? | # 11. Single-quoted identifier
+ # Non-match: bogus
+ )
+ (?:
+ # After DOCTYPE public identifier state
+ # Assert quoted identifier before here
+ (?<= \" | ' )
+ ( [\t\n\f ]* ) # 12. Required whitespace
+ (?:
+ \" ( [^\">]* ) \"? | # 13. Double-quoted identifier
+ ' ( [^'>]* ) '? | # 14. Single-quoted identifier
+ # Non-match: no system ID
+ )
+ )?
+ |
+ SYSTEM
+ ( [\t\n\f ]* ) # 15. Required whitespace
+ (?:
+ \" ( [^\">]* ) \"? | # 16. Double-quoted identifier
+ ' ( [^'>]* ) '? | # 17. Single-quoted identifier
+ # Non-match: bogus
+ )
+ | # No keyword is OK
+ )
+ [\t\n\f ]*
+ ( [^>]*+ ) # 18. Bogus DOCTYPE
+ ( >? ) # 19. End of DOCTYPE
+ ) |
+ ( ! \[CDATA\[ ) | # 20. CDATA section
+ ( [!?/] [^>]*+ ) >? # 21. Bogus comment
+
+ # Anything else: parse error and emit literal less-than sign.
+ # We will let the match fail at this position and later check
+ # for less-than signs in the resulting text node.
+ )
+ ~xs";
+
+ $nextState = self::STATE_DATA;
+ do {
+ $count = preg_match( $re, $this->text, $m, PREG_OFFSET_CAPTURE, $this->pos );
+ if ( $count === false ) {
+ $this->throwPregError();
+ } elseif ( !$count ) {
+ // Text runs to end
+ $this->emitDataRange( $this->pos, $this->length - $this->pos );
+ $this->pos = $this->length;
+ $nextState = self::STATE_EOF;
+ break;
+ }
+
+ $startPos = $m[0][1];
+ $tagName = isset( $m[self::MD_TAG_NAME] ) ? $m[self::MD_TAG_NAME][0] : '';
+
+ // Emit the text which precedes the matched construct
+ $this->emitDataRange( $this->pos, $startPos - $this->pos );
+ $this->pos = $startPos;
+ $nextPos = $m[0][1] + strlen( $m[0][0] );
+
+ // With PREG_OFFSET_CAPTURE, a group which did not participate in the
+ // match has an offset of -1, hence the ">= 0" tests below.
+ $haveCdata = isset( $m[self::MD_CDATA] ) && $m[self::MD_CDATA][1] >= 0;
+
+ // Ask the callback once whether CDATA sections are currently
+ // allowed. Without a callback they never are.
+ $isCdata = $haveCdata
+ && $this->enableCdataCallback
+ && call_user_func( $this->enableCdataCallback );
+
+ if ( strlen( $tagName ) ) {
+ // Tag
+ $isEndTag = (bool)strlen( $m[self::MD_END_TAG_OPEN][0] );
+ if ( !$this->ignoreNulls ) {
+ $tagName = $this->handleNulls( $tagName, $m[self::MD_TAG_NAME][1] );
+ }
+ $tagName = strtolower( $tagName );
+ $this->pos = $nextPos;
+ $nextState = $this->handleAttribsAndClose( self::STATE_DATA,
+ $tagName, $isEndTag, $startPos );
+ $nextPos = $this->pos;
+ if ( $nextState === self::STATE_EOF ) {
+ break;
+ }
+
+ // Respect any state switch imposed by the parser
+ $nextState = $this->state;
+
+ } elseif ( isset( $m[self::MD_COMMENT] ) && $m[self::MD_COMMENT][1] >= 0 ) {
+ // Comment
+ $this->interpretCommentMatches( $m );
+ } elseif ( isset( $m[self::MD_DOCTYPE] ) && $m[self::MD_DOCTYPE][1] >= 0 ) {
+ // DOCTYPE
+ $this->interpretDoctypeMatches( $m );
+ } elseif ( $haveCdata ) {
+ // CDATA
+ if ( $isCdata ) {
+ // Skip the "<" plus the matched "![CDATA["
+ $this->pos += strlen( $m[self::MD_CDATA][0] ) + 1;
+ $endPos = strpos( $this->text, ']]>', $this->pos );
+ if ( $endPos === false ) {
+ $this->emitCdataRange( $this->pos, $this->length - $this->pos,
+ $startPos, $this->length - $startPos );
+ $this->pos = $this->length;
+ $nextState = self::STATE_EOF;
+ break;
+ } else {
+ $outerEndPos = $endPos + strlen( ']]>' );
+ $this->emitCdataRange( $this->pos, $endPos - $this->pos,
+ $startPos, $outerEndPos - $startPos );
+ $nextPos = $outerEndPos;
+ }
+ } else {
+ // Bogus comment
+ $this->error( "unexpected CDATA interpreted as bogus comment" );
+ $endPos = strpos( $this->text, '>', $this->pos );
+ // The comment contents start after "<!"
+ $bogusPos = $this->pos + 2;
+ if ( $endPos === false ) {
+ $nextPos = $this->length;
+ $contents = substr( $this->text, $bogusPos );
+ } else {
+ $nextPos = $endPos + 1;
+ $contents = substr( $this->text, $bogusPos, $endPos - $bogusPos );
+ }
+ $contents = $this->handleNulls( $contents, $bogusPos );
+ // The source range runs up to $nextPos, so that it includes the
+ // closing ">" and is still well defined when the comment was
+ // terminated by the end of the input ($endPos is false then).
+ $this->listener->comment( $contents, $this->pos, $nextPos - $this->pos );
+ }
+ } elseif ( isset ( $m[self::MD_BOGUS_COMMENT] ) && $m[self::MD_BOGUS_COMMENT][1] >= 0 ) {
+ // Bogus comment
+ $contents = $m[self::MD_BOGUS_COMMENT][0];
+ $bogusPos = $m[self::MD_BOGUS_COMMENT][1];
+ if ( $m[0][0] === '</>' ) {
+ $this->error( "empty end tag" );
+ // No token emitted
+ } elseif ( $m[0][0] === '</' ) {
+ $this->error( 'EOF in end tag' );
+ $this->listener->characters( '</', 0, 2, $m[0][1], 2 );
+ } else {
+ $this->error( "unexpected <{$contents[0]} interpreted as bogus comment" );
+ if ( $contents[0] !== '?' ) {
+ // For starting types other than <?, the initial character is
+ // not in the tag contents
+ $contents = substr( $contents, 1 );
+ $bogusPos++;
+ }
+
+ $contents = $this->handleNulls( $contents, $bogusPos );
+ $this->listener->comment( $contents, $startPos, $nextPos - $startPos );
+ }
+ } else {
+ $this->fatal( 'unexpected data state match' );
+ }
+ $this->pos = $nextPos;
+ } while ( $loop && $nextState === self::STATE_DATA );
+
+ return $nextState;
+ }
+
+ /**
+ * Interpret the data state match results for a detected comment, and emit
+ * events as appropriate.
+ *
+ * Exactly one comment event is sent. Unless errors are ignored, parse
+ * errors are raised for an abnormal comment close, for extra dashes just
+ * before the close and for "--" sequences inside the comment.
+ *
+ * @param array $m The match array
+ */
+ protected function interpretCommentMatches( $m ) {
+ $outerStart = $m[0][1];
+ $outerLength = strlen( $m[0][0] );
+ $innerStart = $outerStart + strlen( '<!--' );
+ $innerLength = isset( $m[self::MD_COMMENT_INNER] ) ? strlen( $m[self::MD_COMMENT_INNER][0] ) : 0;
+ // The comment text exactly as it appears in the input. Error scanning
+ // below uses this copy, because NUL replacement changes byte offsets.
+ $rawContents = $innerLength ? $m[self::MD_COMMENT_INNER][0] : '';
+ $contents = $rawContents;
+
+ if ( $m[0][0] === '<!-->' || $m[0][0] === '<!--->' ) {
+ // These are special cases in the comment start state
+ $this->error( 'not enough dashes in empty comment', $outerStart );
+ $this->listener->comment( '', $outerStart, $outerLength );
+ return;
+ }
+
+ if ( !$this->ignoreNulls ) {
+ $contents = $this->handleNulls( $contents, $innerStart );
+ }
+ $close = $m[self::MD_COMMENT_END][0];
+ $closePos = $m[self::MD_COMMENT_END][1];
+
+ if ( !$this->ignoreErrors ) {
+ if ( $close === '--!>' ) {
+ $this->error( 'invalid comment end bang', $closePos );
+ } elseif ( $close === '-' || $close === '--' || $close === '--!' ) {
+ $this->error( 'EOF part way through comment close', $closePos );
+ } elseif ( $close === '' ) {
+ $this->error( 'EOF in comment', $closePos );
+ }
+
+ // Dashes immediately before the comment close
+ $dashSearchLength = $innerLength;
+ while ( $dashSearchLength > 0 && $rawContents[$dashSearchLength - 1] === '-' ) {
+ $this->error( 'invalid extra dash at comment end',
+ $innerStart + $dashSearchLength - 1 );
+ $dashSearchLength--;
+ }
+
+ // Double dashes within the comment
+ $offset = 0;
+ while ( $offset !== false && $offset < $dashSearchLength ) {
+ $offset = strpos( $rawContents, '--', $offset );
+ if ( $offset !== false ) {
+ $this->error( 'bare "--" found in comment', $innerStart + $offset );
+ $offset += 2;
+ }
+ }
+ }
+
+ $this->listener->comment( $contents, $outerStart, $outerLength );
+ }
+
+ /**
+ * Interpret the data state match results for a detected DOCTYPE token,
+ * and emit events as appropriate.
+ *
+ * The name, public identifier and system identifier are extracted from the
+ * match, the quirks flag is derived from what is missing or malformed, and
+ * a single doctype event is sent to the listener.
+ *
+ * @param array $m The match array
+ */
+ protected function interpretDoctypeMatches( $m ) {
+ $igerr = $this->ignoreErrors;
+ $name = null;
+ $public = null;
+ $system = null;
+ $quirks = false;
+
+ // Missing ">" can only be caused by EOF
+ $eof = !strlen( $m[self::MD_DT_END][0] );
+
+ if ( strlen( $m[self::MD_DT_BOGUS][0] ) ) {
+ // Bogus DOCTYPE state
+ if ( !$igerr ) {
+ $this->error( 'invalid DOCTYPE contents', $m[self::MD_DT_BOGUS][1] );
+ }
+ // Set quirks mode unless there was a properly quoted SYSTEM identifier
+ // NOTE(review): only the identifier following the SYSTEM keyword is
+ // checked here, not a system identifier following a PUBLIC
+ // identifier -- confirm that this is intended.
+ $haveDq = isset( $m[self::MD_DT_SYSTEM_DQ] ) && $m[self::MD_DT_SYSTEM_DQ][1] >= 0;
+ $haveSq = isset( $m[self::MD_DT_SYSTEM_SQ] ) && $m[self::MD_DT_SYSTEM_SQ][1] >= 0;
+ if ( !$haveDq && !$haveSq ) {
+ $quirks = true;
+ }
+ // EOF in the bogus state does not set quirks mode (but it is a parse error)
+ if ( $eof && !$igerr ) {
+ $this->error( 'unterminated DOCTYPE' );
+ }
+ } elseif ( $eof ) {
+ if ( !$igerr ) {
+ $this->error( 'unterminated DOCTYPE' );
+ }
+ $quirks = true;
+ }
+
+ if ( !$igerr && !$eof && !strlen( $m[self::MD_DT_NAME_WS][0] ) ) {
+ $this->error( 'missing whitespace', $m[self::MD_DT_NAME_WS][1] );
+ }
+
+ if ( strlen( $m[self::MD_DT_NAME][0] ) ) {
+ // DOCTYPE name
+ $name = $this->handleNulls( strtolower( $m[self::MD_DT_NAME][0] ), $m[self::MD_DT_NAME][1] );
+ } else {
+ if ( !$eof && !$igerr ) {
+ $this->error( 'missing DOCTYPE name',
+ $m[self::MD_DOCTYPE][1] + strlen( '!DOCTYPE' ) );
+ }
+ $quirks = true;
+ }
+
+ if ( isset( $m[self::MD_DT_PUBLIC_WS] ) && $m[self::MD_DT_PUBLIC_WS][1] >= 0 ) {
+ // PUBLIC keyword found
+ $public = $this->interpretDoctypeQuoted( $m,
+ self::MD_DT_PUBLIC_DQ, self::MD_DT_PUBLIC_SQ, $quirks );
+ if ( $public === null ) {
+ $quirks = true;
+ if ( !$eof && !$igerr ) {
+ $this->error( 'missing public identifier', $m[self::MD_DT_PUBLIC_WS][1] );
+ }
+ } elseif ( !$igerr && !$eof && !strlen( $m[self::MD_DT_PUBLIC_WS][0] ) ) {
+ $this->error( 'missing whitespace', $m[self::MD_DT_PUBLIC_WS][1] );
+ }
+
+ // Check for a system ID after the public ID
+ $haveDq = isset( $m[self::MD_DT_PUBSYS_DQ] ) && $m[self::MD_DT_PUBSYS_DQ][1] >= 0;
+ $haveSq = isset( $m[self::MD_DT_PUBSYS_SQ] ) && $m[self::MD_DT_PUBSYS_SQ][1] >= 0;
+ if ( $haveDq || $haveSq ) {
+ if ( !$igerr && !strlen( $m[self::MD_DT_PUBSYS_WS][0] ) ) {
+ $this->error( 'missing whitespace', $m[self::MD_DT_PUBSYS_WS][1] );
+ }
+ $system = $this->interpretDoctypeQuoted( $m,
+ self::MD_DT_PUBSYS_DQ, self::MD_DT_PUBSYS_SQ, $quirks );
+ }
+ } elseif ( isset( $m[self::MD_DT_SYSTEM_WS] ) && $m[self::MD_DT_SYSTEM_WS][1] >= 0 ) {
+ // SYSTEM keyword found
+ $system = $this->interpretDoctypeQuoted( $m,
+ self::MD_DT_SYSTEM_DQ, self::MD_DT_SYSTEM_SQ, $quirks );
+ if ( $system === null ) {
+ $quirks = true;
+ // As for the public identifier: at EOF the unterminated DOCTYPE
+ // error raised above already covers this, so don't report twice.
+ if ( !$eof && !$igerr ) {
+ $this->error( 'missing system identifier', $m[self::MD_DT_SYSTEM_WS][1] );
+ }
+ } elseif ( !$igerr && !strlen( $m[self::MD_DT_SYSTEM_WS][0] ) ) {
+ $this->error( 'missing whitespace', $m[self::MD_DT_SYSTEM_WS][1] );
+ }
+
+ }
+ $this->listener->doctype( $name, $public, $system, $quirks, $m[0][1], strlen( $m[0][0] ) );
+ }
+
+ /**
+ * DOCTYPE helper which extracts a quoted identifier from the match array.
+ *
+ * The double-quoted capture group is preferred over the single-quoted one.
+ * Quirks mode is switched on if the identifier runs up to the end of the
+ * input, or if it was cut short by a ">".
+ *
+ * @param array $m The match array, captured with PREG_OFFSET_CAPTURE
+ * @param integer $dq Index of the double-quoted capture group
+ * @param integer $sq Index of the single-quoted capture group
+ * @param bool &$quirks Set to true if the identifier is unterminated
+ * @return string|null The quoted value with nulls replaced, or null if
+ * neither group matched.
+ */
+ protected function interpretDoctypeQuoted( $m, $dq, $sq, &$quirks ) {
+ // Pick the first capture group which took part in the match
+ $capture = null;
+ foreach ( [ $dq, $sq ] as $index ) {
+ if ( isset( $m[$index] ) && $m[$index][1] >= 0 ) {
+ $capture = $m[$index];
+ break;
+ }
+ }
+ if ( $capture === null ) {
+ return null;
+ }
+
+ list( $identifier, $identifierPos ) = $capture;
+ $afterPos = $identifierPos + strlen( $identifier );
+ if ( $afterPos >= $this->length ) {
+ // This is a parse error, but we already emitted a generic EOF error
+ $quirks = true;
+ } elseif ( $this->text[$afterPos] === '>' ) {
+ $this->error( 'DOCTYPE identifier terminated by ">"', $afterPos );
+ $quirks = true;
+ }
+ return $this->handleNulls( $identifier, $identifierPos );
+ }
+
+ /**
+ * Replace every U+0000 in a string with U+FFFD, raising one parse error
+ * per replaced character. This is the shared helper for all the points in
+ * the spec which require that substitution.
+ *
+ * When the ignoreNulls option is set, the text is returned untouched and
+ * no errors are raised.
+ *
+ * @param string $text The text to be converted
+ * @param integer $sourcePos The input byte offset from which $text was
+ * extracted, for error position reporting.
+ * @return string The converted text
+ */
+ protected function handleNulls( $text, $sourcePos ) {
+ if ( $this->ignoreNulls ) {
+ return $text;
+ }
+ if ( !$this->ignoreErrors ) {
+ $textLength = strlen( $text );
+ // Visit each NUL in turn. The loop bound keeps the search offset
+ // inside the string once the last byte has been examined.
+ for ( $searchFrom = 0; $searchFrom < $textLength; $searchFrom = $nullPos + 1 ) {
+ $nullPos = strpos( $text, "\0", $searchFrom );
+ if ( $nullPos === false ) {
+ break;
+ }
+ $this->error( "replaced null character", $sourcePos + $nullPos );
+ }
+ }
+ return str_replace( "\0", self::REPLACEMENT_CHAR, $text );
+ }
+
+ /**
+ * Raise a parse error for each occurrence of certain ASCII characters
+ * within a range of text. The text itself is not modified. This is the
+ * shared helper for the points in the spec which flag a character as an
+ * error and take no other action.
+ *
+ * @param string $mask Mask for strcspn, listing the characters to flag
+ * @param string $text The input text
+ * @param integer $offset The start of the range within $text to search
+ * @param integer $length The length of the range within $text to search
+ * @param integer $sourcePos The offset within the input text corresponding
+ * to $text, for error position reporting.
+ */
+ protected function handleAsciiErrors( $mask, $text, $offset, $length, $sourcePos ) {
+ $end = $offset + $length;
+ while ( $offset < $end ) {
+ // Skip over the run of characters which are not in the mask
+ $offset += strcspn( $text, $mask, $offset, $end - $offset );
+ if ( $offset >= $end ) {
+ break;
+ }
+
+ $char = $text[$offset];
+ $codepoint = ord( $char );
+ // Show unprintable characters by code point instead of literally
+ $isPrintable = $codepoint >= 0x20 && $codepoint < 0x7f;
+ $message = $isPrintable
+ ? "unexpected \"$char\""
+ : sprintf( 'unexpected U+00%02X', $codepoint );
+ $this->error( $message, $offset + $sourcePos );
+ $offset++;
+ }
+ }
+
+ /**
+ * Expand character references in some text, and emit errors as appropriate.
+ *
+ * The text is scanned once, left to right, and each reference is decoded
+ * exactly once. Text produced by decoding a reference is never scanned
+ * again, so "&amp;lt;" expands to the four characters "&lt;".
+ *
+ * @param string $text The text to expand
+ * @param integer $sourcePos The input position of $text
+ * @param bool $inAttr True if the text is within an attribute value
+ * @param string $additionalAllowedChar An unused string which the spec
+ * inexplicably spends a lot of space telling you how to derive. It
+ * suppresses errors in a place where no errors are emitted anyway.
+ * @return string The expanded text
+ */
+ protected function handleCharRefs( $text, $sourcePos, $inAttr = false,
+ $additionalAllowedChar = ''
+ ) {
+ if ( $this->ignoreCharRefs ) {
+ return $text;
+ }
+ // Do not pre-decode the text with html_entity_decode() as a shortcut:
+ // its output would be scanned again by the regex below, decoding
+ // references twice and turning "&amp;lt;" into "<".
+
+ static $re;
+ if ( $re === null ) {
+ $knownNamed = HTMLData::$namedEntityRegex;
+ $re = "~
+ ( .*? ) # 1. prefix
+ &
+ (?:
+ \# (?:
+ 0*(\d+) | # 2. decimal
+ [xX]0*([0-9A-Fa-f]+) # 3. hexadecimal
+ )
+ ( ; ) ? # 4. semicolon
+ |
+ ( \# ) # 5. bare hash
+ |
+ ($knownNamed) # 6. known named
+ (?:
+ (?<! ; ) # Assert no semicolon prior
+ ( [=a-zA-Z0-9] ) # 7. attribute suffix
+ )?
+ |
+ ( [a-zA-Z0-9]+ ; ) # 8. invalid named
+ )
+ # S = study, for efficient knownNamed
+ # A = anchor, to avoid unnecessary movement of the whole pattern on failure
+ ~xAsS";
+ }
+ $out = '';
+ $pos = 0;
+ $length = strlen( $text );
+ $matches = [];
+ $count = preg_match_all( $re, $text, $matches, PREG_SET_ORDER );
+ if ( $count === false ) {
+ $this->throwPregError();
+ }
+
+ // Because the pattern is anchored, the matches are contiguous from the
+ // start of the text, so $pos can be tracked by summing match lengths.
+ foreach ( $matches as $m ) {
+ $out .= $m[self::MC_PREFIX];
+ $errorPos = $sourcePos + $pos + strlen( $m[self::MC_PREFIX] );
+ $pos += strlen( $m[0] );
+
+ if ( isset( $m[self::MC_HASH] ) && strlen( $m[self::MC_HASH] ) ) {
+ // Bare &#
+ $this->error( 'Expected digits after &#', $errorPos );
+ $out .= '&#';
+ continue;
+ }
+
+ $knownNamed = isset( $m[self::MC_NAMED] ) ? $m[self::MC_NAMED] : '';
+ $attributeSuffix = isset( $m[self::MC_SUFFIX] ) ? $m[self::MC_SUFFIX] : '';
+
+ $haveSemicolon =
+ ( isset( $m[self::MC_SEMICOLON] ) && strlen( $m[self::MC_SEMICOLON] ) )
+ || ( strlen( $knownNamed ) && $knownNamed[ strlen( $knownNamed ) - 1 ] === ';' )
+ || ( isset( $m[self::MC_INVALID] ) && strlen( $m[self::MC_INVALID] ) );
+
+ if ( $inAttr && !$haveSemicolon ) {
+ if ( strlen( $attributeSuffix ) ) {
+ // In an attribute, a named reference without a semicolon which
+ // is followed by "=" or an alphanumeric is left as literal text
+ if ( !$this->ignoreErrors && $attributeSuffix === '=' ) {
+ $this->error( 'invalid equals sign after named character reference' );
+ }
+ $out .= '&' . $knownNamed . $attributeSuffix;
+ continue;
+ }
+ }
+
+ if ( !$this->ignoreErrors && !$haveSemicolon ) {
+ $this->error( 'character reference missing semicolon', $errorPos );
+ }
+
+ if ( isset( $m[self::MC_DECIMAL] ) && strlen( $m[self::MC_DECIMAL] ) ) {
+ // Decimal
+ // Leading zeros are not captured, so more than 7 digits is
+ // always beyond U+10FFFF (1114111)
+ if ( strlen( $m[self::MC_DECIMAL] ) > 7 ) {
+ $this->error( 'invalid numeric reference', $errorPos );
+ $out .= self::REPLACEMENT_CHAR;
+ continue;
+ }
+ $codepoint = intval( $m[self::MC_DECIMAL] );
+ } elseif ( isset( $m[self::MC_HEXDEC] ) && strlen( $m[self::MC_HEXDEC] ) ) {
+ // Hexadecimal
+ // Likewise, more than 6 hex digits is always beyond U+10FFFF
+ if ( strlen( $m[self::MC_HEXDEC] ) > 6 ) {
+ $this->error( 'invalid numeric reference', $errorPos );
+ $out .= self::REPLACEMENT_CHAR;
+ continue;
+ }
+ $codepoint = intval( $m[self::MC_HEXDEC], 16 );
+ } elseif ( $knownNamed !== '' ) {
+ $out .= HTMLData::$namedEntityTranslations[$knownNamed] . $attributeSuffix;
+ continue;
+ } elseif ( isset( $m[self::MC_INVALID] ) && strlen( $m[self::MC_INVALID] ) ) {
+ if ( !$this->ignoreErrors ) {
+ $this->error( 'invalid named reference', $errorPos );
+ }
+ $out .= '&' . $m[self::MC_INVALID];
+ continue;
+ } else {
+ $this->fatal( 'unable to identify char ref submatch' );
+ }
+
+ // Interpret $codepoint
+ if ( $codepoint === 0
+ || ( $codepoint >= 0xD800 && $codepoint <= 0xDFFF )
+ || $codepoint > 0x10FFFF
+ ) {
+ if ( !$this->ignoreErrors ) {
+ $this->error( 'invalid numeric reference', $errorPos );
+ }
+ $out .= self::REPLACEMENT_CHAR;
+ } elseif ( isset( HTMLData::$legacyNumericEntities[$codepoint] ) ) {
+ if ( !$this->ignoreErrors ) {
+ $this->error( 'invalid reference to non-ASCII control character', $errorPos );
+ }
+ $out .= HTMLData::$legacyNumericEntities[$codepoint];
+ } else {
+ if ( !$this->ignoreErrors ) {
+ $disallowedCodepoints = [
+ 0x000B => true,
+ 0xFFFE => true, 0xFFFF => true,
+ 0x1FFFE => true, 0x1FFFF => true,
+ 0x2FFFE => true, 0x2FFFF => true,
+ 0x3FFFE => true, 0x3FFFF => true,
+ 0x4FFFE => true, 0x4FFFF => true,
+ 0x5FFFE => true, 0x5FFFF => true,
+ 0x6FFFE => true, 0x6FFFF => true,
+ 0x7FFFE => true, 0x7FFFF => true,
+ 0x8FFFE => true, 0x8FFFF => true,
+ 0x9FFFE => true, 0x9FFFF => true,
+ 0xAFFFE => true, 0xAFFFF => true,
+ 0xBFFFE => true, 0xBFFFF => true,
+ 0xCFFFE => true, 0xCFFFF => true,
+ 0xDFFFE => true, 0xDFFFF => true,
+ 0xEFFFE => true, 0xEFFFF => true,
+ 0xFFFFE => true, 0xFFFFF => true,
+ 0x10FFFE => true, 0x10FFFF => true ];
+ if (
+ ( $codepoint >= 1 && $codepoint <= 8 ) ||
+ ( $codepoint >= 0x0d && $codepoint <= 0x1f ) ||
+ ( $codepoint >= 0x7f && $codepoint <= 0x9f ) ||
+ ( $codepoint >= 0xfdd0 && $codepoint <= 0xfdef ) ||
+ isset( $disallowedCodepoints[$codepoint] )
+ ) {
+ // The character is still emitted, this is only an error
+ $this->error( 'invalid numeric reference to control character',
+ $errorPos );
+ }
+ }
+
+ $out .= \UtfNormal\Utils::codepointToUtf8( $codepoint );
+ }
+ }
+ // Copy the text which follows the last character reference
+ if ( $pos < $length ) {
+ $out .= substr( $text, $pos );
+ }
+ return $out;
+ }
+
+ /**
+ * Emit a range of the input text as a character token, and emit related
+ * errors, with validity rules as per the data state.
+ *
+ * Nothing is emitted for an empty range. When no processing is needed at
+ * all, the range is passed to the listener by reference to the input text
+ * without copying it.
+ *
+ * @param integer $pos Offset within the input text
+ * @param integer $length The length of the range
+ */
+ protected function emitDataRange( $pos, $length ) {
+ if ( $length === 0 ) {
+ return;
+ }
+ if ( $this->ignoreCharRefs && $this->ignoreNulls && $this->ignoreErrors ) {
+ $this->listener->characters( $this->text, $pos, $length, $pos, $length );
+ } else {
+ if ( !$this->ignoreErrors ) {
+ // Any bare "<" in a data state text node is a parse error.
+ // Uniquely to the data state, nulls are just flagged as errors
+ // and passed through, they are not replaced. With the ignoreNulls
+ // option, nulls are not flagged either.
+ $mask = $this->ignoreNulls ? '<' : "<\0";
+ $this->handleAsciiErrors( $mask, $this->text, $pos, $length, 0 );
+ }
+
+ $text = substr( $this->text, $pos, $length );
+ $text = $this->handleCharRefs( $text, $pos );
+ $this->listener->characters( $text, 0, strlen( $text ), $pos, $length );
+ }
+ }
+
+ /**
+ * Emit a range of characters from the input text, with validity rules as
+ * per the CDATA section state.
+ *
+ * The text is passed to the listener as it is, by reference to the input
+ * string: character references are not expanded and nulls are not
+ * replaced.
+ *
+ * @param $innerPos The position after the <![CDATA[
+ * @param $innerLength The length of the string not including the terminating ]]>
+ * @param $outerPos The position of the start of the <![CDATA[
+ * @param $outerLength The length of the whole input region being emitted
+ */
+ protected function emitCdataRange( $innerPos, $innerLength, $outerPos, $outerLength ) {
+ $this->listener->characters( $this->text, $innerPos, $innerLength,
+ $outerPos, $outerLength );
+ }
+
+ /**
+ * Emit a range of characters from the input text for one of the RCDATA,
+ * RAWTEXT, script data or PLAINTEXT states. These states differ only in
+ * whether character references are expanded, so that is a parameter.
+ *
+ * Nothing is emitted for an empty range. If neither character reference
+ * expansion nor null replacement is wanted, the range is passed to the
+ * listener by reference to the input text without copying it.
+ *
+ * @param bool $ignoreCharRefs True to leave character references alone
+ * regardless of the tokenizer configuration
+ * @param integer $pos The input position
+ * @param integer $length The length of the range to be emitted
+ */
+ protected function emitRawTextRange( $ignoreCharRefs, $pos, $length ) {
+ if ( $length === 0 ) {
+ return;
+ }
+
+ // References are expanded only if both the caller and the
+ // configuration allow it
+ $expandCharRefs = !$ignoreCharRefs && !$this->ignoreCharRefs;
+
+ if ( !$expandCharRefs && $this->ignoreNulls ) {
+ $this->listener->characters( $this->text, $pos, $length, $pos, $length );
+ return;
+ }
+
+ $chunk = substr( $this->text, $pos, $length );
+ if ( $expandCharRefs ) {
+ $chunk = $this->handleCharRefs( $chunk, $pos );
+ }
+ $chunk = $this->handleNulls( $chunk, $pos );
+ $this->listener->characters( $chunk, 0, strlen( $chunk ), $pos, $length );
+ }
+
+ /**
+  * The entry point for the RCDATA and RAWTEXT states.
+  *
+  * Text is emitted up to each candidate appropriate end tag. The candidate
+  * is then handed to handleAttribsAndClose(), which either emits the end
+  * tag or reports that the candidate was literal text, in which case the
+  * search resumes after it.
+  *
+  * @param bool $ignoreCharRefs True to ignore character references regardless
+  *   of configuration, false to respect the configuration.
+  * @return integer The next state index
+  */
+ protected function textElementState( $ignoreCharRefs ) {
+     if ( $this->appropriateEndTag === null ) {
+         // No end tag can terminate this state, so everything is text.
+         $this->emitRawTextRange( $ignoreCharRefs, $this->pos, $this->length - $this->pos );
+         $this->pos = $this->length;
+         return self::STATE_EOF;
+     }
+
+     // The tag name is data, not pattern syntax: escape it so that regex
+     // metacharacters (or the "~" delimiter) in the name are matched
+     // literally and cannot break or alter the pattern.
+     $endTagName = preg_quote( $this->appropriateEndTag, '~' );
+     $re = "~</
+         {$endTagName}
+         # Assert that the end tag name state is exited appropriately,
+         # since the anything else case leads to the tag being treated as
+         # a literal
+         (?=[\t\n\f />])
+         ~ix";
+
+     do {
+         $count = preg_match( $re, $this->text, $m, PREG_OFFSET_CAPTURE, $this->pos );
+
+         if ( $count === false ) {
+             $this->throwPregError();
+         } elseif ( !$count ) {
+             // Text runs to end
+             $this->emitRawTextRange( $ignoreCharRefs, $this->pos, $this->length - $this->pos );
+             $this->pos = $this->length;
+             return self::STATE_EOF;
+         }
+         $startPos = $m[0][1];
+
+         // Emit text before tag
+         $this->emitRawTextRange( $ignoreCharRefs, $this->pos, $startPos - $this->pos );
+
+         $matchLength = strlen( $m[0][0] );
+         $this->pos = $startPos + $matchLength;
+         // STATE_RCDATA is passed for both RCDATA and RAWTEXT; it only
+         // serves as the "stay in this text state" return value below.
+         $nextState = $this->handleAttribsAndClose( self::STATE_RCDATA,
+             $this->appropriateEndTag, true, $startPos );
+     } while ( $nextState === self::STATE_RCDATA );
+     return $nextState;
+ }
+
+ /**
+  * Advance $this->pos, consuming all tag attributes found at the current
+  * position. The new position will be at the end of the tag or at the end
+  * of the input string.
+  *
+  * To improve performance of consumers which don't need to read the
+  * attribute array, interpretation of the PCRE match results is deferred:
+  * the raw matches are wrapped in a LazyAttributes object together with a
+  * callback to interpretAttribMatches().
+  *
+  * - @todo: Make deferral configurable.
+  * - @todo: Measure performance improvement, assess whether the LazyAttributes
+  *   feature is warranted.
+  *
+  * @return LazyAttributes|PlainAttributes A LazyAttributes object when at
+  *   least one attribute (or bare slash) was matched, otherwise an empty
+  *   PlainAttributes object.
+  */
+ protected function consumeAttribs() {
+     $pattern = '~
+ [\t\n\f ]*+ # Ignored whitespace before attribute name
+ (?! /> ) # Do not consume self-closing end of tag
+ (?! > ) # Do not consume normal closing bracket
+
+ (?:
+ # Before attribute name state
+ # A bare slash at this point, not part of a self-closing end tag, is
+ # consumed and ignored (with a parse error), returning to the before
+ # attribute name state.
+ ( / ) | # 1. Bare slash
+
+ # Attribute name state
+ # Note that the first character can be an equals sign, this is a parse error
+ # but still generates an attribute called "=". Thus the only way the match
+ # could fail here is due to EOF.
+
+ ( [^\t\n\f />] [^\t\n\f =/>]*+ ) # 2. Attribute name
+
+ # After attribute name state
+ [\t\n\f ]*
+
+ (?:
+ =
+ # Before attribute value state
+ # Ignore whitespace
+ [\t\n\f ]*+
+ (?:
+ # If an end-quote is omitted, the attribute will run to the end of the
+ # string, leaving no closing bracket. So the caller will detect the
+ # unexpected EOF and will not emit the tag, which is correct.
+ " ( [^"]*+ ) "? | # 3. Double-quoted attribute value
+ \' ( [^\']*+ ) \'? | # 4. Single-quoted attribute value
+ ( [^\t\n\f >]*+ ) # 5. Unquoted attribute value
+ )
+ # Or nothing: an attribute with an empty value. The attribute name was
+ # terminated by a slash, closing bracket or EOF
+ |
+ )
+ )
+ # The /A modifier causes preg_match_all to give contiguous chunks
+ ~xA';
+     $found = preg_match_all( $pattern, $this->text, $sets,
+         PREG_SET_ORDER | PREG_OFFSET_CAPTURE, $this->pos );
+     if ( $found === false ) {
+         $this->throwPregError();
+     } elseif ( !$found ) {
+         // Nothing matched at this position: the tag has no attributes.
+         $attribs = new PlainAttributes();
+     } else {
+         // Jump to the end of the last attribute matched.
+         $finalMatch = $sets[$found - 1][0];
+         $this->pos = $finalMatch[1] + strlen( $finalMatch[0] );
+         $attribs = new LazyAttributes( $sets, function ( $deferred ) {
+             return $this->interpretAttribMatches( $deferred );
+         } );
+     }
+
+     // Consume trailing whitespace. This is strictly part of the before attribute
+     // name state, but we didn't consume it in the regex since we used a principle
+     // of one match equals one attribute.
+     $trailing = strspn( $this->text, "\t\n\f ", $this->pos );
+     $this->pos += $trailing;
+     return $attribs;
+ }
+
+ /**
+  * Interpret the results of the attribute preg_match_all(). Emit errors as
+  * appropriate and return an associative array.
+  *
+  * Each element of $matches is one match set produced by consumeAttribs()
+  * with PREG_SET_ORDER | PREG_OFFSET_CAPTURE, so every capture is a
+  * [ text, offset ] pair indexed by the MA_* constants. When an attribute
+  * name occurs more than once, the first value is kept and an error is
+  * reported for each later occurrence.
+  *
+  * @param array $matches
+  * @return array Map of lower-cased attribute name to attribute value
+  */
+ protected function interpretAttribMatches( $matches ) {
+     $attributes = [];
+     foreach ( $matches as $m ) {
+         if ( strlen( $m[self::MA_SLASH][0] ) ) {
+             // A slash which was not part of "/>" produces no attribute.
+             $this->error( 'unexpected bare slash', $m[self::MA_SLASH][1] );
+             continue;
+         }
+         $name = $m[self::MA_NAME][0];
+         if ( !$this->ignoreErrors ) {
+             $this->handleAsciiErrors( "\"'<=", $name, 0, strlen( $name ), $m[self::MA_NAME][1] );
+         }
+         if ( !$this->ignoreNulls ) {
+             $name = $this->handleNulls( $m[self::MA_NAME][0], $m[self::MA_NAME][1] );
+         }
+         // Lower-case ASCII letters only. strtolower() depends on the current
+         // locale before PHP 8.2 and may then alter bytes >= 0x80, which
+         // would corrupt non-ASCII (UTF-8) attribute names.
+         $name = strtr( $name, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz' );
+         $additionalAllowedChar = '';
+         if ( isset( $m[self::MA_DQUOTED] ) && $m[self::MA_DQUOTED][1] >= 0 ) {
+             // Double-quoted attribute value
+             $additionalAllowedChar = '"';
+             $value = $m[self::MA_DQUOTED][0];
+             $pos = $m[self::MA_DQUOTED][1];
+         } elseif ( isset( $m[self::MA_SQUOTED] ) && $m[self::MA_SQUOTED][1] >= 0 ) {
+             // Single-quoted attribute value
+             $additionalAllowedChar = "'";
+             $value = $m[self::MA_SQUOTED][0];
+             $pos = $m[self::MA_SQUOTED][1];
+         } elseif ( isset( $m[self::MA_UNQUOTED] ) && $m[self::MA_UNQUOTED][1] >= 0 ) {
+             // Unquoted attribute value
+             $value = $m[self::MA_UNQUOTED][0];
+             $pos = $m[self::MA_UNQUOTED][1];
+             // Search for parse errors
+             if ( !$this->ignoreErrors ) {
+                 if ( $value === '' ) {
+                     // ">" in the before attribute value state is a parse error
+                     $this->error( 'empty unquoted attribute', $pos );
+                 }
+                 $this->handleAsciiErrors( "\"'<=`", $value, 0, strlen( $value ), $pos );
+             }
+         } else {
+             // Attribute name with no "=": the value is the empty string.
+             // $pos is deliberately left unset; it is only read below when
+             // the value is non-empty.
+             $value = '';
+         }
+         if ( $additionalAllowedChar && !$this->ignoreErrors ) {
+             // After attribute value (quoted) state
+             // Quoted attributes must be followed by a space, "/" or ">"
+             $aavPos = $m[0][1] + strlen( $m[0][0] );
+             if ( $aavPos < $this->length ) {
+                 $aavChar = $this->text[$aavPos];
+                 if ( !preg_match( '~^[\t\n\f />]~', $aavChar ) ) {
+                     $this->error( 'missing space between attributes', $aavPos );
+                 }
+             }
+         }
+         if ( $value !== '' ) {
+             if ( !$this->ignoreNulls ) {
+                 $value = $this->handleNulls( $value, $pos );
+             }
+             if ( !$this->ignoreCharRefs ) {
+                 $value = $this->handleCharRefs( $value, $pos, true, $additionalAllowedChar );
+             }
+         }
+         if ( isset( $attributes[$name] ) ) {
+             // First occurrence wins; later duplicates are dropped.
+             $this->error( "duplicate attribute", $m[0][1] );
+         } else {
+             $attributes[$name] = $value;
+         }
+     }
+     return $attributes;
+ }
+
+ /**
+  * Consume attributes, and the closing bracket which follows attributes.
+  * Emit the appropriate tag event, or in the case of broken attributes in
+  * text states, emit characters.
+  *
+  * On entry $this->pos is just after the tag name. On a successfully closed
+  * tag, $this->pos is left just after the ">".
+  *
+  * @param integer $state The current state
+  * @param string $tagName The normalized tag name
+  * @param bool $isEndTag True if this is an end tag, false if it is a start tag
+  * @param integer $startPos The input position of the start of the current tag.
+  * @return integer The next state: STATE_EOF if the input ended inside the
+  *   tag, $state if the tag was emitted as literal text, otherwise STATE_DATA.
+  */
+ protected function handleAttribsAndClose( $state, $tagName, $isEndTag, $startPos ) {
+     $attribStartPos = $this->pos;
+     $attribs = $this->consumeAttribs();
+     $pos = $this->pos;
+
+     // Literal characters are emitted on EOF or "anything else" from the
+     // end tag substates of the text states.
+     // (spec ref 8.2.4 sections 11-19, 25-27)
+     // That applies only outside the data state, and only when nothing at
+     // all was consumed after the tag name.
+     $isLiteral = $state !== self::STATE_DATA && $attribStartPos === $pos;
+
+     if ( $pos >= $this->length ) {
+         $this->error( 'unexpected end of file inside tag' );
+         if ( $isLiteral ) {
+             $restLength = $this->length - $startPos;
+             $this->listener->characters( $this->text,
+                 $startPos, $restLength, $startPos, $restLength );
+         }
+         return self::STATE_EOF;
+     }
+
+     // The attribute count is only consulted when errors are wanted.
+     if ( $isEndTag && !$this->ignoreErrors ) {
+         if ( $attribs->count() ) {
+             $this->error( 'end tag has an attribute' );
+         }
+     }
+
+     if ( $this->text[$pos] === '>' ) {
+         $selfClose = false;
+         $pos += 1;
+     } elseif ( $this->text[$pos] === '/' && $this->text[$pos + 1] === '>' ) {
+         $selfClose = true;
+         $pos += 2;
+     } elseif ( $isLiteral ) {
+         // Not a tag after all: emit "</name" as text and stay in the
+         // caller's state.
+         $literalLength = $attribStartPos - $startPos;
+         $this->listener->characters( $this->text,
+             $startPos, $literalLength, $startPos, $literalLength );
+         return $state;
+     } else {
+         $this->fatal( 'failed to find an already-matched ">"' );
+     }
+
+     $this->pos = $pos;
+     $tagLength = $pos - $startPos;
+     if ( !$isEndTag ) {
+         $this->listener->startTag( $tagName, $attribs, $selfClose, $startPos, $tagLength );
+         return self::STATE_DATA;
+     }
+
+     if ( $selfClose ) {
+         $this->error( 'self-closing end tag' );
+     }
+     $this->listener->endTag( $tagName, $startPos, $tagLength );
+     return self::STATE_DATA;
+ }
+
+ /**
+  * Process input text in the PLAINTEXT state. Everything from the current
+  * position to the end of the input is emitted as characters, with
+  * character references left unexpanded.
+  *
+  * @return integer The next state index, always STATE_EOF
+  */
+ protected function plaintextState() {
+     $this->emitRawTextRange( true, $this->pos, $this->length - $this->pos );
+     // Record that the input has been fully consumed, as the other text
+     // states do on their end-of-input paths.
+     $this->pos = $this->length;
+     return self::STATE_EOF;
+ }
+
+ /**
+  * Process input text in the script data state.
+  *
+  * A single anchored regex consumes script text, including the escaped and
+  * double escaped sub-states, up to either the end of the input or the
+  * next appropriate end tag. The text is emitted without character
+  * reference expansion, then the end tag (if any) is handed to
+  * handleAttribsAndClose().
+  *
+  * @return integer The next state index
+  */
+ protected function scriptDataState() {
+     if ( $this->appropriateEndTag === null ) {
+         // No end tag can terminate this state, so the rest of the input is
+         // script text. Emit it rather than silently discarding it.
+         $this->emitRawTextRange( true, $this->pos, $this->length - $this->pos );
+         $this->pos = $this->length;
+         return self::STATE_EOF;
+     }
+
+     // The tag name is data, not pattern syntax: escape it so that regex
+     // metacharacters (or the "~" delimiter) are matched literally.
+     $endTagName = preg_quote( $this->appropriateEndTag, '~' );
+
+     // End of input is asserted with \z rather than "$". Without the D
+     // modifier, "$" also matches just before a final newline, which would
+     // leave that newline unconsumed and unemitted.
+     $re = <<<REGEX
+~
+ (?: # Outer loop start
+     # Script data state
+     # Stop iteration if we previously matched an appropriate end tag.
+     # This is a conditional subpattern: if capture 1 previously
+     # matched, then run an empty negative lookahead, which always fails.
+     (?(1) (?!) )
+     .*?
+     (?:
+         \z |
+         (
+             </ {$endTagName}
+             # If we hit the "anything else" case in the script data
+             # end tag name state, don't exit
+             (?= [\t\n\f />] )
+         ) | # 1. Appropriate end tag
+         <!--
+         # Script data escaped dash dash state
+         # Hyphens at this point are consumed without a state transition
+         # and so are not part of a comment-end.
+         -*+
+
+         (?: # Inner loop start
+             # Script data escaped state
+             .*?
+             (?:
+                 \z |
+                 # Stop at, but do not consume, comment-close or end tag.
+                 # This causes the inner loop to exit, since restarting the
+                 # inner loop at this input position will cause the loop
+                 # body to match zero characters. Repeating a zero-character
+                 # match causes the repeat to terminate.
+                 (?= --> ) |
+                 (?= </ {$endTagName} [\t\n\f />] ) |
+                 <script [\t\n\f />]
+                 # Script data double escaped state
+                 .*?
+                 (?:
+                     \z |
+                     # Stop at, but do not consume, comment-close
+                     (?= --> ) |
+                     </script [\t\n\f />]
+                 )
+             )
+         )*
+
+
+         # Consume the comment close which exited the inner loop, if any
+         (?: --> )?
+     )
+ )*+
+ ~xsiA
+REGEX;
+
+     do {
+         $count = preg_match( $re, $this->text, $m, 0, $this->pos );
+         if ( $count === false ) {
+             $this->throwPregError();
+         } elseif ( !$count ) {
+             $this->fatal( 'unexpected regex failure: this pattern can match zero characters' );
+         }
+
+         // The overall match is the script text followed, optionally, by
+         // the appropriate end tag held in capture 1.
+         $startPos = $this->pos;
+         $matchLength = strlen( $m[0] );
+         $endTagLength = isset( $m[1] ) ? strlen( $m[1] ) : 0;
+         $textLength = $matchLength - $endTagLength;
+         $this->emitRawTextRange( true, $startPos, $textLength );
+         $this->pos = $startPos + $matchLength;
+         $tagStartPos = $startPos + $textLength;
+
+         if ( $endTagLength ) {
+             $nextState = $this->handleAttribsAndClose( self::STATE_SCRIPT_DATA,
+                 $this->appropriateEndTag, true, $tagStartPos );
+         } else {
+             // No end tag was found, so the match ran to the end of input.
+             $nextState = self::STATE_EOF;
+         }
+     } while ( $nextState === self::STATE_SCRIPT_DATA );
+     return $nextState;
+ }
+
+ /**
+  * Report a parse error to the listener, unless error reporting has been
+  * disabled with the ignoreErrors option.
+  *
+  * @param string $text The error message
+  * @param integer|null $pos The error position, or null to use the current position
+  */
+ protected function error( $text, $pos = null ) {
+     if ( $this->ignoreErrors ) {
+         return;
+     }
+     $errorPos = $pos === null ? $this->pos : $pos;
+     $this->listener->error( $text, $errorPos );
+ }
+
+ /**
+  * Abort tokenization by throwing an exception. This is used for API errors
+  * and assertion-like sanity checks, as opposed to HTML parse errors, which
+  * go through error().
+  *
+  * @param string $text The error message
+  * @throws TokenizerError always
+  */
+ protected function fatal( $text ) {
+     // Prefix the message with the class name.
+     $message = __CLASS__ . ": " . $text;
+     throw new TokenizerError( $message );
+ }
+
+ /**
+  * Translate preg_last_error() into a message and throw it as a
+  * TokenizerError. This is called when preg_match() or similar returns
+  * false.
+  *
+  * Notes for users:
+  *
+  * - PCRE internal error: may be due to JIT stack space exhaustion prior
+  *   to PHP 7, due to excessive recursion. Increase stack space.
+  *
+  * - pcre.backtrack_limit exhausted: The backtrack limit should be at least
+  *   double the input size, the defaults are way too small. Increase it in
+  *   configuration.
+  *
+  * @throws TokenizerError always
+  */
+ protected function throwPregError() {
+     $messages = [
+         PREG_NO_ERROR => "PCRE returned false but gave PREG_NO_ERROR",
+         PREG_INTERNAL_ERROR => "PCRE internal error",
+         PREG_BACKTRACK_LIMIT_ERROR => "pcre.backtrack_limit exhausted",
+         PREG_RECURSION_LIMIT_ERROR => "pcre.recursion_limit exhausted",
+     ];
+     // This constant is not defined by every PHP version, so it is only
+     // added to the table when it exists.
+     if ( defined( 'PREG_JIT_STACKLIMIT_ERROR' ) ) {
+         $messages[PREG_JIT_STACKLIMIT_ERROR] = "PCRE JIT stack space exhausted";
+     }
+
+     // Anything not listed above, including the bad UTF-8 error codes, gets
+     // the generic message.
+     $code = preg_last_error();
+     $msg = isset( $messages[$code] ) ? $messages[$code] : "PCRE unexpected error";
+
+     throw new TokenizerError( __CLASS__ . ": $msg" );
+ }
+}
+