listener = $listener;
		$this->text = $text;
		$this->pos = 0;
		$this->preprocessed = false;
		$this->length = strlen( $text );
		$this->ignoreErrors = !empty( $options['ignoreErrors'] );
		$this->ignoreCharRefs = !empty( $options['ignoreCharRefs'] );
		$this->ignoreNulls = !empty( $options['ignoreNulls'] );
		$this->skipPreprocess = !empty( $options['skipPreprocess'] );
	}

	/**
	 * Magic setter, invoked by PHP for writes to inaccessible or undeclared
	 * properties. The write is handed to PropGuard::set().
	 *
	 * NOTE(review): PropGuard is not visible here; presumably it rejects
	 * accidental dynamic properties -- confirm.
	 *
	 * @param string $name The property name
	 * @param mixed $value The value being assigned
	 */
	public function __set( $name, $value ) {
		PropGuard::set( $this, $name, $value );
	}

	/**
	 * Set the callback consulted when a CDATA section opener is matched in
	 * the data state. It is called with no arguments; when it returns a
	 * true value the section is emitted as CDATA, otherwise the input is
	 * treated as a bogus comment.
	 *
	 * @param callable|null $cb
	 */
	public function setEnableCdataCallback( $cb ) {
		$this->enableCdataCallback = $cb;
	}

	/**
	 * Run the tokenizer on the whole input stream. This is the normal entry point.
	 *
	 * @param array $options An associative array of options:
	 *   - state : One of the STATE_* constants, a state in which to start.
	 *   - appropriateEndTag : The "appropriate end tag", which needs to be set
	 *     if entering one of the raw text states.
	 *   - fragmentNamespace : The fragment namespace
	 *   - fragmentName : The fragment tag name
	 */
	public function execute( $options = [] ) {
		$this->state = isset( $options['state'] ) ? $options['state'] : self::STATE_START;

		if ( isset( $options['fragmentNamespace'] ) ) {
			// fragmentName is optional from the caller's point of view, so do
			// not read it blindly: a missing key used to raise an undefined
			// index notice (a warning on PHP 8) before null was passed on.
			$fragmentName = isset( $options['fragmentName'] ) ? $options['fragmentName'] : null;
			// This may override the state chosen above for raw text contexts.
			$this->setFragmentContext( $options['fragmentNamespace'], $fragmentName );
		} else {
			$this->fragmentNamespace = null;
			$this->fragmentName = null;
		}

		$this->appropriateEndTag = isset( $options['appropriateEndTag'] )
			? $options['appropriateEndTag']
			: null;

		// Preprocessing may already report errors, so it runs before the
		// listener is told that the document has started (original order).
		$this->preprocess();
		$this->listener->startDocument( $this, $this->fragmentNamespace, $this->fragmentName );
		$this->executeInternal( true );
	}

	/**
	 * Get the preprocessed input text. Source offsets in event parameters are
	 * relative to this string. If skipPreprocess was specified, this will be
	 * the same as the input string.
	 *
	 * @return string
	 */
	public function getPreprocessedText() {
		$this->preprocess();
		return $this->text;
	}

	/**
	 * Change the state of the tokenizer during parsing. This is for use by the
	 * tree builder to switch the tokenizer into one of the raw text states.
*
	 * @param integer $state One of the STATE_* constants
	 * @param string $appropriateEndTag The appropriate end tag
	 */
	public function switchState( $state, $appropriateEndTag ) {
		$this->appropriateEndTag = $appropriateEndTag;
		$this->state = $state;
	}

	/**
	 * Record the context element for fragment parsing and select the initial
	 * tokenizer state that goes with it.
	 *
	 * The state is only changed when the namespace is empty or the HTML
	 * namespace and the tag name is one of the elements handled in the
	 * switch below ("noscript" additionally requires the scripting flag).
	 * For any other context element the current state is left alone.
	 *
	 * @param string $namespace The namespace of the context element
	 * @param string $tagName The name of the context element
	 */
	public function setFragmentContext( $namespace, $tagName ) {
		$this->fragmentNamespace = $namespace;
		$this->fragmentName = $tagName;

		$isHtmlContext = strval( $namespace ) === '' || $namespace === HTMLData::NS_HTML;
		if ( !$isHtmlContext ) {
			// Foreign context elements never switch the tokenizer state
			return;
		}

		// Stays null when the context element implies no special state
		$initialState = null;
		switch ( $tagName ) {
			case 'title':
			case 'textarea':
				$initialState = Tokenizer::STATE_RCDATA;
				break;

			case 'style':
			case 'xmp':
			case 'iframe':
			case 'noembed':
			case 'noframes':
				$initialState = Tokenizer::STATE_RAWTEXT;
				break;

			case 'script':
				$initialState = Tokenizer::STATE_SCRIPT_DATA;
				break;

			case 'noscript':
				if ( $this->scriptingFlag ) {
					$initialState = Tokenizer::STATE_RAWTEXT;
				}
				break;

			case 'plaintext':
				$initialState = Tokenizer::STATE_PLAINTEXT;
				break;
		}

		if ( $initialState !== null ) {
			$this->state = $initialState;
		}
	}

	/**
	 * Prepare the tokenizer for incremental use through step(). The state is
	 * reset to the start state, the input is preprocessed and the listener
	 * is told that a (non-fragment) document begins.
	 *
	 * This must be called once only, before the first call to step().
	 */
	public function beginStepping() {
		$this->state = self::STATE_START;
		$this->preprocess();
		$this->listener->startDocument( $this, null, null );
	}

	/**
	 * Tokenize a minimum amount of text from the input stream, and emit the
	 * resulting events. A fatal error is raised when no state has been set
	 * yet, i.e. beginStepping() was not called first.
	 *
	 * @return bool True if the input continues and step() should be called
	 *   again, false on EOF
	 */
	public function step() {
		$started = $this->state !== null;
		if ( !$started ) {
			$this->fatal( "beginStepping() must be called before step()" );
		}
		return $this->executeInternal( false );
	}

	/**
	 * Preprocess the input text, if it hasn't been done already.
*
	 * Line endings are normalized to "\n" and, unless errors are ignored, a
	 * parse error is reported for every code point matched by the disallowed
	 * character class below. Nothing is done when skipPreprocess was
	 * requested or when a previous call has already run.
	 */
	protected function preprocess() {
		if ( $this->preprocessed || $this->skipPreprocess ) {
			return;
		}

		// Normalize line endings
		$this->text = strtr( $this->text, [
			"\r\n" => "\n",
			"\r" => "\n" ] );
		$this->length = strlen( $this->text );

		// The text is now in its final form. Record that before scanning for
		// errors: the flag was previously tested but never set, so a second
		// call (for example execute() followed by getPreprocessedText())
		// rescanned the input and reported every error a second time.
		$this->preprocessed = true;

		// Raise parse errors for any control characters
		if ( !$this->ignoreErrors ) {
			$pos = 0;
			$re = '/[' .
				'\x{0001}-\x{0008}' .
				'\x{000E}-\x{001F}' .
				'\x{007F}-\x{009F}' .
				'\x{FDD0}-\x{FDEF}' .
				'\x{000B}' .
				'\x{FFFE}\x{FFFF}' .
				'\x{1FFFE}\x{1FFFF}' .
				'\x{2FFFE}\x{2FFFF}' .
				'\x{3FFFE}\x{3FFFF}' .
				'\x{4FFFE}\x{4FFFF}' .
				'\x{5FFFE}\x{5FFFF}' .
				'\x{6FFFE}\x{6FFFF}' .
				'\x{7FFFE}\x{7FFFF}' .
				'\x{8FFFE}\x{8FFFF}' .
				'\x{9FFFE}\x{9FFFF}' .
				'\x{AFFFE}\x{AFFFF}' .
				'\x{BFFFE}\x{BFFFF}' .
				'\x{CFFFE}\x{CFFFF}' .
				'\x{DFFFE}\x{DFFFF}' .
				'\x{EFFFE}\x{EFFFF}' .
				'\x{FFFFE}\x{FFFFF}' .
				'\x{10FFFE}\x{10FFFF}]/u';
			while ( $pos < $this->length ) {
				$count = preg_match( $re, $this->text, $m, PREG_OFFSET_CAPTURE, $pos );
				if ( $count === false ) {
					// With the /u modifier a false return is reported here
					// as malformed input
					$this->fatal( "Invalid UTF-8 sequence given to Tokenizer" );
				} elseif ( !$count ) {
					break;
				}
				// Report at the byte offset of the match, then resume the
				// search immediately after the matched character
				$pos = $m[0][1];
				$this->error( "disallowed control character", $pos );
				$pos += strlen( $m[0][0] );
			}
		}
	}

	/**
	 * The main state machine, the common implementation of step() and execute().
	 * @param bool $loop Set to true to loop until finished, false to step once.
* @return bool True if the input continues, false on EOF
	 */
	protected function executeInternal( $loop ) {
		do {
			// Each state handler consumes some input and returns the state to
			// continue in; that result becomes the current state.
			switch ( $this->state ) {
				case self::STATE_DATA:
					$this->state = $this->dataState( $loop );
					break;

				case self::STATE_RCDATA:
					$this->state = $this->textElementState( false );
					break;

				case self::STATE_RAWTEXT:
					$this->state = $this->textElementState( true );
					break;

				case self::STATE_SCRIPT_DATA:
					$this->state = $this->scriptDataState();
					break;

				case self::STATE_PLAINTEXT:
					$this->state = $this->plaintextState();
					break;

				case self::STATE_START:
					// The start state consumes nothing, it only hands over
					// to the data state
					$this->state = self::STATE_DATA;
					break;

				case self::STATE_EOF:
					// Terminal state: notify the listener and report that
					// the input is exhausted, whether looping or stepping
					$this->listener->endDocument( $this->length );
					return false;

				default:
					$this->fatal( 'invalid state' );
			}
		} while ( $loop );

		return true;
	}

	/**
	 * Consume input text starting from the "data state".
	 *
	 * @param bool $loop True to loop while still in the data state, false to
	 *   process a single less-than sign.
	 * @return integer The next state index
	 */
	protected function dataState( $loop ) {
		$re = "~ < (?: ( /? ) # 1. End tag open ( # 2. Tag name # Try to match the ASCII letter required for the start of a start # or end tag. If this fails, a slash matched above can be # backtracked and then fed into the bogus comment alternative below. [a-zA-Z] # Then capture the rest of the tag name [^\t\n\f />]* ) | # Comment !-- ( # 3. Comment match detector > | -> | # Invalid short close ( # 4. Comment contents (?: (?! --> ) (?! --!> ) (?! --! \\z ) (?! -- \\z ) (?! - \\z ) . )*+ ) ( # 5. Comment close --> | # Normal close --!> | # Comment end bang --! | # EOF in comment end bang state -- | # EOF in comment end state - | # EOF in comment end dash state # EOF in comment state ) ) | ( (?i) # 6. Doctype ! DOCTYPE # There must be at least one whitespace character to suppress # a parse error, but if there isn't one, this is still a # DOCTYPE. 
There is no way for the DOCTYPE string to end up # as a character node, the DOCTYPE subexpression must always # wholly match if we matched up to this point. ( [\t\n\f ]*+ ) # 7. Required whitespace ( [^\t\n\f >]*+ ) # 8. DOCTYPE name [\t\n\f ]*+ (?: # After DOCTYPE name state PUBLIC ( [\t\n\f ]* ) # 9. Required whitespace (?: \" ( [^\">]* ) \"? | # 10. Double-quoted identifier ' ( [^'>]* ) '? | # 11. Single-quoted identifier # Non-match: bogus ) (?: # After DOCTYPE public identifier state # Assert quoted identifier before here (?<= \" | ' ) ( [\t\n\f ]* ) # 12. Required whitespace (?: \" ( [^\">]* ) \"? | # 13. Double-quoted identifier ' ( [^'>]* ) '? | # 14. Single-quoted identifier # Non-match: no system ID ) )? | SYSTEM ( [\t\n\f ]* ) # 15. Required whitespace (?: \" ( [^\">]* ) \"? | # 16. Double-quoted identifier ' ( [^'>]* ) '? | # 17. Single-quoted identifier # Non-match: bogus ) | # No keyword is OK ) [\t\n\f ]* ( [^>]*+ ) # 18. Bogus DOCTYPE ( >? ) # 19. End of DOCTYPE ) | ( ! \[CDATA\[ ) | # 20. CDATA section ( [!?/] [^>]*+ ) >? # 21. Bogus comment # Anything else: parse error and emit literal less-than sign. # We will let the match fail at this position and later check # for less-than signs in the resulting text node. ) ~xs"; $nextState = self::STATE_DATA; do { $count = preg_match( $re, $this->text, $m, PREG_OFFSET_CAPTURE, $this->pos ); if ( $count === false ) { $this->throwPregError(); } elseif ( !$count ) { // Text runs to end $this->emitDataRange( $this->pos, $this->length - $this->pos ); $this->pos = $this->length; $nextState = self::STATE_EOF; break; } $startPos = $m[0][1]; $tagName = isset( $m[self::MD_TAG_NAME] ) ? 
$m[self::MD_TAG_NAME][0] : ''; $this->emitDataRange( $this->pos, $startPos - $this->pos ); $this->pos = $startPos; $nextPos = $m[0][1] + strlen( $m[0][0] ); if ( isset( $m[self::MD_CDATA] ) && $m[self::MD_CDATA][1] >= 0 ) { if ( $this->enableCdataCallback ) { $isCdata = call_user_func( $this->enableCdataCallback ); } else { $isCdata = false; } if ( !$isCdata ) { $m[self::MD_BOGUS_COMMENT] = $m[self::MD_CDATA]; } } else { $isCdata = false; } if ( strlen( $tagName ) ) { // Tag $isEndTag = (bool)strlen( $m[self::MD_END_TAG_OPEN][0] ); if ( !$this->ignoreNulls ) { $tagName = $this->handleNulls( $tagName, $m[self::MD_TAG_NAME][1] ); } $tagName = strtolower( $tagName ); $this->pos = $nextPos; $nextState = $this->handleAttribsAndClose( self::STATE_DATA, $tagName, $isEndTag, $startPos ); $nextPos = $this->pos; if ( $nextState === self::STATE_EOF ) { break; } // Respect any state switch imposed by the parser $nextState = $this->state; } elseif ( isset( $m[self::MD_COMMENT] ) && $m[self::MD_COMMENT][1] >= 0 ) { // Comment $this->interpretCommentMatches( $m ); } elseif ( isset( $m[self::MD_DOCTYPE] ) && $m[self::MD_DOCTYPE][1] >= 0 ) { // DOCTYPE $this->interpretDoctypeMatches( $m ); } elseif ( isset( $m[self::MD_CDATA] ) && $m[self::MD_CDATA][1] >= 0 ) { // CDATA if ( $this->enableCdataCallback && call_user_func( $this->enableCdataCallback ) ) { $this->pos += strlen( $m[self::MD_CDATA][0] ) + 1; $endPos = strpos( $this->text, ']]>', $this->pos ); if ( $endPos === false ) { $this->emitCdataRange( $this->pos, $this->length - $this->pos, $startPos, $this->length - $startPos ); $this->pos = $this->length; $nextState = self::STATE_EOF; break; } else { $outerEndPos = $endPos + strlen( ']]>' ); $this->emitCdataRange( $this->pos, $endPos - $this->pos, $startPos, $outerEndPos - $startPos ); $nextPos = $outerEndPos; } } else { // Bogus comment $this->error( "unexpected CDATA interpreted as bogus comment" ); $endPos = strpos( $this->text, '>', $this->pos ); $bogusPos = $this->pos + 2; 
if ( $endPos === false ) { $nextPos = $this->length; $contents = substr( $this->text, $bogusPos ); } else { $nextPos = $endPos + 1; $contents = substr( $this->text, $bogusPos, $endPos - $bogusPos ); } $contents = $this->handleNulls( $contents, $bogusPos ); $this->listener->comment( $contents, $this->pos, $endPos - $this->pos ); } } elseif ( isset ( $m[self::MD_BOGUS_COMMENT] ) && $m[self::MD_BOGUS_COMMENT][1] >= 0 ) { // Bogus comment $contents = $m[self::MD_BOGUS_COMMENT][0]; $bogusPos = $m[self::MD_BOGUS_COMMENT][1]; if ( $m[0][0] === '' ) { $this->error( "empty end tag" ); // No token emitted } elseif ( $m[0][0] === 'error( 'EOF in end tag' ); $this->listener->characters( 'error( "unexpected <{$contents[0]} interpreted as bogus comment" ); if ( $contents[0] !== '?' ) { // For starting types other than handleNulls( $contents, $bogusPos ); $this->listener->comment( $contents, $startPos, $nextPos - $startPos ); } } else { $this->fatal( 'unexpected data state match' ); } $this->pos = $nextPos; } while ( $loop && $nextState === self::STATE_DATA ); return $nextState; } /** * Interpret the data state match results for a detected comment, and emit * events as appropriate. * * @param array $m The match array */ protected function interpretCommentMatches( $m ) { $outerStart = $m[0][1]; $outerLength = strlen( $m[0][0] ); $innerStart = $outerStart + strlen( '' || $m[0][0] === '' ) { // These are special cases in the comment start state $this->error( 'not enough dashes in empty comment', $outerStart ); $this->listener->comment( '', $outerStart, $outerLength ); return; } if ( !$this->ignoreNulls ) { $contents = $this->handleNulls( $contents, $innerStart ); } $close = $m[self::MD_COMMENT_END][0]; $closePos = $m[self::MD_COMMENT_END][1]; if ( !$this->ignoreErrors ) { if ( $close === '--!>' ) { $this->error( 'invalid comment end bang', $closePos ); } elseif ( $close === '-' || $close === '--' || $close === '--!' 
) { $this->error( 'EOF part way through comment close', $closePos ); } elseif ( $close === '' ) { $this->error( 'EOF in comment', $closePos ); } $dashSearchLength = $innerLength; while ( $dashSearchLength > 0 && $contents[$dashSearchLength - 1] === '-' ) { $this->error( 'invalid extra dash at comment end', $innerStart + $dashSearchLength - 1 ); $dashSearchLength--; } $offset = 0; while ( $offset !== false && $offset < $dashSearchLength ) { $offset = strpos( $contents, '--', $offset ); if ( $offset !== false ) { $this->error( 'bare "--" found in comment', $innerStart + $offset ); $offset += 2; } } } $this->listener->comment( $contents, $outerStart, $outerLength ); } /** * Interpret the data state match results for a detected DOCTYPE token, * and emit events as appropriate. * * @param array $m The match array */ protected function interpretDoctypeMatches( $m ) { $igerr = $this->ignoreErrors; $name = null; $public = null; $system = null; $quirks = false; // Missing ">" can only be caused by EOF $eof = !strlen( $m[self::MD_DT_END][0] ); if ( strlen( $m[self::MD_DT_BOGUS][0] ) ) { // Bogus DOCTYPE state if ( !$igerr ) { $this->error( 'invalid DOCTYPE contents', $m[self::MD_DT_BOGUS][1] ); } // Set quirks mode unless there was a properly quoted SYSTEM identifier $haveDq = isset( $m[self::MD_DT_SYSTEM_DQ] ) && $m[self::MD_DT_SYSTEM_DQ][1] >= 0; $haveSq = isset( $m[self::MD_DT_SYSTEM_SQ] ) && $m[self::MD_DT_SYSTEM_SQ][1] >= 0; if ( !$haveDq && !$haveSq ) { $quirks = true; } // EOF in the bogus state does not set quirks mode (but it is a parse error) if ( $eof && !$igerr ) { $this->error( 'unterminated DOCTYPE' ); } } elseif ( $eof ) { if ( !$igerr ) { $this->error( 'unterminated DOCTYPE' ); } $quirks = true; } if ( !$igerr && !$eof && !strlen( $m[self::MD_DT_NAME_WS][0] ) ) { $this->error( 'missing whitespace', $m[self::MD_DT_NAME_WS][1] ); } if ( strlen( $m[self::MD_DT_NAME][0] ) ) { // DOCTYPE name $name = $this->handleNulls( strtolower( $m[self::MD_DT_NAME][0] ), 
$m[self::MD_DT_NAME][1] ); } else { if ( !$eof && !$igerr ) { $this->error( 'missing DOCTYPE name', $m[self::MD_DOCTYPE][1] + strlen( '!DOCTYPE' ) ); } $quirks = true; } if ( isset( $m[self::MD_DT_PUBLIC_WS] ) && $m[self::MD_DT_PUBLIC_WS][1] >= 0 ) { // PUBLIC keyword found $public = $this->interpretDoctypeQuoted( $m, self::MD_DT_PUBLIC_DQ, self::MD_DT_PUBLIC_SQ, $quirks ); if ( $public === null ) { $quirks = true; if ( !$eof && !$igerr ) { $this->error( 'missing public identifier', $m[self::MD_DT_PUBLIC_WS][1] ); } } elseif ( !$igerr && !$eof && !strlen( $m[self::MD_DT_PUBLIC_WS][0] ) ) { $this->error( 'missing whitespace', $m[self::MD_DT_PUBLIC_WS][1] ); } // Check for a system ID after the public ID $haveDq = isset( $m[self::MD_DT_PUBSYS_DQ] ) && $m[self::MD_DT_PUBSYS_DQ][1] >= 0; $haveSq = isset( $m[self::MD_DT_PUBSYS_SQ] ) && $m[self::MD_DT_PUBSYS_SQ][1] >= 0; if ( $haveDq || $haveSq ) { if ( !$igerr && !strlen( $m[self::MD_DT_PUBSYS_WS][0] ) ) { $this->error( 'missing whitespace', $m[self::MD_DT_PUBSYS_WS][1] ); } $system = $this->interpretDoctypeQuoted( $m, self::MD_DT_PUBSYS_DQ, self::MD_DT_PUBSYS_SQ, $quirks ); } } elseif ( isset( $m[self::MD_DT_SYSTEM_WS] ) && $m[self::MD_DT_SYSTEM_WS][1] >= 0 ) { // SYSTEM keyword found $system = $this->interpretDoctypeQuoted( $m, self::MD_DT_SYSTEM_DQ, self::MD_DT_SYSTEM_SQ, $quirks ); if ( $system === null ) { $quirks = true; $this->error( 'missing system identifier', $m[self::MD_DT_SYSTEM_WS][1] ); } elseif ( !$igerr && !strlen( $m[self::MD_DT_SYSTEM_WS][0] ) ) { $this->error( 'missing whitespace', $m[self::MD_DT_SYSTEM_WS][1] ); } } $this->listener->doctype( $name, $public, $system, $quirks, $m[0][1], strlen( $m[0][0] ) ); } /** * DOCTYPE helper which interprets a quoted string (or lack thereof) * @return string|null The quoted value, with nulls replaced. 
*
	 * @param array $m The match array, in offset-capture form (each entry is
	 *   a [ text, byte offset ] pair)
	 * @param integer $dq Index of the double-quoted capture group
	 * @param integer $sq Index of the single-quoted capture group
	 * @param bool &$quirks Set to true when the identifier runs up to the end
	 *   of the input or is directly followed by ">"
	 */
	protected function interpretDoctypeQuoted( $m, $dq, $sq, &$quirks ) {
		// Take the double-quoted capture if it participated in the match,
		// otherwise fall back to the single-quoted one
		$capture = null;
		foreach ( [ $dq, $sq ] as $index ) {
			if ( isset( $m[$index] ) && $m[$index][1] >= 0 ) {
				$capture = $m[$index];
				break;
			}
		}
		if ( $capture === null ) {
			return null;
		}

		list( $value, $startPos ) = $capture;
		$endPos = $startPos + strlen( $value );
		if ( $endPos >= $this->length ) {
			// This is a parse error, but we already emitted a generic EOF error
			$quirks = true;
		} elseif ( $this->text[$endPos] === '>' ) {
			$this->error( 'DOCTYPE identifier terminated by ">"', $endPos );
			$quirks = true;
		}

		return $this->handleNulls( $value, $startPos );
	}

	/**
	 * Replace every U+0000 in a piece of text with the replacement character,
	 * reporting one parse error per null unless errors are ignored. When the
	 * ignoreNulls option is set the text is returned untouched.
	 *
	 * @param string $text The text to be converted
	 * @param integer $sourcePos The input byte offset from which $text was
	 *   extracted, for error position reporting.
	 * @return string The converted text
	 */
	protected function handleNulls( $text, $sourcePos ) {
		if ( $this->ignoreNulls ) {
			return $text;
		}

		if ( !$this->ignoreErrors ) {
			$lastIndex = strlen( $text ) - 1;
			$nullPos = strpos( $text, "\0" );
			while ( $nullPos !== false ) {
				$this->error( "replaced null character", $sourcePos + $nullPos );
				if ( $nullPos >= $lastIndex ) {
					// The null was the final byte, nothing left to search
					break;
				}
				$nullPos = strpos( $text, "\0", $nullPos + 1 );
			}
		}

		return str_replace( "\0", self::REPLACEMENT_CHAR, $text );
	}

	/**
	 * Generic helper for points in the spec which say that an error should
	 * be issued when certain ASCII characters are seen, with no other action
	 * taken.
	 *
	 * @param string $mask Mask for strcspn
	 * @param string $text The input text
	 * @param integer $offset The start of the range within $text to search
	 * @param integer $length The length of the range within $text to search
	 * @param integer $sourcePos The offset within the input text corresponding
	 *   to $text, for error position reporting.
*/
	protected function handleAsciiErrors( $mask, $text, $offset, $length, $sourcePos ) {
		// Walk the range with a cursor instead of shrinking $length
		$end = $offset + $length;
		$cursor = $offset;
		while ( $cursor < $end ) {
			// Skip the run of bytes that are not in the mask
			$cursor += strcspn( $text, $mask, $cursor, $end - $cursor );
			if ( $cursor >= $end ) {
				break;
			}

			$char = $text[$cursor];
			$codepoint = ord( $char );
			$isPrintableAscii = $codepoint >= 0x20 && $codepoint < 0x7f;
			if ( $isPrintableAscii ) {
				$this->error( "unexpected \"$char\"", $cursor + $sourcePos );
			} else {
				// Non-printable bytes are reported by code point
				$this->error( sprintf( 'unexpected U+00%02X', $codepoint ),
					$cursor + $sourcePos );
			}
			$cursor++;
		}
	}

	/**
	 * Expand character references in some text, and emit errors as appropriate.
	 * @param string $text The text to expand
	 * @param integer $sourcePos The input position of $text
	 * @param bool $inAttr True if the text is within an attribute value
	 * @param string $additionalAllowedChar An unused string which the spec
	 *   inexplicably spends a lot of space telling you how to derive. It
	 *   suppresses errors in a place where no errors are emitted anyway.
	 * @return string The expanded text
	 */
	protected function handleCharRefs( $text, $sourcePos, $inAttr = false,
		$additionalAllowedChar = ''
	) {
		if ( $this->ignoreCharRefs ) {
			return $text;
		}
		// Efficiently translate a few common cases.
		// Although this doesn't translate any error cases, running this
		// function in !$ignoreError mode would cause the string offsets to
		// be wrong when we come to the preg_match_all.
		//
		// In HHVM this is way too broken to be usable. (@todo bug/PR)
		if ( !defined( 'HHVM_VERSION' ) && $this->ignoreErrors ) {
			$text = html_entity_decode( $text, ENT_HTML5 | ENT_QUOTES );
		}

		static $re;
		if ( $re === null ) {
			$knownNamed = HTMLData::$namedEntityRegex;
			$re = "~ ( .*? ) # 1. prefix & (?: \# (?: 0*(\d+) | # 2. decimal [xX]0*([0-9A-Fa-f]+) # 3. hexadecimal ) ( ; ) ? # 4. semicolon | ( \# ) # 5. bare hash | ($knownNamed) # 6. 
known named (?: (?throwPregError(); } foreach ( $matches as $m ) { $out .= $m[self::MC_PREFIX]; $errorPos = $sourcePos + $pos + strlen( $m[self::MC_PREFIX] ); $lastPos = $pos; $pos += strlen( $m[0] ); if ( isset( $m[self::MC_HASH] ) && strlen( $m[self::MC_HASH] ) ) { // Bare &# $this->error( 'Expected digits after &#', $errorPos ); $out .= '&#'; continue; } $knownNamed = isset( $m[self::MC_NAMED] ) ? $m[self::MC_NAMED] : ''; $attributeSuffix = isset( $m[self::MC_SUFFIX] ) ? $m[self::MC_SUFFIX] : ''; $haveSemicolon = ( isset( $m[self::MC_SEMICOLON] ) && strlen( $m[self::MC_SEMICOLON] ) ) || ( strlen( $knownNamed ) && $knownNamed[ strlen( $knownNamed ) - 1 ] === ';' ) || ( isset( $m[self::MC_INVALID] ) && strlen( $m[self::MC_INVALID] ) ); if ( $inAttr && !$haveSemicolon ) { if ( strlen( $attributeSuffix ) ) { if ( !$this->ignoreErrors && $attributeSuffix === '=' ) { $this->error( 'invalid equals sign after named character reference' ); } $out .= '&' . $knownNamed . $attributeSuffix; continue; } } if ( !$this->ignoreErrors && !$haveSemicolon ) { $this->error( 'character reference missing semicolon', $errorPos ); } if ( isset( $m[self::MC_DECIMAL] ) && strlen( $m[self::MC_DECIMAL] ) ) { // Decimal if ( strlen( $m[self::MC_DECIMAL] ) > 7 ) { $this->error( 'invalid numeric reference', $errorPos ); $out .= self::REPLACEMENT_CHAR; continue; } $codepoint = intval( $m[self::MC_DECIMAL] ); } elseif ( isset( $m[self::MC_HEXDEC] ) && strlen( $m[self::MC_HEXDEC] ) ) { // Hexadecimal if ( strlen( $m[self::MC_HEXDEC] ) > 6 ) { $this->error( 'invalid numeric reference', $errorPos ); $out .= self::REPLACEMENT_CHAR; continue; } $codepoint = intval( $m[self::MC_HEXDEC], 16 ); } elseif ( $knownNamed !== '' ) { $out .= HTMLData::$namedEntityTranslations[$knownNamed] . $attributeSuffix; continue; } elseif ( isset( $m[self::MC_INVALID] ) && strlen( $m[self::MC_INVALID] ) ) { if ( !$this->ignoreErrors ) { $this->error( 'invalid named reference', $errorPos ); } $out .= '&' . 
$m[self::MC_INVALID]; continue; } else { $this->fatal( 'unable to identify char ref submatch' ); } // Interpret $codepoint if ( $codepoint === 0 || ( $codepoint >= 0xD800 && $codepoint <= 0xDFFF ) || $codepoint > 0x10FFFF ) { if ( !$this->ignoreErrors ) { $this->error( 'invalid numeric reference', $errorPos ); } $out .= self::REPLACEMENT_CHAR; } elseif ( isset( HTMLData::$legacyNumericEntities[$codepoint] ) ) { if ( !$this->ignoreErrors ) { $this->error( 'invalid reference to non-ASCII control character', $errorPos ); } $out .= HTMLData::$legacyNumericEntities[$codepoint]; } else { if ( !$this->ignoreErrors ) { $disallowedCodepoints = [ 0x000B => true, 0xFFFE => true, 0xFFFF => true, 0x1FFFE => true, 0x1FFFF => true, 0x2FFFE => true, 0x2FFFF => true, 0x3FFFE => true, 0x3FFFF => true, 0x4FFFE => true, 0x4FFFF => true, 0x5FFFE => true, 0x5FFFF => true, 0x6FFFE => true, 0x6FFFF => true, 0x7FFFE => true, 0x7FFFF => true, 0x8FFFE => true, 0x8FFFF => true, 0x9FFFE => true, 0x9FFFF => true, 0xAFFFE => true, 0xAFFFF => true, 0xBFFFE => true, 0xBFFFF => true, 0xCFFFE => true, 0xCFFFF => true, 0xDFFFE => true, 0xDFFFF => true, 0xEFFFE => true, 0xEFFFF => true, 0xFFFFE => true, 0xFFFFF => true, 0x10FFFE => true, 0x10FFFF => true ]; if ( ( $codepoint >= 1 && $codepoint <= 8 ) || ( $codepoint >= 0x0d && $codepoint <= 0x1f ) || ( $codepoint >= 0x7f && $codepoint <= 0x9f ) || ( $codepoint >= 0xfdd0 && $codepoint <= 0xfdef ) || isset( $disallowedCodepoints[$codepoint] ) ) { $this->error( 'invalid numeric reference to control character', $errorPos ); } } $out .= \UtfNormal\Utils::codepointToUtf8( $codepoint ); } } if ( $pos < $length ) { $out .= substr( $text, $pos ); } return $out; } /** * Emit a range of the input text as a character token, and emit related * errors, with validity rules as per the data state. 
* * @param integer $pos Offset within the input text * @param integer $length The length of the range */ protected function emitDataRange( $pos, $length ) { if ( $length === 0 ) { return; } if ( $this->ignoreCharRefs && $this->ignoreNulls && $this->ignoreErrors ) { $this->listener->characters( $this->text, $pos, $length, $pos, $length ); } else { if ( !$this->ignoreErrors ) { // Any bare "<" in a data state text node is a parse error. // Uniquely to the data state, nulls are just flagged as errors // and passed through, they are not replaced. $this->handleAsciiErrors( "<\0", $this->text, $pos, $length, 0 ); } $text = substr( $this->text, $pos, $length ); $text = $this->handleCharRefs( $text, $pos ); $this->listener->characters( $text, 0, strlen( $text ), $pos, $length ); } } /** * Emit a range of characters from the input text, with validity rules as * per the CDATA section state. * * @param $innerPos The position after the * @param $outerPos The position of the start of the listener->characters( $this->text, $innerPos, $innerLength, $outerPos, $outerLength ); } /** * Emit a range of characters from the input text, either from RCDATA, * RAWTEXT, script data or PLAINTEXT. The only difference between these * states is whether or not character references are expanded, so we take * that as a parameter. 
* * @param bool $ignoreCharRefs * @param integer $pos The input position * @param integer $length The length of the range to be emitted */ protected function emitRawTextRange( $ignoreCharRefs, $pos, $length ) { if ( $length === 0 ) { return; } $ignoreCharRefs = $ignoreCharRefs || $this->ignoreCharRefs; if ( $ignoreCharRefs && $this->ignoreNulls ) { $this->listener->characters( $this->text, $pos, $length, $pos, $length ); } else { $text = substr( $this->text, $pos, $length ); if ( !$ignoreCharRefs ) { $text = $this->handleCharRefs( $text, $pos ); } $text = $this->handleNulls( $text, $pos ); $this->listener->characters( $text, 0, strlen( $text ), $pos, $length ); } } /** * The entry point for the RCDATA and RAWTEXT states. * @param bool $ignoreCharRefs True to ignore character references regardless * of configuration, false to respect the configuration. * @return integer The next state index */ protected function textElementState( $ignoreCharRefs ) { if ( $this->appropriateEndTag === null ) { $this->emitRawTextRange( $ignoreCharRefs, $this->pos, $this->length - $this->pos ); $this->pos = $this->length; return self::STATE_EOF; } $re = "~appropriateEndTag} # Assert that the end tag name state is exited appropriately, # since the anything else case leads to the tag being treated as # a literal (?=[\t\n\f />]) ~ix"; do { $count = preg_match( $re, $this->text, $m, PREG_OFFSET_CAPTURE, $this->pos ); if ( $count === false ) { $this->throwPregError(); } elseif ( !$count ) { // Text runs to end $this->emitRawTextRange( $ignoreCharRefs, $this->pos, $this->length - $this->pos ); $this->pos = $this->length; return self::STATE_EOF; } $startPos = $m[0][1]; // Emit text before tag $this->emitRawTextRange( $ignoreCharRefs, $this->pos, $startPos - $this->pos ); $matchLength = strlen( $m[0][0] ); $this->pos = $startPos + $matchLength; $nextState = $this->handleAttribsAndClose( self::STATE_RCDATA, $this->appropriateEndTag, true, $startPos ); } while ( $nextState === self::STATE_RCDATA 
); return $nextState; } /** * Advance $this->pos, consuming all tag attributes found at the current * position. The new position will be at the end of the tag or at the end * of the input string. * * To improve performance of consumers which don't need to read the * attribute array, interpretation of the PCRE match results is deferred. * * - @todo: Make deferral configurable. * - @todo: Measure performance improvement, assess whether the LazyAttributes * feature is warranted. * * @return array Attributes */ protected function consumeAttribs() { $re = '~ [\t\n\f ]*+ # Ignored whitespace before attribute name (?! /> ) # Do not consume self-closing end of tag (?! > ) # Do not consume normal closing bracket (?: # Before attribute name state # A bare slash at this point, not part of a self-closing end tag, is # consumed and ignored (with a parse error), returning to the before # attribute name state. ( / ) | # 1. Bare slash # Attribute name state # Note that the first character can be an equals sign, this is a parse error # but still generates an attribute called "=". Thus the only way the match # could fail here is due to EOF. ( [^\t\n\f />] [^\t\n\f =/>]*+ ) # 2. Attribute name # After attribute name state [\t\n\f ]* (?: = # Before attribute value state # Ignore whitespace [\t\n\f ]*+ (?: # If an end-quote is omitted, the attribute will run to the end of the # string, leaving no closing bracket. So the caller will detect the # unexpected EOF and will not emit the tag, which is correct. " ( [^"]*+ ) "? | # 3. Double-quoted attribute value \' ( [^\']*+ ) \'? | # 4. Single-quoted attribute value ( [^\t\n\f >]*+ ) # 5. Unquoted attribute value ) # Or nothing: an attribute with an empty value. 
The attribute name was # terminated by a slash, closing bracket or EOF
					|
				)
			)
			# The /A modifier causes preg_match_all to give contiguous chunks
			~xA';
		$count = preg_match_all( $re, $this->text, $m,
			PREG_SET_ORDER | PREG_OFFSET_CAPTURE, $this->pos );
		if ( $count === false ) {
			$this->throwPregError();
		} elseif ( $count ) {
			// Advance to the end of the last match; interpretation of the
			// matches is deferred until the attributes are actually read.
			$this->pos = $m[$count - 1][0][1] + strlen( $m[$count - 1][0][0] );
			$attribs = new LazyAttributes( $m, function ( $m ) {
				return $this->interpretAttribMatches( $m );
			} );
		} else {
			$attribs = new PlainAttributes();
		}

		// Consume trailing whitespace. This is strictly part of the before attribute
		// name state, but we didn't consume it in the regex since we used a principle
		// of one match equals one attribute.
		$this->pos += strspn( $this->text, "\t\n\f ", $this->pos );
		return $attribs;
	}

	/**
	 * Interpret the results of the attribute preg_match_all(). Emit errors as
	 * appropriate and return an associative array.
	 *
	 * The match sets come from a preg_match_all() call made with
	 * PREG_SET_ORDER | PREG_OFFSET_CAPTURE, so every capture is a
	 * [ text, offset ] pair. A value capture is treated as absent when its
	 * entry is missing or its offset is negative.
	 *
	 * Names are lower-cased. When a name occurs more than once, the first
	 * value is kept and a "duplicate attribute" error is emitted for each
	 * later occurrence.
	 *
	 * @param array $matches The match sets, one per attribute or bare slash
	 * @return array Map of attribute name to processed attribute value
	 */
	protected function interpretAttribMatches( $matches ) {
		$attributes = [];
		foreach ( $matches as $m ) {
			if ( strlen( $m[self::MA_SLASH][0] ) ) {
				// A slash which is not part of "/>" is reported and otherwise ignored
				$this->error( 'unexpected bare slash', $m[self::MA_SLASH][1] );
				continue;
			}
			$name = $m[self::MA_NAME][0];
			if ( !$this->ignoreErrors ) {
				$this->handleAsciiErrors( "\"'<=", $name, 0, strlen( $name ), $m[self::MA_NAME][1] );
			}
			if ( !$this->ignoreNulls ) {
				$name = $this->handleNulls( $m[self::MA_NAME][0], $m[self::MA_NAME][1] );
			}
			$name = strtolower( $name );
			// The quote character for quoted values, or an empty string for
			// unquoted and missing values
			$additionalAllowedChar = '';
			if ( isset( $m[self::MA_DQUOTED] ) && $m[self::MA_DQUOTED][1] >= 0 ) {
				// Double-quoted attribute value
				$additionalAllowedChar = '"';
				$value = $m[self::MA_DQUOTED][0];
				$pos = $m[self::MA_DQUOTED][1];
			} elseif ( isset( $m[self::MA_SQUOTED] ) && $m[self::MA_SQUOTED][1] >= 0 ) {
				// Single-quoted attribute value
				$additionalAllowedChar = "'";
				$value = $m[self::MA_SQUOTED][0];
				$pos = $m[self::MA_SQUOTED][1];
			} elseif ( isset( $m[self::MA_UNQUOTED] ) && $m[self::MA_UNQUOTED][1] >= 0 ) {
				// Unquoted attribute value
				$value = $m[self::MA_UNQUOTED][0];
				$pos = $m[self::MA_UNQUOTED][1];
				// Search for parse errors
				if ( !$this->ignoreErrors ) {
					if ( $value === '' ) {
						// ">" in the before attribute value state is a parse error
						$this->error( 'empty unquoted attribute', $pos );
					}
					$this->handleAsciiErrors( "\"'<=`", $value, 0, strlen( $value ), $pos );
				}
			} else {
				// No value was captured. $pos stays unset, which is safe because
				// it is only read below when $value is non-empty.
				$value = '';
			}
			if ( $additionalAllowedChar && !$this->ignoreErrors ) {
				// After attribute value (quoted) state
				// Quoted attributes must be followed by a space, "/" or ">"
				$aavPos = $m[0][1] + strlen( $m[0][0] );
				if ( $aavPos < $this->length ) {
					$aavChar = $this->text[$aavPos];
					if ( !preg_match( '~^[\t\n\f />]~', $aavChar ) ) {
						$this->error( 'missing space between attributes', $aavPos );
					}
				}
			}
			if ( $value !== '' ) {
				if ( !$this->ignoreNulls ) {
					$value = $this->handleNulls( $value, $pos );
				}
				if ( !$this->ignoreCharRefs ) {
					$value = $this->handleCharRefs( $value, $pos, true, $additionalAllowedChar );
				}
			}
			if ( isset( $attributes[$name] ) ) {
				$this->error( "duplicate attribute", $m[0][1] );
			} else {
				$attributes[$name] = $value;
			}
		}
		return $attributes;
	}

	/**
	 * Consume attributes, and the closing bracket which follows attributes.
	 * Emit the appropriate tag event, or in the case of broken attributes in
	 * text states, emit characters.
	 *
	 * @param int $state The current state
	 * @param string $tagName The normalized tag name
	 * @param bool $isEndTag True if this is an end tag, false if it is a start tag
	 * @param int $startPos The input position of the start of the current tag.
	 * @return int The next state: STATE_EOF if the input ended inside the tag,
	 *   $state if the text was emitted literally, otherwise STATE_DATA
	 * @throws TokenizerError If the tag is terminated neither by ">" nor by "/>"
	 *   and the text cannot be emitted literally
	 */
	protected function handleAttribsAndClose( $state, $tagName, $isEndTag, $startPos ) {
		$attribStartPos = $this->pos;
		$attribs = $this->consumeAttribs();
		$pos = $this->pos;

		// Literal characters are emitted on EOF or "anything else" from the
		// end tag substates of the text states.
		// (spec ref 8.2.4 sections 11-19, 25-27)
		$isDataState = $state === self::STATE_DATA;
		$isLiteral = $attribStartPos === $pos && !$isDataState;

		if ( $pos >= $this->length ) {
			$this->error( 'unexpected end of file inside tag' );
			if ( $isLiteral ) {
				$this->listener->characters( $this->text,
					$startPos, $this->length - $startPos,
					$startPos, $this->length - $startPos );
			}
			return self::STATE_EOF;
		}
		// Checking ignoreErrors first avoids calling count() on the attribute
		// object when the result would not be reported anyway.
		if ( $isEndTag && !$this->ignoreErrors && $attribs->count() ) {
			$this->error( 'end tag has an attribute' );
		}

		// The isset() guard keeps the one-character lookahead inside the string,
		// so a "/" in the final position cannot raise an uninitialized-offset
		// diagnostic.
		if ( $this->text[$pos] === '/'
			&& isset( $this->text[$pos + 1] )
			&& $this->text[$pos + 1] === '>'
		) {
			$pos += 2;
			$selfClose = true;
		} elseif ( $this->text[$pos] === '>' ) {
			$pos++;
			$selfClose = false;
		} elseif ( $isLiteral ) {
			$this->listener->characters( $this->text,
				$startPos, $attribStartPos - $startPos,
				$startPos, $attribStartPos - $startPos );
			return $state;
		} else {
			$this->fatal( 'failed to find an already-matched ">"' );
		}
		$this->pos = $pos;
		if ( $isEndTag ) {
			if ( $selfClose ) {
				$this->error( 'self-closing end tag' );
			}
			$this->listener->endTag( $tagName, $startPos, $pos - $startPos );
		} else {
			$this->listener->startTag( $tagName, $attribs, $selfClose,
				$startPos, $pos - $startPos );
		}
		return self::STATE_DATA;
	}

	/**
	 * Process input text in the PLAINTEXT state. Everything from the current
	 * position to the end of the input is emitted as raw text.
	 *
	 * @return int The next state index, always STATE_EOF
	 */
	protected function plaintextState() {
		$this->emitRawTextRange( true, $this->pos, $this->length - $this->pos );
		return self::STATE_EOF;
	}

	/**
	 * Process input text in the script data state.
	 *
	 * If no appropriate end tag has been set, the position is moved to the end
	 * of the input and nothing is emitted. Otherwise each iteration matches a
	 * run of script text, optionally followed by the start of the appropriate
	 * end tag, emits the text, and hands the end tag to handleAttribsAndClose().
	 * The loop repeats for as long as that method stays in the script data
	 * state.
	 *
	 * @return int The next state index
	 * @throws TokenizerError On a PCRE failure
	 */
	protected function scriptDataState() {
		if ( $this->appropriateEndTag === null ) {
			$this->pos = $this->length;
			return self::STATE_EOF;
		}

		// NOTE(review): the opening of this heredoc and the parts of the pattern
		// that look like markup ("</", "<!--", "<script") were missing from the
		// copy under review, apparently removed by a tag stripper. They are
		// restored here following the script data states of the HTML tokenizer
		// spec. Diff this pattern against version control before merging.
		$re = <<<REGEX
~
			(?: # Outer loop start
				# Script data state
				# Stop iteration if we previously matched an appropriate end tag.
				# This is a conditional subpattern: if capture 1 previously
				# matched, then run the pattern /$./ which always fails.
				(?(1) $. )
				.*?
				(?:
					$ |
					(
						</ {$this->appropriateEndTag}
						# If we hit the "anything else" case in the script data
						# end tag name state, don't exit
						(?= [\t\n\f />] )
					) | # 1. Appropriate end tag
					<!--
					# Script data escaped dash dash state
					# Hyphens at this point are consumed without a state transition
					# and so are not part of a comment-end.
					-*+

					(?: # Inner loop start
						# Script data escaped state
						.*?
						(?:
							$ |
							# Stop at, but do not consume, comment-close or end tag.
							# This makes the next iteration of the inner loop match
							# zero characters, which terminates the loop.
							(?= --> ) |
							(?= </ {$this->appropriateEndTag} [\t\n\f />] ) |
							<script [\t\n\f />]
							# Script data double escaped state
							.*?
							(?:
								$ |
								# Stop at, but do not consume, comment-close
								(?= --> ) |
								</script [\t\n\f />]
							)
						)
					)*

					# Consume the comment close which exited the inner loop, if any
					(?: --> )?
				)
			)*+
			~xsiA
REGEX;

		do {
			$count = preg_match( $re, $this->text, $m, 0, $this->pos );
			if ( $count === false ) {
				$this->throwPregError();
			} elseif ( !$count ) {
				$this->fatal( 'unexpected regex failure: this pattern can match zero characters' );
			}

			$startPos = $this->pos;
			$matchLength = strlen( $m[0] );
			// Capture 1 is only present when the match ended at an end tag
			$endTagLength = isset( $m[1] ) ? strlen( $m[1] ) : 0;
			$textLength = $matchLength - $endTagLength;
			$this->emitRawTextRange( true, $startPos, $textLength );
			$this->pos = $startPos + $matchLength;
			$tagStartPos = $startPos + $textLength;

			if ( $endTagLength ) {
				$nextState = $this->handleAttribsAndClose(
					self::STATE_SCRIPT_DATA, $this->appropriateEndTag, true, $tagStartPos );
			} else {
				$nextState = self::STATE_EOF;
			}
		} while ( $nextState === self::STATE_SCRIPT_DATA );
		return $nextState;
	}

	/**
	 * Emit a parse error event. Nothing is emitted when the ignoreErrors
	 * option is set.
	 *
	 * @param string $text The error message
	 * @param int|null $pos The error position, or null to use the current position
	 */
	protected function error( $text, $pos = null ) {
		if ( !$this->ignoreErrors ) {
			if ( $pos === null ) {
				$pos = $this->pos;
			}
			$this->listener->error( $text, $pos );
		}
	}

	/**
	 * Throw an exception for a specified reason. This is used for API errors
	 * and assertion-like sanity checks.
	 *
	 * @param string $text The error message
	 * @throws TokenizerError Always
	 */
	protected function fatal( $text ) {
		throw new TokenizerError( __CLASS__ . ": " . $text );
	}

	/**
	 * Interpret preg_last_error() and throw a suitable exception. This is
	 * called when preg_match() or similar returns false.
	 *
	 * Notes for users:
	 *
	 * - PCRE internal error: may be due to JIT stack space exhaustion prior
	 *   to PHP 7, due to excessive recursion. Increase stack space.
	 *
	 * - pcre.backtrack_limit exhausted: The backtrack limit should be at least
	 *   double the input size, the defaults are way too small. Increase it in
	 *   configuration.
	 *
	 * @throws TokenizerError Always
	 */
	protected function throwPregError() {
		$code = preg_last_error();
		// PREG_JIT_STACKLIMIT_ERROR does not exist before PHP 7, so it is only
		// referenced once it is known to be defined.
		if ( defined( 'PREG_JIT_STACKLIMIT_ERROR' ) && $code === PREG_JIT_STACKLIMIT_ERROR ) {
			$msg = "PCRE JIT stack space exhausted";
		} else {
			switch ( $code ) {
				case PREG_NO_ERROR:
					$msg = "PCRE returned false but gave PREG_NO_ERROR";
					break;
				case PREG_INTERNAL_ERROR:
					$msg = "PCRE internal error";
					break;
				case PREG_BACKTRACK_LIMIT_ERROR:
					$msg = "pcre.backtrack_limit exhausted";
					break;
				case PREG_RECURSION_LIMIT_ERROR:
					$msg = "pcre.recursion_limit exhausted";
					break;
				case PREG_BAD_UTF8_ERROR:
				case PREG_BAD_UTF8_OFFSET_ERROR:
				default:
					$msg = "PCRE unexpected error";
			}
		}

		throw new TokenizerError( __CLASS__.": $msg" );
	}
}