listener = $listener;
$this->text = $text;
$this->pos = 0;
$this->preprocessed = false;
$this->length = strlen( $text );
$this->ignoreErrors = !empty( $options['ignoreErrors'] );
$this->ignoreCharRefs = !empty( $options['ignoreCharRefs'] );
$this->ignoreNulls = !empty( $options['ignoreNulls'] );
$this->skipPreprocess = !empty( $options['skipPreprocess'] );
}
/**
 * Magic setter, invoked by PHP when code writes to an inaccessible or
 * undeclared property. Every such write is forwarded to PropGuard::set().
 *
 * NOTE(review): presumably PropGuard rejects writes to undeclared properties
 * in order to catch typos -- confirm against the PropGuard class.
 *
 * @param string $name The name of the property being written
 * @param mixed $value The value being assigned
 */
public function __set( $name, $value ) {
PropGuard::set( $this, $name, $value );
}
/**
 * Set the callback which the data state consults when it finds a CDATA
 * section opener. The callback is invoked with no arguments. If it returns
 * a true value, the text is tokenized as a CDATA section; if it returns a
 * false value, or if no callback is set, the text is treated as a bogus
 * comment.
 *
 * @param callable|null $cb The callback, or a false value for none
 */
public function setEnableCdataCallback( $cb ) {
$this->enableCdataCallback = $cb;
}
/**
 * Tokenize the entire input stream in a single call. This is the normal
 * entry point.
 *
 * @param array $options An associative array of options:
 *   - state : One of the STATE_* constants, the state in which to start.
 *   - appropriateEndTag : The "appropriate end tag", which needs to be set
 *     when starting in one of the raw text states.
 *   - fragmentNamespace : The namespace of the fragment context element
 *   - fragmentName : The tag name of the fragment context element
 */
public function execute( $options = [] ) {
	// Start state: explicit if given. A fragment context (below) may still
	// override it.
	$this->state = isset( $options['state'] ) ? $options['state'] : self::STATE_START;

	if ( !isset( $options['fragmentNamespace'] ) ) {
		$this->fragmentNamespace = null;
		$this->fragmentName = null;
	} else {
		$this->setFragmentContext( $options['fragmentNamespace'], $options['fragmentName'] );
	}

	if ( isset( $options['appropriateEndTag'] ) ) {
		$this->appropriateEndTag = $options['appropriateEndTag'];
	} else {
		$this->appropriateEndTag = null;
	}

	$this->preprocess();
	$this->listener->startDocument( $this, $this->fragmentNamespace, $this->fragmentName );
	// Run the state machine until EOF
	$this->executeInternal( true );
}
/**
 * Get the preprocessed input text. Source offsets in event parameters are
 * relative to this string. If skipPreprocess was specified, this will be
 * the same as the input string.
 *
 * Preprocessing is triggered here if needed, so this may be called before
 * tokenization starts.
 *
 * @return string
 */
public function getPreprocessedText() {
$this->preprocess();
return $this->text;
}
/**
 * Change the state of the tokenizer during parsing. This is for use by the
 * tree builder to switch the tokenizer into one of the raw text states.
 *
 * Both values are stored unconditionally; no validation is done here.
 *
 * @param int $state One of the STATE_* constants
 * @param string $appropriateEndTag The appropriate end tag
 */
public function switchState( $state, $appropriateEndTag ) {
$this->state = $state;
$this->appropriateEndTag = $appropriateEndTag;
}
/**
 * Set up the tokenizer for parsing a fragment inside a given context
 * element. The context is always recorded. The start state is changed only
 * when the context element is in the HTML namespace (or the namespace is
 * empty) and is one of the elements whose content is not ordinary data.
 *
 * @param string $namespace The namespace of the context element
 * @param string $tagName The name of the context element
 */
public function setFragmentContext( $namespace, $tagName ) {
	$this->fragmentNamespace = $namespace;
	$this->fragmentName = $tagName;

	// Context elements from foreign namespaces never alter the start state
	$isForeign = strval( $namespace ) !== '' && $namespace !== HTMLData::NS_HTML;
	if ( $isForeign ) {
		return;
	}

	// Loose comparisons are used deliberately, to match switch semantics
	if ( in_array( $tagName, [ 'title', 'textarea' ] ) ) {
		$this->state = Tokenizer::STATE_RCDATA;
	} elseif ( in_array( $tagName, [ 'style', 'xmp', 'iframe', 'noembed', 'noframes' ] ) ) {
		$this->state = Tokenizer::STATE_RAWTEXT;
	} elseif ( $tagName == 'script' ) {
		$this->state = Tokenizer::STATE_SCRIPT_DATA;
	} elseif ( $tagName == 'noscript' ) {
		// noscript content is raw text only when scripting is enabled
		if ( $this->scriptingFlag ) {
			$this->state = Tokenizer::STATE_RAWTEXT;
		}
	} elseif ( $tagName == 'plaintext' ) {
		$this->state = Tokenizer::STATE_PLAINTEXT;
	}
}
/**
 * Notify the tokenizer that the document will be tokenized by repeated step()
 * calls. This must be called once only, before the first call to step().
 *
 * The state machine is put into the start state, the input is preprocessed,
 * and the listener is told that a document is starting, with no fragment
 * namespace or name.
 */
public function beginStepping() {
$this->state = self::STATE_START;
$this->preprocess();
$this->listener->startDocument( $this, null, null );
}
/**
 * Tokenize a minimum amount of text from the input stream, and emit the
 * resulting events. This runs the shared state machine in non-looping mode.
 *
 * @return bool True if the input continues and step() should be called
 *   again, false on EOF
 */
public function step() {
// A null state means the state machine was never initialized
if ( $this->state === null ) {
$this->fatal( "beginStepping() must be called before step()" );
}
return $this->executeInternal( false );
}
/**
 * Preprocess the input text, if it hasn't been done already.
 *
 * Line endings are normalized to "\n" and the cached input length is
 * refreshed. Unless errors are being ignored, a parse error is then raised
 * for each disallowed control character or noncharacter in the input.
 *
 * The tokenizer is marked as preprocessed on completion, so that calling
 * getPreprocessedText() and then execute() or beginStepping() does not scan
 * the input again and report every error twice.
 */
protected function preprocess() {
	if ( $this->preprocessed || $this->skipPreprocess ) {
		return;
	}

	// Normalize line endings
	$this->text = strtr( $this->text, [
		"\r\n" => "\n",
		"\r" => "\n" ] );
	$this->length = strlen( $this->text );

	// Raise parse errors for any control characters
	if ( !$this->ignoreErrors ) {
		$pos = 0;
		$re = '/[' .
			'\x{0001}-\x{0008}' .
			'\x{000E}-\x{001F}' .
			'\x{007F}-\x{009F}' .
			'\x{FDD0}-\x{FDEF}' .
			'\x{000B}' .
			'\x{FFFE}\x{FFFF}' .
			'\x{1FFFE}\x{1FFFF}' .
			'\x{2FFFE}\x{2FFFF}' .
			'\x{3FFFE}\x{3FFFF}' .
			'\x{4FFFE}\x{4FFFF}' .
			'\x{5FFFE}\x{5FFFF}' .
			'\x{6FFFE}\x{6FFFF}' .
			'\x{7FFFE}\x{7FFFF}' .
			'\x{8FFFE}\x{8FFFF}' .
			'\x{9FFFE}\x{9FFFF}' .
			'\x{AFFFE}\x{AFFFF}' .
			'\x{BFFFE}\x{BFFFF}' .
			'\x{CFFFE}\x{CFFFF}' .
			'\x{DFFFE}\x{DFFFF}' .
			'\x{EFFFE}\x{EFFFF}' .
			'\x{FFFFE}\x{FFFFF}' .
			'\x{10FFFE}\x{10FFFF}]/u';
		while ( $pos < $this->length ) {
			// With the /u modifier, preg_match() fails on malformed UTF-8
			$count = preg_match( $re, $this->text, $m, PREG_OFFSET_CAPTURE, $pos );
			if ( $count === false ) {
				$this->fatal( "Invalid UTF-8 sequence given to Tokenizer" );
			} elseif ( !$count ) {
				break;
			}
			$pos = $m[0][1];
			$this->error( "disallowed control character", $pos );
			// Resume the search after the offending character
			$pos += strlen( $m[0][0] );
		}
	}

	// Remember that the work is done, so that later calls return early
	$this->preprocessed = true;
}
/**
 * The main state machine, shared by step() and execute(). Each pass runs
 * the handler for the current state and stores the state it returns.
 *
 * @param bool $loop Set to true to loop until finished, false to step once.
 * @return bool True if the input continues, false on EOF
 */
protected function executeInternal( $loop ) {
	$reachedEof = false;
	do {
		switch ( $this->state ) {
			case self::STATE_DATA:
				$this->state = $this->dataState( $loop );
				break;

			case self::STATE_RCDATA:
				// RCDATA expands character references, subject to configuration
				$this->state = $this->textElementState( false );
				break;

			case self::STATE_RAWTEXT:
				// RAWTEXT never expands character references
				$this->state = $this->textElementState( true );
				break;

			case self::STATE_SCRIPT_DATA:
				$this->state = $this->scriptDataState();
				break;

			case self::STATE_PLAINTEXT:
				$this->state = $this->plaintextState();
				break;

			case self::STATE_START:
				$this->state = self::STATE_DATA;
				break;

			case self::STATE_EOF:
				$this->listener->endDocument( $this->length );
				$reachedEof = true;
				break;

			default:
				$this->fatal( 'invalid state' );
		}
		// Stop at EOF even when asked to loop
	} while ( $loop && !$reachedEof );

	return !$reachedEof;
}
/**
 * Consume input text starting from the "data state".
 *
 * The input is searched from the current position for the next "<" which
 * begins a tag, comment, DOCTYPE, CDATA section opener or bogus comment. The
 * text before the match is emitted as character data and the matched
 * construct is then dispatched. If nothing matches, the remainder of the
 * input is emitted as character data and the EOF state is returned.
 *
 * @param bool $loop True to loop while still in the data state, false to
 *   process a single less-than sign.
 * @return int The next state index
 */
protected function dataState( $loop ) {
	$re = "~ <
		(?:
			( /? ) # 1. End tag open
			( # 2. Tag name
				# Try to match the ASCII letter required for the start of a start
				# or end tag. If this fails, a slash matched above can be
				# backtracked and then fed into the bogus comment alternative below.
				[a-zA-Z]
				# Then capture the rest of the tag name
				[^\t\n\f />]*
			) |
			# Comment
			!--
			( # 3. Comment match detector
				> | -> | # Invalid short close
				( # 4. Comment contents
					(?:
						(?! --> )
						(?! --!> )
						(?! --! \\z )
						(?! -- \\z )
						(?! - \\z )
						.
					)*+
				)
				( # 5. Comment close
					--> | # Normal close
					--!> | # Comment end bang
					--! | # EOF in comment end bang state
					-- | # EOF in comment end state
					- | # EOF in comment end dash state
					# EOF in comment state
				)
			) |
			( (?i) # 6. Doctype
				! DOCTYPE
				# There must be at least one whitespace character to suppress
				# a parse error, but if there isn't one, this is still a
				# DOCTYPE. There is no way for the DOCTYPE string to end up
				# as a character node, the DOCTYPE subexpression must always
				# wholly match if we matched up to this point.
				( [\t\n\f ]*+ ) # 7. Required whitespace
				( [^\t\n\f >]*+ ) # 8. DOCTYPE name
				[\t\n\f ]*+
				(?:
					# After DOCTYPE name state
					PUBLIC
					( [\t\n\f ]* ) # 9. Required whitespace
					(?:
						\" ( [^\">]* ) \"? | # 10. Double-quoted identifier
						' ( [^'>]* ) '? | # 11. Single-quoted identifier
						# Non-match: bogus
					)
					(?:
						# After DOCTYPE public identifier state
						# Assert quoted identifier before here
						(?<= \" | ' )
						( [\t\n\f ]* ) # 12. Required whitespace
						(?:
							\" ( [^\">]* ) \"? | # 13. Double-quoted identifier
							' ( [^'>]* ) '? | # 14. Single-quoted identifier
							# Non-match: no system ID
						)
					)?
					|
					SYSTEM
					( [\t\n\f ]* ) # 15. Required whitespace
					(?:
						\" ( [^\">]* ) \"? | # 16. Double-quoted identifier
						' ( [^'>]* ) '? | # 17. Single-quoted identifier
						# Non-match: bogus
					)
					| # No keyword is OK
				)
				[\t\n\f ]*
				( [^>]*+ ) # 18. Bogus DOCTYPE
				( >? ) # 19. End of DOCTYPE
			) |
			( ! \[CDATA\[ ) | # 20. CDATA section
			( [!?/] [^>]*+ ) >? # 21. Bogus comment
			# Anything else: parse error and emit literal less-than sign.
			# We will let the match fail at this position and later check
			# for less-than signs in the resulting text node.
		)
		~xs";

	$nextState = self::STATE_DATA;
	do {
		$count = preg_match( $re, $this->text, $m, PREG_OFFSET_CAPTURE, $this->pos );
		if ( $count === false ) {
			$this->throwPregError();
		} elseif ( !$count ) {
			// Text runs to end
			$this->emitDataRange( $this->pos, $this->length - $this->pos );
			$this->pos = $this->length;
			$nextState = self::STATE_EOF;
			break;
		}

		$startPos = $m[0][1];
		$tagName = isset( $m[self::MD_TAG_NAME] ) ? $m[self::MD_TAG_NAME][0] : '';

		// Emit the text which precedes the matched construct
		$this->emitDataRange( $this->pos, $startPos - $this->pos );
		$this->pos = $startPos;
		$nextPos = $m[0][1] + strlen( $m[0][0] );

		// Ask the callback exactly once per CDATA opener whether CDATA
		// sections are currently allowed
		$sawCdataOpen = isset( $m[self::MD_CDATA] ) && $m[self::MD_CDATA][1] >= 0;
		$isCdata = $sawCdataOpen
			&& $this->enableCdataCallback
			&& call_user_func( $this->enableCdataCallback );

		if ( strlen( $tagName ) ) {
			// Tag
			$isEndTag = (bool)strlen( $m[self::MD_END_TAG_OPEN][0] );
			if ( !$this->ignoreNulls ) {
				$tagName = $this->handleNulls( $tagName, $m[self::MD_TAG_NAME][1] );
			}
			$tagName = strtolower( $tagName );
			$this->pos = $nextPos;
			$nextState = $this->handleAttribsAndClose( self::STATE_DATA,
				$tagName, $isEndTag, $startPos );
			$nextPos = $this->pos;
			if ( $nextState === self::STATE_EOF ) {
				break;
			}
			// Respect any state switch imposed by the parser
			$nextState = $this->state;
		} elseif ( isset( $m[self::MD_COMMENT] ) && $m[self::MD_COMMENT][1] >= 0 ) {
			// Comment
			$this->interpretCommentMatches( $m );
		} elseif ( isset( $m[self::MD_DOCTYPE] ) && $m[self::MD_DOCTYPE][1] >= 0 ) {
			// DOCTYPE
			$this->interpretDoctypeMatches( $m );
		} elseif ( $sawCdataOpen ) {
			if ( $isCdata ) {
				// CDATA section: skip the "<" and the captured opener
				$this->pos += strlen( $m[self::MD_CDATA][0] ) + 1;
				$endPos = strpos( $this->text, ']]>', $this->pos );
				if ( $endPos === false ) {
					// Unterminated: the section runs to the end of the input
					$this->emitCdataRange( $this->pos, $this->length - $this->pos,
						$startPos, $this->length - $startPos );
					$this->pos = $this->length;
					$nextState = self::STATE_EOF;
					break;
				} else {
					$outerEndPos = $endPos + strlen( ']]>' );
					$this->emitCdataRange( $this->pos, $endPos - $this->pos,
						$startPos, $outerEndPos - $startPos );
					$nextPos = $outerEndPos;
				}
			} else {
				// Bogus comment
				$this->error( "unexpected CDATA interpreted as bogus comment" );
				$endPos = strpos( $this->text, '>', $this->pos );
				$bogusPos = $this->pos + 2;
				if ( $endPos === false ) {
					$nextPos = $this->length;
					$contents = substr( $this->text, $bogusPos );
				} else {
					$nextPos = $endPos + 1;
					$contents = substr( $this->text, $bogusPos, $endPos - $bogusPos );
				}
				$contents = $this->handleNulls( $contents, $bogusPos );
				// The source length runs to $nextPos, so that it includes the
				// closing ">" and stays non-negative when EOF ends the comment
				$this->listener->comment( $contents, $this->pos, $nextPos - $this->pos );
			}
		} elseif ( isset( $m[self::MD_BOGUS_COMMENT] ) && $m[self::MD_BOGUS_COMMENT][1] >= 0 ) {
			// Bogus comment
			$contents = $m[self::MD_BOGUS_COMMENT][0];
			$bogusPos = $m[self::MD_BOGUS_COMMENT][1];
			if ( $m[0][0] === '</>' ) {
				$this->error( "empty end tag" );
				// No token emitted
			} elseif ( $m[0][0] === '</' ) {
				// The match can only stop right after the slash at end of input
				$this->error( 'EOF in end tag' );
				$this->listener->characters( '</', 0, 2, $m[0][1], 2 );
			} else {
				$this->error( "unexpected <{$contents[0]} interpreted as bogus comment" );
				if ( $contents[0] !== '?' ) {
					// For starting types other than <?, the initial character is
					// not in the tag contents
					$contents = substr( $contents, 1 );
					$bogusPos++;
				}
				$contents = $this->handleNulls( $contents, $bogusPos );
				$this->listener->comment( $contents, $startPos, $nextPos - $startPos );
			}
		} else {
			$this->fatal( 'unexpected data state match' );
		}
		$this->pos = $nextPos;
	} while ( $loop && $nextState === self::STATE_DATA );

	return $nextState;
}
/**
 * Interpret the data state match results for a detected comment, and emit
 * events as appropriate.
 *
 * The comment contents are derived from the whole match, which always has
 * the form "<!--" + contents + close, rather than from a dedicated capture
 * index. Parse error checks are made against the raw contents so that the
 * reported offsets refer to the source text, even when null characters are
 * replaced in the emitted string.
 *
 * @param array $m The match array
 */
protected function interpretCommentMatches( $m ) {
	$openLength = strlen( '<!--' );
	$outerStart = $m[0][1];
	$outerLength = strlen( $m[0][0] );
	$innerStart = $outerStart + $openLength;

	if ( $m[0][0] === '<!-->' || $m[0][0] === '<!--->' ) {
		// These are special cases in the comment start state
		$this->error( 'not enough dashes in empty comment', $outerStart );
		$this->listener->comment( '', $outerStart, $outerLength );
		return;
	}

	$close = $m[self::MD_COMMENT_END][0];
	$closePos = $m[self::MD_COMMENT_END][1];

	// Whatever lies between the opener and the close sequence is content
	$innerLength = $outerLength - $openLength - strlen( $close );
	$rawContents = (string)substr( $m[0][0], $openLength, $innerLength );

	$contents = $rawContents;
	if ( !$this->ignoreNulls ) {
		$contents = $this->handleNulls( $contents, $innerStart );
	}

	if ( !$this->ignoreErrors ) {
		if ( $close === '--!>' ) {
			$this->error( 'invalid comment end bang', $closePos );
		} elseif ( $close === '-' || $close === '--' || $close === '--!' ) {
			$this->error( 'EOF part way through comment close', $closePos );
		} elseif ( $close === '' ) {
			$this->error( 'EOF in comment', $closePos );
		}

		// Report each dash which directly precedes the close sequence
		$dashSearchLength = $innerLength;
		while ( $dashSearchLength > 0 && $rawContents[$dashSearchLength - 1] === '-' ) {
			$this->error( 'invalid extra dash at comment end',
				$innerStart + $dashSearchLength - 1 );
			$dashSearchLength--;
		}

		// Report double dashes inside the comment body
		$offset = 0;
		while ( $offset !== false && $offset < $dashSearchLength ) {
			$offset = strpos( $rawContents, '--', $offset );
			if ( $offset !== false ) {
				$this->error( 'bare "--" found in comment', $innerStart + $offset );
				$offset += 2;
			}
		}
	}

	$this->listener->comment( $contents, $outerStart, $outerLength );
}
/**
 * Interpret the data state match results for a detected DOCTYPE token,
 * emit any parse errors, and pass the name, public identifier, system
 * identifier and quirks flag to the listener.
 *
 * @param array $m The match array
 */
protected function interpretDoctypeMatches( $m ) {
	$silent = $this->ignoreErrors;

	// True if the given subpattern took part in the match
	$took = function ( $index ) use ( $m ) {
		return isset( $m[$index] ) && $m[$index][1] >= 0;
	};

	$name = null;
	$public = null;
	$system = null;
	$quirks = false;

	// Missing ">" can only be caused by EOF
	$atEof = strlen( $m[self::MD_DT_END][0] ) === 0;
	$hasBogus = strlen( $m[self::MD_DT_BOGUS][0] ) > 0;

	if ( $hasBogus ) {
		// Bogus DOCTYPE state
		if ( !$silent ) {
			$this->error( 'invalid DOCTYPE contents', $m[self::MD_DT_BOGUS][1] );
		}
		// Set quirks mode unless there was a properly quoted SYSTEM identifier
		if ( !$took( self::MD_DT_SYSTEM_DQ ) && !$took( self::MD_DT_SYSTEM_SQ ) ) {
			$quirks = true;
		}
		// EOF in the bogus state does not set quirks mode (but it is a parse error)
		if ( $atEof && !$silent ) {
			$this->error( 'unterminated DOCTYPE' );
		}
	} elseif ( $atEof ) {
		if ( !$silent ) {
			$this->error( 'unterminated DOCTYPE' );
		}
		$quirks = true;
	}

	$nameWs = $m[self::MD_DT_NAME_WS];
	if ( !$silent && !$atEof && !strlen( $nameWs[0] ) ) {
		$this->error( 'missing whitespace', $nameWs[1] );
	}

	if ( strlen( $m[self::MD_DT_NAME][0] ) ) {
		// DOCTYPE name
		$lowerName = strtolower( $m[self::MD_DT_NAME][0] );
		$name = $this->handleNulls( $lowerName, $m[self::MD_DT_NAME][1] );
	} else {
		if ( !$atEof && !$silent ) {
			$this->error( 'missing DOCTYPE name',
				$m[self::MD_DOCTYPE][1] + strlen( '!DOCTYPE' ) );
		}
		$quirks = true;
	}

	if ( $took( self::MD_DT_PUBLIC_WS ) ) {
		// PUBLIC keyword found
		$publicWs = $m[self::MD_DT_PUBLIC_WS];
		$public = $this->interpretDoctypeQuoted( $m,
			self::MD_DT_PUBLIC_DQ, self::MD_DT_PUBLIC_SQ, $quirks );
		if ( $public === null ) {
			$quirks = true;
			if ( !$atEof && !$silent ) {
				$this->error( 'missing public identifier', $publicWs[1] );
			}
		} elseif ( !$silent && !$atEof && !strlen( $publicWs[0] ) ) {
			$this->error( 'missing whitespace', $publicWs[1] );
		}

		// Check for a system ID after the public ID
		if ( $took( self::MD_DT_PUBSYS_DQ ) || $took( self::MD_DT_PUBSYS_SQ ) ) {
			if ( !$silent && !strlen( $m[self::MD_DT_PUBSYS_WS][0] ) ) {
				$this->error( 'missing whitespace', $m[self::MD_DT_PUBSYS_WS][1] );
			}
			$system = $this->interpretDoctypeQuoted( $m,
				self::MD_DT_PUBSYS_DQ, self::MD_DT_PUBSYS_SQ, $quirks );
		}
	} elseif ( $took( self::MD_DT_SYSTEM_WS ) ) {
		// SYSTEM keyword found
		$systemWs = $m[self::MD_DT_SYSTEM_WS];
		$system = $this->interpretDoctypeQuoted( $m,
			self::MD_DT_SYSTEM_DQ, self::MD_DT_SYSTEM_SQ, $quirks );
		if ( $system === null ) {
			$quirks = true;
			$this->error( 'missing system identifier', $systemWs[1] );
		} elseif ( !$silent && !strlen( $systemWs[0] ) ) {
			$this->error( 'missing whitespace', $systemWs[1] );
		}
	}

	$this->listener->doctype( $name, $public, $system, $quirks, $m[0][1], strlen( $m[0][0] ) );
}
/**
 * DOCTYPE helper which interprets a quoted string (or lack thereof).
 *
 * @param array $m The match array
 * @param int $dq The index of the double-quoted subpattern
 * @param int $sq The index of the single-quoted subpattern
 * @param bool &$quirks Set to true if the identifier runs to the end of
 *   the input or is followed directly by ">"
 * @return string|null The quoted value, with nulls replaced, or null if
 *   neither subpattern took part in the match
 */
protected function interpretDoctypeQuoted( $m, $dq, $sq, &$quirks ) {
	// Prefer the double-quoted capture, then fall back to single-quoted
	$capture = null;
	foreach ( [ $dq, $sq ] as $index ) {
		if ( isset( $m[$index] ) && $m[$index][1] >= 0 ) {
			$capture = $m[$index];
			break;
		}
	}
	if ( $capture === null ) {
		return null;
	}

	list( $value, $startPos ) = $capture;
	$endPos = $startPos + strlen( $value );

	if ( $endPos >= $this->length ) {
		// This is a parse error, but we already emitted a generic EOF error
		$quirks = true;
	} elseif ( $this->text[$endPos] === '>' ) {
		$this->error( 'DOCTYPE identifier terminated by ">"', $endPos );
		$quirks = true;
	}

	return $this->handleNulls( $value, $startPos );
}
/**
 * Shared helper for the places in the spec where U+0000 has to be replaced
 * by U+FFFD, with a parse error raised for each occurrence. Does nothing
 * when nulls are being ignored.
 *
 * @param string $text The text to be converted
 * @param int $sourcePos The input byte offset from which $text was
 *   extracted, for error position reporting.
 * @return string The converted text
 */
protected function handleNulls( $text, $sourcePos ) {
	if ( $this->ignoreNulls ) {
		return $text;
	}

	if ( !$this->ignoreErrors ) {
		$lastIndex = strlen( $text ) - 1;
		$offset = 0;
		do {
			$nullPos = strpos( $text, "\0", $offset );
			if ( $nullPos === false ) {
				break;
			}
			$this->error( "replaced null character", $sourcePos + $nullPos );
			$offset = $nullPos + 1;
			// Stop once the final byte has been examined
		} while ( $nullPos < $lastIndex );
	}

	return str_replace( "\0", self::REPLACEMENT_CHAR, $text );
}
/**
 * Shared helper for the places in the spec where certain ASCII characters
 * trigger a parse error and nothing else happens. One error is raised per
 * offending character found in the given range.
 *
 * @param string $mask Mask for strcspn
 * @param string $text The input text
 * @param int $offset The start of the range within $text to search
 * @param int $length The length of the range within $text to search
 * @param int $sourcePos The offset within the input text corresponding
 *   to $text, for error position reporting.
 */
protected function handleAsciiErrors( $mask, $text, $offset, $length, $sourcePos ) {
	$end = $offset + $length;
	for ( $i = $offset; $i < $end; $i++ ) {
		// Jump over the run of characters which are not in the mask
		$i += strcspn( $text, $mask, $i, $end - $i );
		if ( $i >= $end ) {
			break;
		}

		$char = $text[$i];
		$codepoint = ord( $char );
		$isPrintable = $codepoint >= 0x20 && $codepoint < 0x7f;
		if ( $isPrintable ) {
			$this->error( "unexpected \"$char\"", $i + $sourcePos );
		} else {
			$this->error( sprintf( 'unexpected U+00%02X', $codepoint ), $i + $sourcePos );
		}
	}
}
/**
 * Expand character references in some text, and emit errors as appropriate.
 * If character references are being ignored, the text is returned unchanged.
 *
 * @param string $text The text to expand
 * @param int $sourcePos The input position of $text
 * @param bool $inAttr True if the text is within an attribute value
 * @param string $additionalAllowedChar An unused string which the spec
 *   inexplicably spends a lot of space telling you how to derive. It
 *   suppresses errors in a place where no errors are emitted anyway.
 * @return string The expanded text
 */
protected function handleCharRefs( $text, $sourcePos, $inAttr = false,
$additionalAllowedChar = ''
) {
if ( $this->ignoreCharRefs ) {
return $text;
}
// Efficiently translate a few common cases.
// Although this doesn't translate any error cases, running this
// function in !$ignoreError mode would cause the string offsets to
// be wrong when we come to the preg_match_all.
//
// In HHVM this is way too broken to be usable. (@todo bug/PR)
if ( !defined( 'HHVM_VERSION' ) && $this->ignoreErrors ) {
$text = html_entity_decode( $text, ENT_HTML5 | ENT_QUOTES );
}
// NOTE(review): in this copy of the file, the pattern literal below appears
// to be cut off part way through (it runs straight into a throwPregError()
// call), and $matches, $out, $pos and $length are used further down without
// a visible definition. Restore this section from the canonical source
// before relying on this method.
static $re;
if ( $re === null ) {
$knownNamed = HTMLData::$namedEntityRegex;
$re = "~
( .*? ) # 1. prefix
&
(?:
\# (?:
0*(\d+) | # 2. decimal
[xX]0*([0-9A-Fa-f]+) # 3. hexadecimal
)
( ; ) ? # 4. semicolon
|
( \# ) # 5. bare hash
|
($knownNamed) # 6. known named
(?:
(?throwPregError();
}
foreach ( $matches as $m ) {
$out .= $m[self::MC_PREFIX];
$errorPos = $sourcePos + $pos + strlen( $m[self::MC_PREFIX] );
$lastPos = $pos;
$pos += strlen( $m[0] );
if ( isset( $m[self::MC_HASH] ) && strlen( $m[self::MC_HASH] ) ) {
// Bare
$this->error( 'Expected digits after ', $errorPos );
$out .= '';
continue;
}
$knownNamed = isset( $m[self::MC_NAMED] ) ? $m[self::MC_NAMED] : '';
$attributeSuffix = isset( $m[self::MC_SUFFIX] ) ? $m[self::MC_SUFFIX] : '';
$haveSemicolon =
( isset( $m[self::MC_SEMICOLON] ) && strlen( $m[self::MC_SEMICOLON] ) )
|| ( strlen( $knownNamed ) && $knownNamed[ strlen( $knownNamed ) - 1 ] === ';' )
|| ( isset( $m[self::MC_INVALID] ) && strlen( $m[self::MC_INVALID] ) );
if ( $inAttr && !$haveSemicolon ) {
if ( strlen( $attributeSuffix ) ) {
if ( !$this->ignoreErrors && $attributeSuffix === '=' ) {
$this->error( 'invalid equals sign after named character reference' );
}
$out .= '&' . $knownNamed . $attributeSuffix;
continue;
}
}
if ( !$this->ignoreErrors && !$haveSemicolon ) {
$this->error( 'character reference missing semicolon', $errorPos );
}
if ( isset( $m[self::MC_DECIMAL] ) && strlen( $m[self::MC_DECIMAL] ) ) {
// Decimal
if ( strlen( $m[self::MC_DECIMAL] ) > 7 ) {
$this->error( 'invalid numeric reference', $errorPos );
$out .= self::REPLACEMENT_CHAR;
continue;
}
$codepoint = intval( $m[self::MC_DECIMAL] );
} elseif ( isset( $m[self::MC_HEXDEC] ) && strlen( $m[self::MC_HEXDEC] ) ) {
// Hexadecimal
if ( strlen( $m[self::MC_HEXDEC] ) > 6 ) {
$this->error( 'invalid numeric reference', $errorPos );
$out .= self::REPLACEMENT_CHAR;
continue;
}
$codepoint = intval( $m[self::MC_HEXDEC], 16 );
} elseif ( $knownNamed !== '' ) {
$out .= HTMLData::$namedEntityTranslations[$knownNamed] . $attributeSuffix;
continue;
} elseif ( isset( $m[self::MC_INVALID] ) && strlen( $m[self::MC_INVALID] ) ) {
if ( !$this->ignoreErrors ) {
$this->error( 'invalid named reference', $errorPos );
}
$out .= '&' . $m[self::MC_INVALID];
continue;
} else {
$this->fatal( 'unable to identify char ref submatch' );
}
// Interpret $codepoint
if ( $codepoint === 0
|| ( $codepoint >= 0xD800 && $codepoint <= 0xDFFF )
|| $codepoint > 0x10FFFF
) {
if ( !$this->ignoreErrors ) {
$this->error( 'invalid numeric reference', $errorPos );
}
$out .= self::REPLACEMENT_CHAR;
} elseif ( isset( HTMLData::$legacyNumericEntities[$codepoint] ) ) {
if ( !$this->ignoreErrors ) {
$this->error( 'invalid reference to non-ASCII control character', $errorPos );
}
$out .= HTMLData::$legacyNumericEntities[$codepoint];
} else {
if ( !$this->ignoreErrors ) {
$disallowedCodepoints = [
0x000B => true,
0xFFFE => true, 0xFFFF => true,
0x1FFFE => true, 0x1FFFF => true,
0x2FFFE => true, 0x2FFFF => true,
0x3FFFE => true, 0x3FFFF => true,
0x4FFFE => true, 0x4FFFF => true,
0x5FFFE => true, 0x5FFFF => true,
0x6FFFE => true, 0x6FFFF => true,
0x7FFFE => true, 0x7FFFF => true,
0x8FFFE => true, 0x8FFFF => true,
0x9FFFE => true, 0x9FFFF => true,
0xAFFFE => true, 0xAFFFF => true,
0xBFFFE => true, 0xBFFFF => true,
0xCFFFE => true, 0xCFFFF => true,
0xDFFFE => true, 0xDFFFF => true,
0xEFFFE => true, 0xEFFFF => true,
0xFFFFE => true, 0xFFFFF => true,
0x10FFFE => true, 0x10FFFF => true ];
if (
( $codepoint >= 1 && $codepoint <= 8 ) ||
( $codepoint >= 0x0d && $codepoint <= 0x1f ) ||
( $codepoint >= 0x7f && $codepoint <= 0x9f ) ||
( $codepoint >= 0xfdd0 && $codepoint <= 0xfdef ) ||
isset( $disallowedCodepoints[$codepoint] )
) {
$this->error( 'invalid numeric reference to control character',
$errorPos );
}
}
$out .= \UtfNormal\Utils::codepointToUtf8( $codepoint );
}
}
if ( $pos < $length ) {
$out .= substr( $text, $pos );
}
return $out;
}
/**
 * Emit part of the input text as a character token, applying the validity
 * rules of the data state and raising the associated errors.
 *
 * @param int $pos Offset within the input text
 * @param int $length The length of the range
 */
protected function emitDataRange( $pos, $length ) {
	if ( $length === 0 ) {
		return;
	}

	// With every check disabled, the input can be passed through by offset
	$passThrough = $this->ignoreCharRefs && $this->ignoreNulls && $this->ignoreErrors;
	if ( $passThrough ) {
		$this->listener->characters( $this->text, $pos, $length, $pos, $length );
		return;
	}

	if ( !$this->ignoreErrors ) {
		// Any bare "<" in a data state text node is a parse error.
		// Uniquely to the data state, nulls are just flagged as errors
		// and passed through, they are not replaced.
		$this->handleAsciiErrors( "<\0", $this->text, $pos, $length, 0 );
	}

	$expanded = $this->handleCharRefs( substr( $this->text, $pos, $length ), $pos );
	$this->listener->characters( $expanded, 0, strlen( $expanded ), $pos, $length );
}
/**
 * Emit a range of characters from the input text, with validity rules as
 * per the CDATA section state. The text is passed to the listener by
 * offset, without any character reference or null processing.
 *
 * @param int $innerPos The position just after the CDATA section opener
 * @param int $innerLength The length of the section contents, not
 *   including the terminating "]]>"
 * @param int $outerPos The position of the start of the CDATA section
 *   opener
 * @param int $outerLength The length of the whole input region being
 *   emitted, including the opener and any terminator
 */
protected function emitCdataRange( $innerPos, $innerLength, $outerPos, $outerLength ) {
	$this->listener->characters( $this->text, $innerPos, $innerLength,
		$outerPos, $outerLength );
}
/**
 * Emit a range of characters from the input text on behalf of the RCDATA,
 * RAWTEXT, script data or PLAINTEXT states. These states differ only in
 * whether character references are expanded, so that is a parameter.
 *
 * @param bool $ignoreCharRefs True to leave character references alone
 * @param int $pos The input position
 * @param int $length The length of the range to be emitted
 */
protected function emitRawTextRange( $ignoreCharRefs, $pos, $length ) {
	if ( $length === 0 ) {
		return;
	}

	// References are expanded only if neither the caller nor the
	// configuration asks for them to be ignored
	$expandCharRefs = !$ignoreCharRefs && !$this->ignoreCharRefs;

	if ( !$expandCharRefs && $this->ignoreNulls ) {
		// Nothing to rewrite, so pass the input through by offset
		$this->listener->characters( $this->text, $pos, $length, $pos, $length );
		return;
	}

	$chunk = substr( $this->text, $pos, $length );
	if ( $expandCharRefs ) {
		$chunk = $this->handleCharRefs( $chunk, $pos );
	}
	$chunk = $this->handleNulls( $chunk, $pos );
	$this->listener->characters( $chunk, 0, strlen( $chunk ), $pos, $length );
}
/**
 * The entry point for the RCDATA and RAWTEXT states.
 *
 * Text is emitted up to the next end tag whose name is the appropriate end
 * tag, and that tag is then handed to handleAttribsAndClose(). If there is
 * no appropriate end tag, or no matching end tag is found, the rest of the
 * input is emitted as text and the EOF state is returned.
 *
 * @param bool $ignoreCharRefs True to ignore character references regardless
 *   of configuration, false to respect the configuration.
 * @return int The next state index
 */
protected function textElementState( $ignoreCharRefs ) {
	if ( $this->appropriateEndTag === null ) {
		$this->emitRawTextRange( $ignoreCharRefs, $this->pos, $this->length - $this->pos );
		$this->pos = $this->length;
		return self::STATE_EOF;
	}

	// The pattern must start at the "</" of an end tag; matching the bare
	// name would treat ordinary text containing the tag name as an end tag.
	// The name is quoted so that it cannot be interpreted as pattern syntax.
	$endTagName = preg_quote( $this->appropriateEndTag, '~' );
	$re = "~</
		{$endTagName}
		# Assert that the end tag name state is exited appropriately,
		# since the anything else case leads to the tag being treated as
		# a literal
		(?=[\t\n\f />])
		~ix";

	do {
		$count = preg_match( $re, $this->text, $m, PREG_OFFSET_CAPTURE, $this->pos );
		if ( $count === false ) {
			$this->throwPregError();
		} elseif ( !$count ) {
			// Text runs to end
			$this->emitRawTextRange( $ignoreCharRefs, $this->pos, $this->length - $this->pos );
			$this->pos = $this->length;
			return self::STATE_EOF;
		}
		$startPos = $m[0][1];

		// Emit text before tag
		$this->emitRawTextRange( $ignoreCharRefs, $this->pos, $startPos - $this->pos );

		$matchLength = strlen( $m[0][0] );
		$this->pos = $startPos + $matchLength;
		$nextState = $this->handleAttribsAndClose( self::STATE_RCDATA,
			$this->appropriateEndTag, true, $startPos );
		// Keep scanning for as long as the returned state is the one passed in
	} while ( $nextState === self::STATE_RCDATA );

	return $nextState;
}
/**
 * Advance $this->pos, consuming all tag attributes found at the current
 * position. The new position will be at the end of the tag or at the end
 * of the input string.
 *
 * To improve performance of consumers which don't need to read the
 * attribute array, interpretation of the PCRE match results is deferred.
 *
 * - @todo: Make deferral configurable.
 * - @todo: Measure performance improvement, assess whether the LazyAttributes
 *   feature is warranted.
 *
 * @return LazyAttributes|PlainAttributes A LazyAttributes object wrapping
 *   the raw match data if at least one attribute matched, otherwise an empty
 *   PlainAttributes object
 */
protected function consumeAttribs() {
$re = '~
[\t\n\f ]*+ # Ignored whitespace before attribute name
(?! /> ) # Do not consume self-closing end of tag
(?! > ) # Do not consume normal closing bracket
(?:
# Before attribute name state
# A bare slash at this point, not part of a self-closing end tag, is
# consumed and ignored (with a parse error), returning to the before
# attribute name state.
( / ) | # 1. Bare slash
# Attribute name state
# Note that the first character can be an equals sign, this is a parse error
# but still generates an attribute called "=". Thus the only way the match
# could fail here is due to EOF.
( [^\t\n\f />] [^\t\n\f =/>]*+ ) # 2. Attribute name
# After attribute name state
[\t\n\f ]*
(?:
=
# Before attribute value state
# Ignore whitespace
[\t\n\f ]*+
(?:
# If an end-quote is omitted, the attribute will run to the end of the
# string, leaving no closing bracket. So the caller will detect the
# unexpected EOF and will not emit the tag, which is correct.
" ( [^"]*+ ) "? | # 3. Double-quoted attribute value
\' ( [^\']*+ ) \'? | # 4. Single-quoted attribute value
( [^\t\n\f >]*+ ) # 5. Unquoted attribute value
)
# Or nothing: an attribute with an empty value. The attribute name was
# terminated by a slash, closing bracket or EOF
|
)
)
# The /A modifier causes preg_match_all to give contiguous chunks
~xA';
$count = preg_match_all( $re, $this->text, $m,
PREG_SET_ORDER | PREG_OFFSET_CAPTURE, $this->pos );
if ( $count === false ) {
$this->throwPregError();
} elseif ( $count ) {
// Move to the end of the last attribute matched
$this->pos = $m[$count - 1][0][1] + strlen( $m[$count - 1][0][0] );
// The matches are only interpreted if the attributes are actually read
$attribs = new LazyAttributes( $m, function ( $m ) {
return $this->interpretAttribMatches( $m );
} );
} else {
$attribs = new PlainAttributes();
}
// Consume trailing whitespace. This is strictly part of the before attribute
// name state, but we didn't consume it in the regex since we used a principle
// of one match equals one attribute.
$this->pos += strspn( $this->text, "\t\n\f ", $this->pos );
return $attribs;
}
/**
 * Convert the match sets produced by the attribute preg_match_all() into an
 * associative array of attribute values keyed by lower-case name, raising
 * parse errors along the way. When a name is repeated, the first value is
 * kept and an error is raised for each later one.
 *
 * @param array $matches
 * @return array
 */
protected function interpretAttribMatches( $matches ) {
	$result = [];

	foreach ( $matches as $match ) {
		$slash = $match[self::MA_SLASH];
		if ( strlen( $slash[0] ) ) {
			$this->error( 'unexpected bare slash', $slash[1] );
			continue;
		}

		list( $rawName, $namePos ) = $match[self::MA_NAME];
		if ( !$this->ignoreErrors ) {
			$this->handleAsciiErrors( "\"'<=", $rawName, 0, strlen( $rawName ), $namePos );
		}
		$name = $this->ignoreNulls ? $rawName : $this->handleNulls( $rawName, $namePos );
		$name = strtolower( $name );

		// Work out which kind of value matched, if any. $quote stays empty
		// for unquoted and absent values.
		$quote = '';
		$value = '';
		if ( isset( $match[self::MA_DQUOTED] ) && $match[self::MA_DQUOTED][1] >= 0 ) {
			// Double-quoted attribute value
			$quote = '"';
			list( $value, $pos ) = $match[self::MA_DQUOTED];
		} elseif ( isset( $match[self::MA_SQUOTED] ) && $match[self::MA_SQUOTED][1] >= 0 ) {
			// Single-quoted attribute value
			$quote = "'";
			list( $value, $pos ) = $match[self::MA_SQUOTED];
		} elseif ( isset( $match[self::MA_UNQUOTED] ) && $match[self::MA_UNQUOTED][1] >= 0 ) {
			// Unquoted attribute value
			list( $value, $pos ) = $match[self::MA_UNQUOTED];
			// Search for parse errors
			if ( !$this->ignoreErrors ) {
				if ( $value === '' ) {
					// ">" in the before attribute value state is a parse error
					$this->error( 'empty unquoted attribute', $pos );
				}
				$this->handleAsciiErrors( "\"'<=`", $value, 0, strlen( $value ), $pos );
			}
		}

		if ( $quote !== '' && !$this->ignoreErrors ) {
			// After attribute value (quoted) state
			// Quoted attributes must be followed by a space, "/" or ">"
			$afterPos = $match[0][1] + strlen( $match[0][0] );
			if ( $afterPos < $this->length
				&& !preg_match( '~^[\t\n\f />]~', $this->text[$afterPos] )
			) {
				$this->error( 'missing space between attributes', $afterPos );
			}
		}

		if ( $value !== '' ) {
			if ( !$this->ignoreNulls ) {
				$value = $this->handleNulls( $value, $pos );
			}
			if ( !$this->ignoreCharRefs ) {
				$value = $this->handleCharRefs( $value, $pos, true, $quote );
			}
		}

		if ( !isset( $result[$name] ) ) {
			$result[$name] = $value;
		} else {
			$this->error( "duplicate attribute", $match[0][1] );
		}
	}

	return $result;
}
/**
 * Consume attributes, and the closing bracket which follows attributes.
 * Emit the appropriate tag event, or in the case of broken attributes in
 * text states, emit characters.
 *
 * On return, the input position is just past the closing bracket when a tag
 * event was emitted. It is left where consumeAttribs() stopped when literal
 * characters were emitted or the end of the input was reached.
 *
 * @param integer $state The current state
 * @param string $tagName The normalized tag name
 * @param bool $isEndTag True if this is an end tag, false if it is a start tag
 * @param integer $startPos The input position of the start of the current tag.
 * @return integer The next state
 */
protected function handleAttribsAndClose( $state, $tagName, $isEndTag, $startPos ) {
	$attribStartPos = $this->pos;
	$attribs = $this->consumeAttribs();
	$pos = $this->pos;

	// Literal characters are emitted on EOF or "anything else" from the
	// end tag substates of the text states.
	// (spec ref 8.2.4 sections 11-19, 25-27)
	$isDataState = $state === self::STATE_DATA;
	// "Literal" means nothing at all was consumed after the tag name while
	// in a state other than the data state.
	$isLiteral = $attribStartPos === $pos && !$isDataState;

	if ( $pos >= $this->length ) {
		$this->error( 'unexpected end of file inside tag' );
		if ( $isLiteral ) {
			// Emit everything from the start of the tag to the end of input
			$this->listener->characters( $this->text,
				$startPos, $this->length - $startPos,
				$startPos, $this->length - $startPos );
		}
		return self::STATE_EOF;
	}

	if ( $isEndTag && !$this->ignoreErrors && $attribs->count() ) {
		$this->error( 'end tag has an attribute' );
	}

	$selfClose = false;
	if ( $this->text[$pos] === '/'
		// Only $pos itself is known to be in range here, so check the
		// following offset before reading it.
		&& $pos + 1 < $this->length
		&& $this->text[$pos + 1] === '>'
	) {
		$pos += 2;
		$selfClose = true;
	} elseif ( $this->text[$pos] === '>' ) {
		$pos++;
	} elseif ( $isLiteral ) {
		// Not a real tag: emit the text up to the point where attributes
		// would have started, and remain in the caller's state.
		$this->listener->characters( $this->text,
			$startPos, $attribStartPos - $startPos,
			$startPos, $attribStartPos - $startPos );
		return $state;
	} else {
		// fatal() always throws, so execution does not continue past here
		$this->fatal( 'failed to find an already-matched ">"' );
	}

	$this->pos = $pos;
	if ( $isEndTag ) {
		if ( $selfClose ) {
			$this->error( 'self-closing end tag' );
		}
		$this->listener->endTag( $tagName, $startPos, $pos - $startPos );
	} else {
		$this->listener->startTag( $tagName, $attribs, $selfClose,
			$startPos, $pos - $startPos );
	}
	return self::STATE_DATA;
}
/**
 * Handle the PLAINTEXT state: everything from the current position to the
 * end of the input is handed to emitRawTextRange() in a single call.
 * @return integer The next state index, which is always STATE_EOF
 */
protected function plaintextState() {
	$start = $this->pos;
	$remaining = $this->length - $start;
	$this->emitRawTextRange( true, $start, $remaining );
	return self::STATE_EOF;
}
/**
 * Process input text in the script data state.
 *
 * Each loop iteration runs one regex match from the current position,
 * passes the text part of the match to emitRawTextRange(), and, if capture
 * group 1 (the appropriate end tag) took part in the match, calls
 * handleAttribsAndClose() to deal with the remainder of that end tag. The
 * loop repeats for as long as that call returns STATE_SCRIPT_DATA.
 *
 * If no appropriate end tag has been set, the position is moved to the end
 * of the input and STATE_EOF is returned without emitting anything.
 *
 * @return integer The next state index
 */
protected function scriptDataState() {
// Without an appropriate end tag, skip straight to the end of the input
if ( $this->appropriateEndTag === null ) {
$this->pos = $this->length;
return self::STATE_EOF;
}
// NOTE(review): as rendered here, the heredoc opener and parts of the
// pattern below look incomplete; confirm against the canonical source
// before editing the pattern.
$re = <<appropriateEndTag}
# If we hit the "anything else" case in the script data
# end tag name state, don't exit
(?= [\t\n\f />] )
) | # 1. Appropriate end tag
) |
(?= {$this->appropriateEndTag} [\t\n\f />] ) |
]
# Script data double escaped state
.*?
(?:
$ |
# Stop at, but do not consume, comment-close
(?= --> ) |
]
)
)
)*
# Consume the comment close which exited the inner loop, if any
(?: --> )?
)
)*+
~xsiA
REGEX;
do {
$count = preg_match( $re, $this->text, $m, 0, $this->pos );
if ( $count === false ) {
// preg_match() itself failed; throwPregError() raises a TokenizerError
$this->throwPregError();
} elseif ( !$count ) {
// No match at all is treated as an internal error
$this->fatal( 'unexpected regex failure: this pattern can match zero characters' );
}
$startPos = $this->pos;
$matchLength = strlen( $m[0] );
// $m[1] is only present when the end tag group took part in the match
$endTagLength = isset( $m[1] ) ? strlen( $m[1] ) : 0;
// Everything in the match other than the end tag is emitted as raw text
$textLength = $matchLength - $endTagLength;
$this->emitRawTextRange( true, $startPos, $textLength );
$this->pos = $startPos + $matchLength;
// This arithmetic places the end tag directly after the emitted text,
// i.e. it treats group 1 as the final part of the match
$tagStartPos = $startPos + $textLength;
if ( $endTagLength ) {
$nextState = $this->handleAttribsAndClose( self::STATE_SCRIPT_DATA,
$this->appropriateEndTag, true, $tagStartPos );
} else {
// No end tag matched, so tokenization is finished
$nextState = self::STATE_EOF;
}
// handleAttribsAndClose() returns the state it was given when the
// candidate end tag turned out to be literal text; keep scanning then
} while ( $nextState === self::STATE_SCRIPT_DATA );
return $nextState;
}
/**
 * Report a parse error to the listener, unless errors are being ignored.
 * @param string $text The error message
 * @param integer|null $pos The error position, or null to use the current position
 */
protected function error( $text, $pos = null ) {
	if ( $this->ignoreErrors ) {
		return;
	}
	$errorPos = $pos === null ? $this->pos : $pos;
	$this->listener->error( $text, $errorPos );
}
/**
 * Abort with a TokenizerError whose message is prefixed with the class
 * name. This is used for API errors and assertion-like sanity checks.
 * @param string $text The error message
 */
protected function fatal( $text ) {
	$message = __CLASS__ . ": $text";
	throw new TokenizerError( $message );
}
/**
 * Translate preg_last_error() into a message and throw a TokenizerError.
 * Call this when preg_match() or a similar function has returned false.
 *
 * Notes for users:
 *
 * - PCRE internal error: prior to PHP 7 this may be caused by JIT stack
 *   space exhaustion due to excessive recursion. Increase stack space.
 *
 * - pcre.backtrack_limit exhausted: the limit should be at least double
 *   the input size, and the defaults are way too small. Increase it in
 *   configuration.
 */
protected function throwPregError() {
	// Known error codes and their messages. Codes absent from this table,
	// including the bad-UTF-8 ones, get the generic message below.
	$messages = [
		PREG_NO_ERROR => "PCRE returned false but gave PREG_NO_ERROR",
		PREG_INTERNAL_ERROR => "PCRE internal error",
		PREG_BACKTRACK_LIMIT_ERROR => "pcre.backtrack_limit exhausted",
		PREG_RECURSION_LIMIT_ERROR => "pcre.recursion_limit exhausted",
	];
	// This constant is not defined by every PHP version, so add it only
	// when available. The union keeps any entry already in the table.
	if ( defined( 'PREG_JIT_STACKLIMIT_ERROR' ) ) {
		$messages += [ PREG_JIT_STACKLIMIT_ERROR => "PCRE JIT stack space exhausted" ];
	}

	$code = preg_last_error();
	$msg = isset( $messages[$code] ) ? $messages[$code] : "PCRE unexpected error";
	throw new TokenizerError( __CLASS__.": $msg" );
}
}