X-Git-Url: https://scripts.mit.edu/gitweb/autoinstallsdev/mediawiki.git/blobdiff_plain/19e297c21b10b1b8a3acad5e73fc71dcb35db44a..6932310fd58ebef145fa01eb76edf7150284d8ea:/vendor/wikimedia/css-sanitizer/src/Parser/DataSourceTokenizer.php diff --git a/vendor/wikimedia/css-sanitizer/src/Parser/DataSourceTokenizer.php b/vendor/wikimedia/css-sanitizer/src/Parser/DataSourceTokenizer.php new file mode 100644 index 00000000..3fc0998a --- /dev/null +++ b/vendor/wikimedia/css-sanitizer/src/Parser/DataSourceTokenizer.php @@ -0,0 +1,957 @@ +source = $source; + }

	/**
	 * Fetch the next character from the data source, normalized as the
	 * specification's input preprocessing step requires.
	 *
	 * U+0000 is replaced by U+FFFD; U+000C, a lone U+000D and the pair
	 * U+000D U+000A each come back as a single U+000A.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#input-preprocessing
	 * @return string One UTF-8 character, or empty string on EOF
	 */
	protected function nextChar() {
		$ch = $this->source->readCharacter();

		if ( $ch === "\r" ) {
			// A carriage return swallows a directly following line feed.
			// Anything else that was read is handed back to the source.
			$following = $this->source->readCharacter();
			if ( $following !== "\n" ) {
				$this->source->putBackCharacter( $following );
			}
			return "\n";
		}

		if ( $ch === "\f" ) {
			// Form feed counts as a newline
			return "\n";
		}

		// NUL is never passed through; everything else is returned untouched
		return $ch === "\0" ? \UtfNormal\Constants::UTF8_REPLACEMENT : $ch;
	}

	/**
	 * Advance by one character.
	 *
	 * The line/column counters are moved past the character that was current
	 * until now, then the current character is replaced by the next one from
	 * the source and the character after that is peeked (read and put back).
	 */
	protected function consumeCharacter() {
		$previous = $this->currentCharacter;

		if ( $previous === "\n" ) {
			// Leaving a newline: start counting columns on a new line
			$this->line += 1;
			$this->pos = 1;
		} elseif ( $previous !== DataSource::EOF ) {
			// The position does not advance once EOF has been reached
			$this->pos += 1;
		}

		$this->currentCharacter = $this->nextChar();

		$peeked = $this->nextChar();
		$this->nextCharacter = $peeked;
		$this->source->putBackCharacter( $peeked );
	}

	/**
	 * Reconsume the next character
	 *
	 * In more normal terms, this pushes a character back onto the data source
	 * so it will be read again for the next call to self::consumeCharacter().
+ */ + protected function reconsumeCharacter() { + // @codeCoverageIgnoreStart + if ( !is_string( $this->currentCharacter ) ) { + throw new \UnexpectedValueException( "[$this->line:$this->pos] Can't reconsume" ); + } + // @codeCoverageIgnoreEnd + + if ( $this->currentCharacter === DataSource::EOF ) { + // Huh? + return; + } + + $this->source->putBackCharacter( $this->currentCharacter ); + $this->nextCharacter = $this->currentCharacter; + $this->currentCharacter = (object)[]; + $this->pos--; + } + + /** + * Look ahead at the next three characters + * @return string[] Three characters + */ + protected function lookAhead() { + $ret = [ + $this->nextChar(), + $this->nextChar(), + $this->nextChar(), + ]; + $this->source->putBackCharacter( $ret[2] ); + $this->source->putBackCharacter( $ret[1] ); + $this->source->putBackCharacter( $ret[0] ); + + return $ret; + } + + public function getParseErrors() { + return $this->parseErrors; + } + + public function clearParseErrors() { + $this->parseErrors = []; + } + + /** + * Record a parse error + * @param string $tag Error tag + * @param array|null $position Report the error as starting at this + * position instead of at the current position. + * @param array $data Extra data about the error. 
+ */ + protected function parseError( $tag, array $position = null, array $data = [] ) { + if ( $position ) { + if ( isset( $position['position'] ) ) { + $position = $position['position']; + } + if ( count( $position ) !== 2 || !is_int( $position[0] ) || !is_int( $position[1] ) ) { + // @codeCoverageIgnoreStart + throw new InvalidArgumentException( 'Invalid position' ); + // @codeCoverageIgnoreEnd + } + $err = [ $tag, $position[0], $position[1] ]; + } else { + $err = [ $tag, $this->line, $this->pos ]; + } + $this->parseErrors[] = array_merge( $err, $data ); + } + + /** + * Read a token from the data source + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-token + * @return Token + */ + public function consumeToken() { + $this->consumeCharacter(); + $pos = [ 'position' => [ $this->line, $this->pos ] ]; + + switch ( (string)$this->currentCharacter ) { + case "\n": + case "\t": + case ' ': + // Whitespace token + while ( self::isWhitespace( $this->nextCharacter ) ) { + $this->consumeCharacter(); + } + return new Token( Token::T_WHITESPACE, $pos ); + + case '"': + case '\'': + // String token + return $this->consumeStringToken( $this->currentCharacter, $pos ); + + case '#': + list( $next, $next2, $next3 ) = $this->lookAhead(); + if ( self::isNameCharacter( $this->nextCharacter ) || + self::isValidEscape( $next, $next2 ) + ) { + return new Token( Token::T_HASH, $pos + [ + 'typeFlag' => self::wouldStartIdentifier( $next, $next2, $next3 ) ? 
'id' : 'unrestricted', + 'value' => $this->consumeName(), + ] ); + } + + return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); + + case '$': + if ( $this->nextCharacter === '=' ) { + $this->consumeCharacter(); + return new Token( Token::T_SUFFIX_MATCH, $pos ); + } + + return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); + + case '(': + return new Token( Token::T_LEFT_PAREN, $pos ); + + case ')': + return new Token( Token::T_RIGHT_PAREN, $pos ); + + case '*': + if ( $this->nextCharacter === '=' ) { + $this->consumeCharacter(); + return new Token( Token::T_SUBSTRING_MATCH, $pos ); + } + + return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); + + case '+': + case '.': + list( $next, $next2, $next3 ) = $this->lookAhead(); + if ( self::wouldStartNumber( $this->currentCharacter, $next, $next2 ) ) { + $this->reconsumeCharacter(); + return $this->consumeNumericToken( $pos ); + } + + return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); + + case ',': + return new Token( Token::T_COMMA, $pos ); + + case '-': + list( $next, $next2, $next3 ) = $this->lookAhead(); + if ( self::wouldStartNumber( $this->currentCharacter, $next, $next2 ) ) { + $this->reconsumeCharacter(); + return $this->consumeNumericToken( $pos ); + } + + if ( $next === '-' && $next2 === '>' ) { + $this->consumeCharacter(); + $this->consumeCharacter(); + return new Token( Token::T_CDC, $pos ); + } + + if ( self::wouldStartIdentifier( $this->currentCharacter, $next, $next2 ) ) { + $this->reconsumeCharacter(); + return $this->consumeIdentLikeToken( $pos ); + } + + return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); + + case '/': + if ( $this->nextCharacter === '*' ) { + $this->consumeCharacter(); + $this->consumeCharacter(); + while ( $this->currentCharacter !== DataSource::EOF && + !( $this->currentCharacter === '*' && $this->nextCharacter === '/' ) + ) { + 
$this->consumeCharacter(); + } + if ( $this->currentCharacter === DataSource::EOF ) { + // Parse error from the editor's draft as of 2017-01-06 + $this->parseError( 'unclosed-comment', $pos ); + } + $this->consumeCharacter(); + return $this->consumeToken(); + } + + return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); + + case ':': + return new Token( Token::T_COLON, $pos ); + + case ';': + return new Token( Token::T_SEMICOLON, $pos ); + + case '<': + list( $next, $next2, $next3 ) = $this->lookAhead(); + if ( $next === '!' && $next2 === '-' && $next3 === '-' ) { + $this->consumeCharacter(); + $this->consumeCharacter(); + $this->consumeCharacter(); + return new Token( Token::T_CDO, $pos ); + } + + return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); + + case '@': + list( $next, $next2, $next3 ) = $this->lookAhead(); + if ( self::wouldStartIdentifier( $next, $next2, $next3 ) ) { + return new Token( Token::T_AT_KEYWORD, $pos + [ 'value' => $this->consumeName() ] ); + } + + return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); + + case '[': + return new Token( Token::T_LEFT_BRACKET, $pos ); + + case '\\': + if ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) { + $this->reconsumeCharacter(); + return $this->consumeIdentLikeToken( $pos ); + } + + $this->parseError( 'bad-escape' ); + return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); + + case ']': + return new Token( Token::T_RIGHT_BRACKET, $pos ); + + case '^': + if ( $this->nextCharacter === '=' ) { + $this->consumeCharacter(); + return new Token( Token::T_PREFIX_MATCH, $pos ); + } + + return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); + + case '{': + return new Token( Token::T_LEFT_BRACE, $pos ); + + case '}': + return new Token( Token::T_RIGHT_BRACE, $pos ); + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + 
case '7': + case '8': + case '9': + $this->reconsumeCharacter(); + return $this->consumeNumericToken( $pos ); + + case 'u': + case 'U': + if ( $this->nextCharacter === '+' ) { + list( $next, $next2 ) = $this->lookAhead(); + if ( self::isHexDigit( $next2 ) || $next2 === '?' ) { + $this->consumeCharacter(); + return $this->consumeUnicodeRangeToken( $pos ); + } + } + + $this->reconsumeCharacter(); + return $this->consumeIdentLikeToken( $pos ); + + case '|': + if ( $this->nextCharacter === '=' ) { + $this->consumeCharacter(); + return new Token( Token::T_DASH_MATCH, $pos ); + } + + if ( $this->nextCharacter === '|' ) { + $this->consumeCharacter(); + return new Token( Token::T_COLUMN, $pos ); + } + + return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); + + case '~': + if ( $this->nextCharacter === '=' ) { + $this->consumeCharacter(); + return new Token( Token::T_INCLUDE_MATCH, $pos ); + } + + return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); + + case DataSource::EOF: + return new Token( Token::T_EOF, $pos ); + + default: + if ( self::isNameStartCharacter( $this->currentCharacter ) ) { + $this->reconsumeCharacter(); + return $this->consumeIdentLikeToken( $pos ); + } + + return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); + } + } + + /** + * Consume a numeric token + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-numeric-token + * @param array $data Data for the new token (typically contains just 'position') + * @return Token + */ + protected function consumeNumericToken( array $data ) { + list( $data['representation'], $data['value'], $data['typeFlag'] ) = $this->consumeNumber(); + + list( $next, $next2, $next3 ) = $this->lookAhead(); + if ( self::wouldStartIdentifier( $next, $next2, $next3 ) ) { + return new Token( Token::T_DIMENSION, $data + [ 'unit' => $this->consumeName() ] ); + } elseif ( $this->nextCharacter === '%' ) { + $this->consumeCharacter(); + 
return new Token( Token::T_PERCENTAGE, $data ); + } else { + return new Token( Token::T_NUMBER, $data ); + } + } + + /** + * Consume an ident-like token + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-an-ident-like-token + * @note Per the draft as of January 2017, quoted URLs are parsed as + * functions named 'url'. This is needed in order to implement the `` + * type in the [Values specification](https://www.w3.org/TR/2016/CR-css-values-3-20160929/#urls). + * @param array $data Data for the new token (typically contains just 'position') + * @return Token + */ + protected function consumeIdentLikeToken( array $data ) { + $name = $this->consumeName(); + + if ( $this->nextCharacter === '(' ) { + $this->consumeCharacter(); + + if ( !strcasecmp( $name, 'url' ) ) { + while ( true ) { + list( $next, $next2 ) = $this->lookAhead(); + if ( !self::isWhitespace( $next ) || !self::isWhitespace( $next2 ) ) { + break; + } + $this->consumeCharacter(); + } + if ( $next !== '"' && $next !== '\'' && + !( self::isWhitespace( $next ) && ( $next2 === '"' || $next2=== '\'' ) ) + ) { + return $this->consumeUrlToken( $data ); + } + } + + return new Token( Token::T_FUNCTION, $data + [ 'value' => $name ] ); + } + + return new Token( Token::T_IDENT, $data + [ 'value' => $name ] ); + } + + /** + * Consume a string token + * + * This assumes the leading quote or apostrophe has already been consumed. 
+ * + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-string-token + * @param string $endChar Ending character of the string + * @param array $data Data for the new token (typically contains just 'position') + * @return Token + */ + protected function consumeStringToken( $endChar, array $data ) { + $data['value'] = ''; + + while ( true ) { + $this->consumeCharacter(); + switch ( $this->currentCharacter ) { + case DataSource::EOF: + // Parse error from the editor's draft as of 2017-01-06 + $this->parseError( 'unclosed-string', $data ); + break 2; + + case $endChar: + break 2; + + case "\n": + $this->parseError( 'newline-in-string' ); + $this->reconsumeCharacter(); + return new Token( Token::T_BAD_STRING, [ 'value' => '' ] + $data ); + + case '\\': + if ( $this->nextCharacter === DataSource::EOF ) { + // Do nothing + // Parse error from the editor's draft as of 2017-01-06 + $this->parseError( 'bad-escape' ); + } elseif ( $this->nextCharacter === "\n" ) { + // Consume it + $this->consumeCharacter(); + } elseif ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) { + $data['value'] .= $this->consumeEscape(); + } else { + // @codeCoverageIgnoreStart + throw new \UnexpectedValueException( "[$this->line:$this->pos] Unexpected state" ); + // @codeCoverageIgnoreEnd + } + break; + + default: + $data['value'] .= $this->currentCharacter; + break; + } + } + + return new Token( Token::T_STRING, $data ); + } + + /** + * Consume a URL token + * + * This assumes the leading "url(" has already been consumed. + * + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-url-token + * @note Per the draft as of January 2017, this does not handle quoted URL tokens. + * @param array $data Data for the new token (typically contains just 'position') + * @return Token + */ + protected function consumeUrlToken( array $data ) { + // 1. + $data['value'] = ''; + + // 2. 
+ while ( self::isWhitespace( $this->nextCharacter ) ) { + $this->consumeCharacter(); + } + + // 3. + if ( $this->nextCharacter === DataSource::EOF ) { + // Parse error from the editor's draft as of 2017-01-06 + $this->parseError( 'unclosed-url', $data ); + return new Token( Token::T_URL, $data ); + } + + // 4. (removed in draft, this was formerly the parsing for a quoted URL token) + + // 5. (renumbered as 4 in the draft) + while ( true ) { + $this->consumeCharacter(); + switch ( $this->currentCharacter ) { + case DataSource::EOF: + // Parse error from the editor's draft as of 2017-01-06 + $this->parseError( 'unclosed-url', $data ); + break 2; + + case ')': + break 2; + + case "\n": + case "\t": + case ' ': + while ( self::isWhitespace( $this->nextCharacter ) ) { + $this->consumeCharacter(); + } + if ( $this->nextCharacter === ')' ) { + $this->consumeCharacter(); + break 2; + } elseif ( $this->nextCharacter === DataSource::EOF ) { + // Parse error from the editor's draft as of 2017-01-06 + $this->consumeCharacter(); + $this->parseError( 'unclosed-url', $data ); + break 2; + } else { + $this->consumeBadUrlRemnants(); + return new Token( Token::T_BAD_URL, [ 'value' => '' ] + $data ); + } + break; + + case '"': + case '\'': + case '(': + $this->parseError( 'bad-character-in-url' ); + $this->consumeBadUrlRemnants(); + return new Token( Token::T_BAD_URL, [ 'value' => '' ] + $data ); + + case '\\': + if ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) { + $data['value'] .= $this->consumeEscape(); + } else { + $this->parseError( 'bad-escape' ); + $this->consumeBadUrlRemnants(); + return new Token( Token::T_BAD_URL, [ 'value' => '' ] + $data ); + } + break; + + default: + if ( self::isNonPrintable( $this->currentCharacter ) ) { + $this->parseError( 'bad-character-in-url' ); + $this->consumeBadUrlRemnants(); + return new Token( Token::T_BAD_URL, [ 'value' => '' ] + $data ); + } + + $data['value'] .= $this->currentCharacter; + break; + } + } + + 
return new Token( Token::T_URL, $data ); + } + + /** + * Clean up after finding an error in a URL + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-the-remnants-of-a-bad-url + */ + protected function consumeBadUrlRemnants() { + while ( true ) { + $this->consumeCharacter(); + if ( $this->currentCharacter === ')' || $this->currentCharacter === DataSource::EOF ) { + break; + } + if ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) { + $this->consumeEscape(); + } + } + } + + /** + * Consume a unicode-range token + * + * This assumes the initial "u" has been consumed (currentCharacter is the '+'), + * and the next codepoint is verfied to be a hex digit or "?". + * + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-unicode-range-token + * @param array $data Data for the new token (typically contains just 'position') + * @return Token + */ + protected function consumeUnicodeRangeToken( array $data ) { + // 1. + $v = ''; + while ( strlen( $v ) < 6 && self::isHexDigit( $this->nextCharacter ) ) { + $this->consumeCharacter(); + $v .= $this->currentCharacter; + } + $anyQ = false; + while ( strlen( $v ) < 6 && $this->nextCharacter === '?' ) { + $anyQ = true; + $this->consumeCharacter(); + $v .= $this->currentCharacter; + } + + if ( $anyQ ) { + return new Token( Token::T_UNICODE_RANGE, $data + [ + 'start' => intval( str_replace( '?', '0', $v ), 16 ), + 'end' => intval( str_replace( '?', 'F', $v ), 16 ), + ] ); + } + + $data['start'] = intval( $v, 16 ); + + // 2. + list( $next, $next2 ) = $this->lookAhead(); + if ( $next === '-' && self::isHexDigit( $next2 ) ) { + $this->consumeCharacter(); + $v = ''; + while ( strlen( $v ) < 6 && self::isHexDigit( $this->nextCharacter ) ) { + $this->consumeCharacter(); + $v .= $this->currentCharacter; + } + $data['end'] = intval( $v, 16 ); + } else { + // 3. + $data['end'] = $data['start']; + } + + // 4. 
+ return new Token( Token::T_UNICODE_RANGE, $data ); + } + + /** + * Indicate if a character is whitespace + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#whitespace + * @param string $char A single UTF-8 character + * @return bool + */ + protected static function isWhitespace( $char ) { + return $char === "\n" || $char === "\t" || $char === " "; + } + + /** + * Indicate if a character is a name-start code point + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#name-start-code-point + * @param string $char A single UTF-8 character + * @return bool + */ + protected static function isNameStartCharacter( $char ) { + // Every non-ASCII character is a name start character, so we can just + // check the first byte. + $char = ord( $char ); + return $char >= 0x41 && $char <= 0x5a || + $char >= 0x61 && $char <= 0x7a || + $char >= 0x80 || $char === 0x5f; + } + + /** + * Indicate if a character is a name code point + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#name-code-point + * @param string $char A single UTF-8 character + * @return bool + */ + protected static function isNameCharacter( $char ) { + // Every non-ASCII character is a name character, so we can just check + // the first byte. + $char = ord( $char ); + return $char >= 0x41 && $char <= 0x5a || + $char >= 0x61 && $char <= 0x7a || + $char >= 0x30 && $char <= 0x39 || + $char >= 0x80 || $char === 0x5f || $char === 0x2d; + } + + /** + * Indicate if a character is non-printable + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#non-printable-code-point + * @param string $char A single UTF-8 character + * @return bool + */ + protected static function isNonPrintable( $char ) { + // No non-ASCII character is non-printable, so we can just check the + // first byte. 
+ $char = ord( $char ); + return $char >= 0x00 && $char <= 0x08 || + $char === 0x0b || + $char >= 0x0e && $char <= 0x1f || + $char === 0x7f; + } + + /** + * Indicate if a character is a digit + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#digit + * @param string $char A single UTF-8 character + * @return bool + */ + protected static function isDigit( $char ) { + // No non-ASCII character is a digit, so we can just check the first + // byte. + $char = ord( $char ); + return $char >= 0x30 && $char <= 0x39; + } + + /** + * Indicate if a character is a hex digit + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#hex-digit + * @param string $char A single UTF-8 character + * @return bool + */ + protected static function isHexDigit( $char ) { + // No non-ASCII character is a hex digit, so we can just check the + // first byte. + $char = ord( $char ); + return $char >= 0x30 && $char <= 0x39 || + $char >= 0x41 && $char <= 0x46 || + $char >= 0x61 && $char <= 0x66; + } + + /** + * Determine if two characters constitute a valid escape + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#starts-with-a-valid-escape + * @param string $char1 + * @param string $char2 + * @return bool + */ + protected static function isValidEscape( $char1, $char2 ) { + return $char1 === '\\' && $char2 !== "\n"; + } + + /** + * Determine if three characters would start an identifier + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#would-start-an-identifier + * @param string $char1 + * @param string $char2 + * @param string $char3 + * @return bool + */ + protected static function wouldStartIdentifier( $char1, $char2, $char3 ) { + if ( $char1 === '-' ) { + // Added the possibility for an itentifier beginning with "--" per the draft. 
+ return self::isNameStartCharacter( $char2 ) || $char2 === '-' || + self::isValidEscape( $char2, $char3 ); + } elseif ( self::isNameStartCharacter( $char1 ) ) { + return true; + } elseif ( $char1 === '\\' ) { + return self::isValidEscape( $char1, $char2 ); + } else { + return false; + } + } + + /** + * Determine if three characters would start a number + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#starts-with-a-number + * @param string $char1 + * @param string $char2 + * @param string $char3 + * @return bool + */ + protected static function wouldStartNumber( $char1, $char2, $char3 ) { + if ( $char1 === '+' || $char1 === '-' ) { + return self::isDigit( $char2 ) || + $char2 === '.' && self::isDigit( $char3 ); + } elseif ( $char1 === '.' ) { + return self::isDigit( $char2 ); + // @codeCoverageIgnoreStart + // Nothing reaches this code + } else { + return self::isDigit( $char1 ); + } + // @codeCoverageIgnoreEnd + } + + /** + * Consume a valid escape + * + * This assumes the leading backslash is consumed. 
+ * + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-an-escaped-code-point + * @return string Escaped character + */ + protected function consumeEscape() { + $position = [ 'position' => [ $this->line, $this->pos ] ]; + + $this->consumeCharacter(); + + // @codeCoverageIgnoreStart + if ( $this->currentCharacter === "\n" ) { + throw new \UnexpectedValueException( "[$this->line:$this->pos] Unexpected newline" ); + } + // @codeCoverageIgnoreEnd + + // 1-6 hexits, plus one optional whitespace character + if ( self::isHexDigit( $this->currentCharacter ) ) { + $num = $this->currentCharacter; + while ( strlen( $num ) < 6 && self::isHexDigit( $this->nextCharacter ) ) { + $this->consumeCharacter(); + $num .= $this->currentCharacter; + } + if ( self::isWhitespace( $this->nextCharacter ) ) { + $this->consumeCharacter(); + } + + $num = intval( $num, 16 ); + if ( $num === 0 || $num >= 0xd800 && $num <= 0xdfff || $num > 0x10ffff ) { + return \UtfNormal\Constants::UTF8_REPLACEMENT; + } + return \UtfNormal\Utils::codepointToUtf8( $num ); + } + + if ( $this->currentCharacter === DataSource::EOF ) { + // Parse error from the editor's draft as of 2017-01-06 + $this->parseError( 'bad-escape', $position ); + return \UtfNormal\Constants::UTF8_REPLACEMENT; + } + + return $this->currentCharacter; + } + + /** + * Consume a name + * + * Note this does not do validation on the input stream. Call + * self::wouldStartIdentifier() or the like before calling the method if + * necessary. 
+ * + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-name + * @return string Name + */ + protected function consumeName() { + $name = ''; + + while ( true ) { + $this->consumeCharacter(); + + if ( self::isNameCharacter( $this->currentCharacter ) ) { + $name .= $this->currentCharacter; + } elseif ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) { + $name .= $this->consumeEscape(); + } else { + $this->reconsumeCharacter(); // Doesn't say to, but breaks otherwise + return $name; + } + } + // @codeCoverageIgnoreStart + } + // @codeCoverageIgnoreEnd + + /** + * Consume a number + * + * Note this does not do validation on the input stream. Call + * self::wouldStartNumber() before calling the method if necessary. + * + * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-number + * @return array [ string $value, int|float $number, string $type ('integer' or 'number') ] + */ + protected function consumeNumber() { + // 1. + $repr = ''; + $type = 'integer'; + + // 2. + if ( $this->nextCharacter === '+' || $this->nextCharacter === '-' ) { + $this->consumeCharacter(); + $repr .= $this->currentCharacter; + } + + // 3. + while ( self::isDigit( $this->nextCharacter ) ) { + $this->consumeCharacter(); + $repr .= $this->currentCharacter; + } + + // 4. + if ( $this->nextCharacter === '.' ) { + list( $next, $next2, $next3 ) = $this->lookAhead(); + if ( self::isDigit( $next2 ) ) { + // 4.1. + $this->consumeCharacter(); + $this->consumeCharacter(); + // 4.2. + $repr .= $next . $next2; + // 4.3. + $type = 'number'; + // 4.4. + while ( self::isDigit( $this->nextCharacter ) ) { + $this->consumeCharacter(); + $repr .= $this->currentCharacter; + } + } + } + + // 5. + if ( $this->nextCharacter === 'e' || $this->nextCharacter === 'E' ) { + list( $next, $next2, $next3 ) = $this->lookAhead(); + $ok = false; + if ( ( $next2 === '+' || $next2 === '-' ) && self::isDigit( $next3 ) ) { + $ok = true; + // 5.1. 
+ $this->consumeCharacter(); + $this->consumeCharacter(); + $this->consumeCharacter(); + // 5.2. + $repr .= $next . $next2 . $next3; + } elseif ( self::isDigit( $next2 ) ) { + $ok = true; + // 5.1. + $this->consumeCharacter(); + $this->consumeCharacter(); + // 5.2. + $repr .= $next . $next2; + } + if ( $ok ) { + // 5.3. + $type = 'number'; + // 5.4. + while ( self::isDigit( $this->nextCharacter ) ) { + $this->consumeCharacter(); + $repr .= $this->currentCharacter; + } + } + } + + // 6. We assume PHP's casting follows the same rules as + // https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#convert-a-string-to-a-number + $value = $type === 'integer' ? (int)$repr : (float)$repr; + + // 7. + return [ $repr, $value, $type ]; + } +}