--- /dev/null
+<?php
+/**
+ * @file
+ * @license https://opensource.org/licenses/Apache-2.0 Apache-2.0
+ */
+
+namespace Wikimedia\CSS\Parser;
+
+use Wikimedia\CSS\Objects\AtRule;
+use Wikimedia\CSS\Objects\ComponentValueList;
+use Wikimedia\CSS\Objects\ComponentValue;
+use Wikimedia\CSS\Objects\CSSFunction;
+use Wikimedia\CSS\Objects\DeclarationList;
+use Wikimedia\CSS\Objects\DeclarationOrAtRuleList;
+use Wikimedia\CSS\Objects\Declaration;
+use Wikimedia\CSS\Objects\QualifiedRule;
+use Wikimedia\CSS\Objects\Rule;
+use Wikimedia\CSS\Objects\RuleList;
+use Wikimedia\CSS\Objects\SimpleBlock;
+use Wikimedia\CSS\Objects\Stylesheet;
+use Wikimedia\CSS\Objects\Token;
+use Wikimedia\CSS\Sanitizer\Sanitizer;
+
+// Note: While reading the code below, you might find that my calls to
+// consumeToken() don't match what the spec says and that I never "reconsume"
+// a token. The spec is overcomplicated and confused with respect to the
+// "current input token" and the "next input token". In practice things are
+// pretty simple: every "consume an X" is called with the current input token
+// being the first token of X, and returns with the current input token being
+// the last token of X (or EOF if X ends at EOF).
+
+// Also of note is that, since our Tokenizer can only return a stream of tokens
+// rather than a stream of component values, the consume functions here only
+// consider tokens. ComponentValueList::toTokenArray() may be used to convert a
+// list of component values to a list of tokens if necessary.
+
+/**
+ * Parse CSS into a structure for further processing.
+ *
+ * This implements the CSS Syntax Module Level 3 candidate recommendation.
+ * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/
+ *
+ * The usual entry points are:
+ * - Parser::parseStylesheet() to parse a stylesheet or the contents of a <style> tag.
+ * - Parser::parseDeclarationList() to parse an inline style attribute
+ */
+class Parser {
+ /** Maximum depth of nested ComponentValues. Going deeper makes self::consumeComponentValue() record a 'recursion-depth-exceeded' parse error and skip the remaining input. */
+ const CV_DEPTH_LIMIT = 100; // Arbitrary number that seems like it should be enough
+
+ /** @var Tokenizer Source of the tokens fetched by self::consumeToken() */
+ protected $tokenizer;
+
+ /** @var Token|null The most recently consumed token, or null before the first call to self::consumeToken() */
+ protected $currentToken = null;
+
+ /** @var array Parse errors. Errors recorded by self::parseError() are [ string $tag, int $line, int $pos, ... ], where the trailing entries are the extra data passed in; errors copied from the tokenizer are stored as-is. */
+ protected $parseErrors = [];
+
+ /** @var int Recursion depth, incremented on entry to self::consumeComponentValue() and decremented again before it returns */
+ protected $cvDepth = 0;
+
+ /**
+ * Create a Parser that reads its tokens from the given Tokenizer.
+ *
+ * No token is consumed here; each public parse*() method loads the first
+ * token itself.
+ * @param Tokenizer $tokenizer CSS Tokenizer
+ */
+ public function __construct( Tokenizer $tokenizer ) {
+ $this->tokenizer = $tokenizer;
+ }
+
+ /**
+  * Build a Parser that reads CSS from a string
+  * @param string $source CSS to parse.
+  * @param array $options Configuration options, see DataSourceTokenizer::__construct(). Also,
+  *  - convert: (array) If specified, detect the encoding as defined in the
+  *    CSS spec. The value is passed as the $encodings argument to
+  *    Encoder::convert().
+  * @return static
+  */
+ public static function newFromString( $source, array $options = [] ) {
+     // Re-encode the input first when the caller supplied the 'convert' option
+     $css = isset( $options['convert'] )
+         ? Encoder::convert( $source, $options['convert'] )
+         : $source;
+
+     $dataSource = new StringDataSource( $css );
+     return static::newFromDataSource( $dataSource, $options );
+ }
+
+ /**
+  * Build a Parser that tokenizes the given DataSource
+  * @param DataSource $source CSS to parse.
+  * @param array $options Configuration options, see DataSourceTokenizer::__construct().
+  * @return static
+  */
+ public static function newFromDataSource( DataSource $source, array $options = [] ) {
+     // Late static binding so that subclasses get an instance of themselves
+     return new static( new DataSourceTokenizer( $source, $options ) );
+ }
+
+ /**
+  * Build a Parser that replays an already-tokenized list of Tokens
+  * @param Token[] $tokens Token-stream to parse
+  * @param Token|null $eof EOF-token
+  * @return static
+  */
+ public static function newFromTokens( array $tokens, Token $eof = null ) {
+     // Late static binding so that subclasses get an instance of themselves
+     return new static( new TokenListTokenizer( $tokens, $eof ) );
+ }
+
+ /**
+  * Advance to the next token, making it the current token.
+  *
+  * Once the current token is EOF this is a no-op. Any parse errors the
+  * tokenizer recorded are moved into this parser's own error list.
+  */
+ protected function consumeToken() {
+     $current = $this->currentToken;
+     if ( $current && $current->type() === Token::T_EOF ) {
+         // Already at EOF: stay there and do not ask the tokenizer again
+         return;
+     }
+
+     $this->currentToken = $this->tokenizer->consumeToken();
+
+     // Take over whatever errors the tokenizer recorded, then reset its list
+     $tokenizerErrors = $this->tokenizer->getParseErrors();
+     foreach ( $tokenizerErrors as $tokenizerError ) {
+         $this->parseErrors[] = $tokenizerError;
+     }
+     $this->tokenizer->clearParseErrors();
+ }
+
+ /**
+  * Advance to the next token that is not whitespace, skipping over any
+  * whitespace (and comments) in between
+  */
+ protected function consumeTokenAndWhitespace() {
+     // Always move at least one token forward...
+     $this->consumeToken();
+     // ...then keep going for as long as we are sitting on whitespace
+     while ( $this->currentToken->type() === Token::T_WHITESPACE ) {
+         $this->consumeToken();
+     }
+ }
+
+ /**
+ * Return all parse errors seen so far
+ *
+ * This includes errors copied over from the tokenizer by
+ * self::consumeToken() as well as those recorded by self::parseError().
+ * @return array Array of [ string $tag, int $line, int $pos, ... ]
+ */
+ public function getParseErrors() {
+ return $this->parseErrors;
+ }
+
+ /**
+ * Clear parse errors
+ *
+ * Resets this parser's own error list to empty.
+ */
+ public function clearParseErrors() {
+ $this->parseErrors = [];
+ }
+
+ /**
+  * Append a parse error to the error list
+  *
+  * The stored entry is [ $tag, $line, $pos ] followed by the contents of $data.
+  * @param string $tag Error tag
+  * @param Token $token Report the error as starting at this token.
+  * @param array $data Extra data about the error.
+  */
+ protected function parseError( $tag, Token $token, array $data = [] ) {
+     list( $line, $pos ) = $token->getPosition();
+     $entry = [ $tag, $line, $pos ];
+     $this->parseErrors[] = array_merge( $entry, $data );
+ }
+
+ /**
+  * Parse the whole input as a stylesheet
+  * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-stylesheet
+  * @note Per the Editor's Draft, if the first rule is an at-rule named
+  *  "charset" it will be silently dropped. If you're not using the provided
+  *  Sanitizer classes to further sanitize the CSS, you'll want to manually
+  *  filter out any other such rules before stringifying the stylesheet
+  *  and/or prepend `@charset "utf-8";` after stringifying it.
+  * @return Stylesheet
+  */
+ public function parseStylesheet() {
+     // Load the first token, then read rules in top-level mode
+     $this->consumeToken();
+     $rules = $this->consumeRuleList( true );
+
+     // Drop @charset per the Editor's Draft (only when it is the very first rule)
+     if ( isset( $rules[0] ) ) {
+         $first = $rules[0];
+         if ( $first instanceof AtRule && strcasecmp( $first->getName(), 'charset' ) === 0 ) {
+             $rules->remove( 0 );
+             $rules->rewind();
+         }
+     }
+
+     return new Stylesheet( $rules );
+ }
+
+ /**
+ * Parse a list of rules
+ *
+ * Unlike self::parseStylesheet(), the rule list is consumed with
+ * $topLevel = false, so CDO and CDC tokens start a qualified rule rather
+ * than being ignored, and a leading @charset rule is not dropped.
+ * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-list-of-rules
+ * @return RuleList
+ */
+ public function parseRuleList() {
+ $this->consumeToken(); // Move to the first token
+ return $this->consumeRuleList( false );
+ }
+
+ /**
+  * Parse the input as exactly one rule
+  *
+  * Null is returned when the input is empty, when the qualified rule could
+  * not be consumed, or when anything other than whitespace follows the rule;
+  * the first and last of these record a parse error here.
+  * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-rule
+  * @return Rule|null
+  */
+ public function parseRule() {
+     // Steps 1 and 2: load the first non-whitespace token
+     $this->consumeTokenAndWhitespace();
+
+     // Step 3: empty input is an error, otherwise consume one rule
+     $startType = $this->currentToken->type();
+     if ( $startType === Token::T_EOF ) {
+         $this->parseError( 'unexpected-eof', $this->currentToken ); // "return a syntax error"?
+         return null;
+     }
+     $isAtRule = $startType === Token::T_AT_KEYWORD;
+     $rule = $isAtRule ? $this->consumeAtRule() : $this->consumeQualifiedRule();
+     if ( !$isAtRule && !$rule ) {
+         // The qualified rule ran into EOF; nothing to return
+         return null;
+     }
+
+     // Step 4: skip whatever whitespace trails the rule
+     $this->consumeTokenAndWhitespace();
+
+     // Step 5: the rule must be the only thing in the input
+     if ( $this->currentToken->type() !== Token::T_EOF ) {
+         $this->parseError( 'expected-eof', $this->currentToken ); // "return a syntax error"?
+         return null;
+     }
+     return $rule;
+ }
+
+ /**
+  * Parse the input as a single declaration
+  *
+  * Null is returned, with an 'expected-ident' parse error, when the first
+  * non-whitespace token is not an identifier.
+  * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-declaration
+  * @return Declaration|null
+  */
+ public function parseDeclaration() {
+     // Steps 1 and 2: load the first non-whitespace token
+     $this->consumeTokenAndWhitespace();
+
+     // Step 4 (happy path): a declaration has to begin with an identifier.
+     // Declarations always run to EOF, so there is nothing to check afterwards.
+     if ( $this->currentToken->type() === Token::T_IDENT ) {
+         return $this->consumeDeclaration();
+     }
+
+     // Step 3: anything else is a syntax error
+     $this->parseError( 'expected-ident', $this->currentToken ); // "return a syntax error"?
+     return null;
+ }
+
+ /**
+ * Parse a list of declarations
+ *
+ * At-rules are not allowed here: each one encountered is consumed, reported
+ * as an 'unexpected-token-in-declaration-list' parse error, and left out of
+ * the returned list.
+ * @note This is not the entry point the standard calls "parse a list of declarations",
+ * see self::parseDeclarationOrAtRuleList()
+ * @return DeclarationList
+ */
+ public function parseDeclarationList() {
+ $this->consumeToken(); // Move to the first token
+ return $this->consumeDeclarationOrAtRuleList( false );
+ }
+
+ /**
+ * Parse a list of declarations and at-rules
+ *
+ * Both declarations and at-rules are kept in the returned list; use
+ * self::parseDeclarationList() when at-rules should be rejected.
+ * @note This is the entry point the standard calls "parse a list of declarations"
+ * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-list-of-declarations
+ * @return DeclarationOrAtRuleList
+ */
+ public function parseDeclarationOrAtRuleList() {
+ $this->consumeToken(); // Move to the first token
+ return $this->consumeDeclarationOrAtRuleList();
+ }
+
+ /**
+  * Parse the input as exactly one (non-whitespace) component value
+  *
+  * Null is returned, with a parse error recorded, when the input is empty
+  * or when anything other than whitespace follows the component value.
+  * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-component-value
+  * @return ComponentValue|null
+  */
+ public function parseComponentValue() {
+     // Steps 1 and 2: load the first non-whitespace token
+     $this->consumeTokenAndWhitespace();
+
+     // Step 3: there has to be something to parse
+     if ( $this->currentToken->type() === Token::T_EOF ) {
+         $this->parseError( 'unexpected-eof', $this->currentToken ); // "return a syntax error"?
+         return null;
+     }
+
+     // Step 4. The spec says to return a syntax error if nothing is returned,
+     // but that can never happen and the Editor's Draft removed that language.
+     $componentValue = $this->consumeComponentValue();
+
+     // Step 5: skip whatever whitespace trails the value
+     $this->consumeTokenAndWhitespace();
+
+     // Step 6: the value must be the only thing in the input
+     $trailing = $this->currentToken;
+     if ( $trailing->type() !== Token::T_EOF ) {
+         $this->parseError( 'expected-eof', $trailing ); // "return a syntax error"?
+         return null;
+     }
+     return $componentValue;
+ }
+
+ /**
+  * Parse the whole input as a list of component values
+  *
+  * Whitespace tokens are kept in the list; the terminating EOF is not.
+  * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-list-of-component-values
+  * @return ComponentValueList
+  */
+ public function parseComponentValueList() {
+     $values = new ComponentValueList();
+     for ( ; ; ) {
+         $this->consumeToken(); // Move to the first/next token
+         $cv = $this->consumeComponentValue();
+
+         // An EOF token coming back means the input is exhausted
+         $reachedEof = $cv instanceof Token && $cv->type() === Token::T_EOF;
+         if ( $reachedEof ) {
+             return $values;
+         }
+         $values->add( $cv );
+     }
+ }
+
+ /**
+  * Consume rules until EOF, starting from the current token
+  * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-list-of-rules
+  * @param boolean $topLevel Determines the behavior when CDO and CDC tokens are encountered:
+  *  when true they are ignored, otherwise they begin a qualified rule.
+  * @return RuleList
+  */
+ protected function consumeRuleList( $topLevel ) {
+     $rules = new RuleList();
+
+     // Each pass handles one item; the loop step then moves past its last token
+     for ( ; ; $this->consumeToken() ) {
+         $rule = null;
+         switch ( $this->currentToken->type() ) {
+             case Token::T_WHITESPACE:
+                 // Whitespace between rules is dropped
+                 break;
+
+             case Token::T_EOF:
+                 return $rules;
+
+             case Token::T_CDO:
+             case Token::T_CDC:
+                 if ( !$topLevel ) {
+                     $rule = $this->consumeQualifiedRule();
+                 }
+                 break;
+
+             case Token::T_AT_KEYWORD:
+                 $rule = $this->consumeAtRule();
+                 break;
+
+             default:
+                 $rule = $this->consumeQualifiedRule();
+                 break;
+         }
+
+         // consumeQualifiedRule() may have returned null; only keep real rules
+         if ( $rule ) {
+             $rules->add( $rule );
+         }
+     }
+ }
+
+ /**
+ * Consume a list of declarations and at-rules
+ *
+ * Runs from the current token to EOF. Whitespace and semicolons between
+ * items are skipped. An item starting with an identifier is collected up to
+ * the next semicolon (or EOF) and handed to a sub-parser that consumes it as
+ * a declaration; parse errors from the sub-parser are merged into this
+ * parser's errors. Anything else records an
+ * 'unexpected-token-in-declaration-list' parse error and is skipped up to
+ * the next semicolon (or EOF).
+ * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-list-of-declarations
+ * @param bool $allowAtRules Whether to allow at-rules. This flag is not in
+ * the spec, and is used to implement the non-spec self::parseDeclarationList().
+ * When false, at-rules are still consumed but are reported as a parse error
+ * and not added to the list.
+ * @return DeclarationOrAtRuleList|DeclarationList DeclarationOrAtRuleList when
+ * $allowAtRules is true, DeclarationList otherwise.
+ */
+ protected function consumeDeclarationOrAtRuleList( $allowAtRules = true ) {
+ $list = $allowAtRules ? new DeclarationOrAtRuleList() : new DeclarationList();
+ while ( true ) {
+ $declaration = false; // Falsy default: nothing is added unless a case below produces an object
+ switch ( $this->currentToken->type() ) {
+ case Token::T_WHITESPACE:
+ break;
+
+ case Token::T_SEMICOLON:
+ $declaration = null;
+ break;
+
+ case Token::T_EOF:
+ break 2; // Leave the loop, not just the switch
+
+ case Token::T_AT_KEYWORD:
+ if ( $allowAtRules ) {
+ $declaration = $this->consumeAtRule();
+ } else {
+ $this->parseError( 'unexpected-token-in-declaration-list', $this->currentToken );
+ $this->consumeAtRule(); // Consumed only to skip past it; the result is discarded
+ $declaration = null;
+ }
+ break;
+
+ case Token::T_IDENT:
+ // The draft changes this to ComponentValue instead of Token, which makes more sense.
+ $cvs = [];
+ do {
+ $cvs[] = $this->consumeComponentValue();
+ $this->consumeToken();
+ } while (
+ $this->currentToken->type() !== Token::T_SEMICOLON &&
+ $this->currentToken->type() !== Token::T_EOF
+ );
+ $tokens = ( new ComponentValueList( $cvs ) )->toTokenArray();
+ $parser = static::newFromTokens( $tokens, $this->currentToken ); // Sub-parser over just the tokens collected above
+ $parser->consumeToken(); // Load that first token
+ $declaration = $parser->consumeDeclaration();
+ // Propagate any errors
+ $this->parseErrors = array_merge( $this->parseErrors, $parser->parseErrors );
+ break;
+
+ default:
+ $this->parseError( 'unexpected-token-in-declaration-list', $this->currentToken );
+ do {
+ $this->consumeComponentValue(); // Skipped: the value is discarded
+ $this->consumeToken();
+ } while (
+ $this->currentToken->type() !== Token::T_SEMICOLON &&
+ $this->currentToken->type() !== Token::T_EOF
+ );
+ $declaration = null;
+ break;
+ }
+
+ if ( $declaration ) {
+ $list->add( $declaration );
+ }
+ $this->consumeToken(); // Advance to the token after the item just handled
+ }
+
+ return $list;
+ }
+
+ /**
+ * Consume a declaration
+ *
+ * The Declaration is built from the current token, which must be followed
+ * (after optional whitespace) by a colon; otherwise an 'expected-colon'
+ * parse error is recorded and null is returned. Everything after the colon
+ * up to EOF becomes the declaration's value. If the last two non-whitespace
+ * component values are a "!" delimiter followed by the identifier
+ * "important" (case-insensitive), they and anything after them are removed
+ * from the value and the declaration is flagged as important.
+ * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-declaration
+ * @return Declaration|null
+ */
+ protected function consumeDeclaration() {
+ $declaration = new Declaration( $this->currentToken ); // Callers in this class only get here when the current token is a T_IDENT
+
+ // 2.
+ $this->consumeTokenAndWhitespace();
+
+ // 3.
+ if ( $this->currentToken->type() !== Token::T_COLON ) {
+ $this->parseError( 'expected-colon', $this->currentToken );
+ return null;
+ }
+ $this->consumeToken();
+
+ // 4.
+ $value = $declaration->getValue();
+ $l1 = $l2 = -1; // Indexes in $value of the last two non-whitespace component values; -1 means none seen yet
+ while ( $this->currentToken->type() !== Token::T_EOF ) {
+ // The draft changes this to ComponentValue instead of Token, which makes more sense.
+ $value->add( $this->consumeComponentValue() );
+ if ( $this->currentToken->type() !== Token::T_WHITESPACE ) {
+ $l1 = $l2;
+ $l2 = $value->count() - 1;
+ }
+ $this->consumeToken();
+ }
+
+ // 5.
+ $v1 = $l1 >= 0 ? $value[$l1] : null;
+ $v2 = $l2 >= 0 ? $value[$l2] : null;
+ if ( $v1 instanceof Token && $v1->type() === Token::T_DELIM && $v1->value() === '!' &&
+ $v2 instanceof Token && $v2->type() === Token::T_IDENT &&
+ !strcasecmp( $v2->value(), 'important' )
+ ) {
+ // Technically it doesn't say to remove any whitespace within/after
+ // the "!important" too, but it makes sense to do so.
+ while ( isset( $value[$l1] ) ) { // NOTE(review): presumably remove() shifts later entries down, so this drops the "!" and everything after it - confirm
+ $value->remove( $l1 );
+ }
+ $declaration->setImportant( true );
+ }
+
+ // 6.
+ return $declaration;
+ }
+
+ /**
+  * Consume an at-rule, starting at its at-keyword token
+  *
+  * The rule ends at a semicolon, at the end of its {}-block, or at EOF.
+  * Everything before that point is added to the rule's prelude.
+  * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-an-at-rule
+  * @return AtRule
+  */
+ protected function consumeAtRule() {
+     $atRule = new AtRule( $this->currentToken );
+
+     // Step past the at-keyword, then past each prelude value in turn
+     for ( $this->consumeToken(); ; $this->consumeToken() ) {
+         switch ( $this->currentToken->type() ) {
+             case Token::T_SEMICOLON:
+                 return $atRule;
+
+             case Token::T_EOF:
+                 // Parse error from the editor's draft as of 2017-01-11
+                 if ( $this->currentToken->typeFlag() !== 'recursion-depth-exceeded' ) {
+                     $this->parseError( 'unexpected-eof-in-rule', $this->currentToken );
+                 }
+                 return $atRule;
+
+             case Token::T_LEFT_BRACE:
+                 // NOTE(review): consumeSimpleBlock() as declared in this class
+                 // takes no parameters, so this argument is ignored.
+                 $atRule->setBlock( $this->consumeSimpleBlock( true ) );
+                 return $atRule;
+         }
+
+         // Any other token begins a component value belonging to the prelude
+         $atRule->getPrelude()->add( $this->consumeComponentValue() );
+     }
+     // @codeCoverageIgnoreStart
+ }
+ // @codeCoverageIgnoreEnd
+
+ /**
+  * Consume a qualified rule, starting at the current token
+  *
+  * Everything up to the opening brace is added to the rule's prelude. If
+  * EOF is reached before a {}-block is found, the rule is discarded and
+  * null is returned.
+  * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-qualified-rule
+  * @return QualifiedRule|null
+  */
+ protected function consumeQualifiedRule() {
+     $qualifiedRule = new QualifiedRule( $this->currentToken );
+
+     // The current token is already the first token of the prelude
+     for ( ; ; $this->consumeToken() ) {
+         switch ( $this->currentToken->type() ) {
+             case Token::T_EOF:
+                 if ( $this->currentToken->typeFlag() !== 'recursion-depth-exceeded' ) {
+                     $this->parseError( 'unexpected-eof-in-rule', $this->currentToken );
+                 }
+                 return null;
+
+             case Token::T_LEFT_BRACE:
+                 // NOTE(review): consumeSimpleBlock() as declared in this class
+                 // takes no parameters, so this argument is ignored.
+                 $qualifiedRule->setBlock( $this->consumeSimpleBlock( true ) );
+                 return $qualifiedRule;
+         }
+
+         // Any other token begins a component value belonging to the prelude
+         $qualifiedRule->getPrelude()->add( $this->consumeComponentValue() );
+     }
+     // @codeCoverageIgnoreStart
+ }
+ // @codeCoverageIgnoreEnd
+
+ /**
+  * Consume a component value, starting at the current token
+  *
+  * A component value is a simple block, a function, or a single token.
+  * When nesting goes deeper than self::CV_DEPTH_LIMIT, a
+  * 'recursion-depth-exceeded' parse error is recorded, the rest of the input
+  * is discarded, and a specially-flagged EOF token is returned.
+  * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-component-value
+  * @return ComponentValue
+  */
+ protected function consumeComponentValue() {
+     $this->cvDepth++;
+     if ( $this->cvDepth > static::CV_DEPTH_LIMIT ) {
+         $this->parseError( 'recursion-depth-exceeded', $this->currentToken );
+
+         // There's no way to safely recover from this without more recursion.
+         // So just eat the rest of the input, then substitute a
+         // specially-flagged EOF so the callers further up can avoid
+         // reporting 100 "unexpected EOF" errors.
+         $errorPosition = $this->currentToken->getPosition();
+         while ( $this->currentToken->type() !== Token::T_EOF ) {
+             $this->consumeToken();
+         }
+         $eofData = [
+             'position' => $errorPosition,
+             'typeFlag' => 'recursion-depth-exceeded'
+         ];
+         $this->currentToken = new Token( Token::T_EOF, $eofData );
+     }
+
+     switch ( $this->currentToken->type() ) {
+         case Token::T_LEFT_BRACE:
+         case Token::T_LEFT_BRACKET:
+         case Token::T_LEFT_PAREN:
+             $componentValue = $this->consumeSimpleBlock();
+             break;
+
+         case Token::T_FUNCTION:
+             $componentValue = $this->consumeFunction();
+             break;
+
+         default:
+             // Every other token is a component value by itself
+             $componentValue = $this->currentToken;
+             break;
+     }
+
+     // Undo the increment made on entry
+     $this->cvDepth--;
+     return $componentValue;
+ }
+
+ /**
+  * Consume a simple block, starting at its opening token
+  *
+  * The block runs up to the end token type reported by the SimpleBlock, or
+  * to EOF. Every component value in between is added to the block's value.
+  * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-simple-block
+  * @return SimpleBlock
+  */
+ protected function consumeSimpleBlock() {
+     $block = new SimpleBlock( $this->currentToken );
+     $closerType = $block->getEndTokenType();
+
+     // Step past the opening token, then past each contained value in turn
+     for ( $this->consumeToken(); ; $this->consumeToken() ) {
+         switch ( $this->currentToken->type() ) {
+             case Token::T_EOF:
+                 // Parse error from the editor's draft as of 2017-01-12
+                 if ( $this->currentToken->typeFlag() !== 'recursion-depth-exceeded' ) {
+                     $this->parseError( 'unexpected-eof-in-block', $this->currentToken );
+                 }
+                 return $block;
+
+             case $closerType:
+                 return $block;
+         }
+
+         // Any other token begins a component value inside the block
+         $block->getValue()->add( $this->consumeComponentValue() );
+     }
+     // @codeCoverageIgnoreStart
+ }
+ // @codeCoverageIgnoreEnd
+
+ /**
+  * Consume a function, starting at its function token
+  *
+  * The function runs up to the closing parenthesis, or to EOF. Every
+  * component value in between is added to the function's value.
+  * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-function
+  * @return CSSFunction
+  */
+ protected function consumeFunction() {
+     $cssFunction = new CSSFunction( $this->currentToken );
+
+     // Step past the function token, then past each argument value in turn
+     for ( $this->consumeToken(); ; $this->consumeToken() ) {
+         switch ( $this->currentToken->type() ) {
+             case Token::T_EOF:
+                 // Parse error from the editor's draft as of 2017-01-12
+                 if ( $this->currentToken->typeFlag() !== 'recursion-depth-exceeded' ) {
+                     $this->parseError( 'unexpected-eof-in-function', $this->currentToken );
+                 }
+                 return $cssFunction;
+
+             case Token::T_RIGHT_PAREN:
+                 return $cssFunction;
+         }
+
+         // Any other token begins a component value inside the function
+         $cssFunction->getValue()->add( $this->consumeComponentValue() );
+     }
+     // @codeCoverageIgnoreStart
+ }
+ // @codeCoverageIgnoreEnd
+}