tag.
 * - Parser::parseDeclarationList() to parse an inline style attribute
 */
class Parser {
	/** Maximum depth of nested ComponentValues */
	const CV_DEPTH_LIMIT = 100; // Arbitrary number that seems like it should be enough

	/** @var Tokenizer */
	protected $tokenizer;

	/** @var Token|null The most recently consumed token */
	protected $currentToken = null;

	/** @var array Parse errors. Each error is [ string $tag, int $line, int $pos, ... ] */
	protected $parseErrors = [];

	/** @var int Recursion depth, incremented in self::consumeComponentValue() */
	protected $cvDepth = 0;

	/**
	 * @param Tokenizer $tokenizer CSS Tokenizer
	 */
	public function __construct( Tokenizer $tokenizer ) {
		$this->tokenizer = $tokenizer;
	}

	/**
	 * Create a Parser for a CSS string
	 * @param string $source CSS to parse.
	 * @param array $options Configuration options, see DataSourceTokenizer::__construct(). Also,
	 *  - convert: (array) If specified, detect the encoding as defined in the
	 *    CSS spec. The value is passed as the $encodings argument to
	 *    Encoder::convert().
	 * @return static
	 */
	public static function newFromString( $source, array $options = [] ) {
		if ( isset( $options['convert'] ) ) {
			$source = Encoder::convert( $source, $options['convert'] );
		}
		return static::newFromDataSource( new StringDataSource( $source ), $options );
	}

	/**
	 * Create a Parser for a CSS DataSource
	 * @param DataSource $source CSS to parse.
	 * @param array $options Configuration options, see DataSourceTokenizer::__construct().
	 * @return static
	 */
	public static function newFromDataSource( DataSource $source, array $options = [] ) {
		$tokenizer = new DataSourceTokenizer( $source, $options );
		return new static( $tokenizer );
	}

	/**
	 * Create a Parser for a list of Tokens
	 * @param Token[] $tokens Token-stream to parse
	 * @param Token|null $eof EOF-token
	 * @return static
	 */
	public static function newFromTokens( array $tokens, Token $eof = null ) {
		$tokenizer = new TokenListTokenizer( $tokens, $eof );
		return new static( $tokenizer );
	}

	/**
	 * Consume a token
	 *
	 * Once the current token is an EOF token this becomes a no-op, so the
	 * parser never reads past the end of the input. Any parse errors the
	 * tokenizer recorded are moved into this parser's own error list.
	 */
	protected function consumeToken() {
		if ( !$this->currentToken || $this->currentToken->type() !== Token::T_EOF ) {
			$this->currentToken = $this->tokenizer->consumeToken();

			// Copy any parse errors encountered
			foreach ( $this->tokenizer->getParseErrors() as $error ) {
				$this->parseErrors[] = $error;
			}
			$this->tokenizer->clearParseErrors();
		}
	}

	/**
	 * Consume a token, also consuming any following whitespace (and comments)
	 */
	protected function consumeTokenAndWhitespace() {
		do {
			$this->consumeToken();
		} while ( $this->currentToken->type() === Token::T_WHITESPACE );
	}

	/**
	 * Return all parse errors seen so far
	 * @return array Array of [ string $tag, int $line, int $pos, ... ]
	 */
	public function getParseErrors() {
		return $this->parseErrors;
	}

	/**
	 * Clear parse errors
	 */
	public function clearParseErrors() {
		$this->parseErrors = [];
	}

	/**
	 * Record a parse error
	 * @param string $tag Error tag
	 * @param Token $token Report the error as starting at this token.
	 * @param array $data Extra data about the error.
	 */
	protected function parseError( $tag, Token $token, array $data = [] ) {
		list( $line, $pos ) = $token->getPosition();
		$this->parseErrors[] = array_merge( [ $tag, $line, $pos ], $data );
	}

	/**
	 * Parse a stylesheet
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-stylesheet
	 * @note Per the Editor's Draft, if the first rule is an at-rule named
	 *  "charset" it will be silently dropped. If you're not using the provided
	 *  Sanitizer classes to further sanitize the CSS, you'll want to manually
	 *  filter out any other such rules before stringifying the stylesheet
	 *  and/or prepend `@charset "utf-8";` after stringifying it.
	 * @return Stylesheet
	 */
	public function parseStylesheet() {
		$this->consumeToken(); // Move to the first token
		$list = $this->consumeRuleList( true );

		// Drop @charset per the Editor's Draft
		if ( isset( $list[0] ) && $list[0] instanceof AtRule &&
			!strcasecmp( $list[0]->getName(), 'charset' )
		) {
			$list->remove( 0 );
			$list->rewind();
		}

		return new Stylesheet( $list );
	}

	/**
	 * Parse a list of rules
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-list-of-rules
	 * @return RuleList
	 */
	public function parseRuleList() {
		$this->consumeToken(); // Move to the first token
		return $this->consumeRuleList( false );
	}

	/**
	 * Parse a rule
	 *
	 * Returns null (after recording a parse error where applicable) when the
	 * input is empty, when no rule could be consumed, or when anything other
	 * than whitespace follows the rule.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-rule
	 * @return Rule|null
	 */
	public function parseRule() {
		// 1. and 2.
		$this->consumeTokenAndWhitespace();

		// 3.
		if ( $this->currentToken->type() === Token::T_EOF ) {
			$this->parseError( 'unexpected-eof', $this->currentToken ); // "return a syntax error"?
			return null;
		}

		if ( $this->currentToken->type() === Token::T_AT_KEYWORD ) {
			$rule = $this->consumeAtRule();
		} else {
			$rule = $this->consumeQualifiedRule();
			if ( !$rule ) {
				return null;
			}
		}

		// 4.
		$this->consumeTokenAndWhitespace();

		// 5.
		if ( $this->currentToken->type() === Token::T_EOF ) {
			return $rule;
		} else {
			$this->parseError( 'expected-eof', $this->currentToken ); // "return a syntax error"?
			return null;
		}
	}

	/**
	 * Parse a declaration
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-declaration
	 * @return Declaration|null
	 */
	public function parseDeclaration() {
		// 1. and 2.
		$this->consumeTokenAndWhitespace();

		// 3.
		if ( $this->currentToken->type() !== Token::T_IDENT ) {
			$this->parseError( 'expected-ident', $this->currentToken ); // "return a syntax error"?
			return null;
		}

		// 4.
		$declaration = $this->consumeDeclaration();

		// Declarations always run to EOF, no need to check.

		return $declaration;
	}

	/**
	 * Parse a list of declarations
	 * @note This is not the entry point the standard calls "parse a list of declarations",
	 *  see self::parseDeclarationOrAtRuleList()
	 * @return DeclarationList
	 */
	public function parseDeclarationList() {
		$this->consumeToken(); // Move to the first token
		return $this->consumeDeclarationOrAtRuleList( false );
	}

	/**
	 * Parse a list of declarations and at-rules
	 * @note This is the entry point the standard calls "parse a list of declarations"
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-list-of-declarations
	 * @return DeclarationOrAtRuleList
	 */
	public function parseDeclarationOrAtRuleList() {
		$this->consumeToken(); // Move to the first token
		return $this->consumeDeclarationOrAtRuleList();
	}

	/**
	 * Parse a (non-whitespace) component value
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-component-value
	 * @return ComponentValue|null
	 */
	public function parseComponentValue() {
		// 1. and 2.
		$this->consumeTokenAndWhitespace();

		// 3.
		if ( $this->currentToken->type() === Token::T_EOF ) {
			$this->parseError( 'unexpected-eof', $this->currentToken ); // "return a syntax error"?
			return null;
		}

		// 4.
		$value = $this->consumeComponentValue();

		// The spec says to return a syntax error if nothing is returned, but
		// that can never happen and the Editor's Draft removed that language.

		// 5.
		$this->consumeTokenAndWhitespace();

		// 6.
		if ( $this->currentToken->type() === Token::T_EOF ) {
			return $value;
		} else {
			$this->parseError( 'expected-eof', $this->currentToken ); // "return a syntax error"?
			return null;
		}
	}

	/**
	 * Parse a list of component values
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-list-of-component-values
	 * @return ComponentValueList
	 */
	public function parseComponentValueList() {
		$list = new ComponentValueList();
		while ( true ) {
			$this->consumeToken(); // Move to the first/next token
			$value = $this->consumeComponentValue();
			// The EOF token terminates the list and is not itself added to it
			if ( $value instanceof Token && $value->type() === Token::T_EOF ) {
				break;
			}
			$list->add( $value );
		}

		return $list;
	}

	/**
	 * Consume a list of rules
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-list-of-rules
	 * @param bool $topLevel Determines the behavior when CDO and CDC tokens are encountered
	 * @return RuleList
	 */
	protected function consumeRuleList( $topLevel ) {
		$list = new RuleList();
		while ( true ) {
			$rule = false;
			switch ( $this->currentToken->type() ) {
				case Token::T_WHITESPACE:
					break;

				case Token::T_EOF:
					break 2;

				case Token::T_CDO:
				case Token::T_CDC:
					if ( $topLevel ) {
						// Do nothing
					} else {
						$rule = $this->consumeQualifiedRule();
					}
					break;

				case Token::T_AT_KEYWORD:
					$rule = $this->consumeAtRule();
					break;

				default:
					$rule = $this->consumeQualifiedRule();
					break;
			}

			// consumeQualifiedRule() may return null; only real rules are kept
			if ( $rule ) {
				$list->add( $rule );
			}
			$this->consumeToken();
		}

		return $list;
	}

	/**
	 * Consume a list of declarations and at-rules
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-list-of-declarations
	 * @param bool $allowAtRules Whether to allow at-rules. This flag is not in
	 *  the spec, and is used to implement the non-spec self::parseDeclarationList().
	 * @return DeclarationOrAtRuleList|DeclarationList
	 */
	protected function consumeDeclarationOrAtRuleList( $allowAtRules = true ) {
		$list = $allowAtRules ? new DeclarationOrAtRuleList() : new DeclarationList();
		while ( true ) {
			$declaration = false;
			switch ( $this->currentToken->type() ) {
				case Token::T_WHITESPACE:
					break;

				case Token::T_SEMICOLON:
					$declaration = null;
					break;

				case Token::T_EOF:
					break 2;

				case Token::T_AT_KEYWORD:
					if ( $allowAtRules ) {
						$declaration = $this->consumeAtRule();
					} else {
						// Report the at-rule, then consume and discard it
						$this->parseError( 'unexpected-token-in-declaration-list', $this->currentToken );
						$this->consumeAtRule();
						$declaration = null;
					}
					break;

				case Token::T_IDENT:
					// The draft changes this to ComponentValue instead of Token, which makes more sense.
					$cvs = [];
					do {
						$cvs[] = $this->consumeComponentValue();
						$this->consumeToken();
					} while (
						$this->currentToken->type() !== Token::T_SEMICOLON &&
						$this->currentToken->type() !== Token::T_EOF
					);
					// Re-parse the collected values as a single declaration using a
					// temporary parser over just those tokens
					$tokens = ( new ComponentValueList( $cvs ) )->toTokenArray();
					$parser = static::newFromTokens( $tokens, $this->currentToken );
					$parser->consumeToken(); // Load that first token
					$declaration = $parser->consumeDeclaration();
					// Propagate any errors
					$this->parseErrors = array_merge( $this->parseErrors, $parser->parseErrors );
					break;

				default:
					// Report the unexpected token, then skip up to the next semicolon or EOF
					$this->parseError( 'unexpected-token-in-declaration-list', $this->currentToken );
					do {
						$this->consumeComponentValue();
						$this->consumeToken();
					} while (
						$this->currentToken->type() !== Token::T_SEMICOLON &&
						$this->currentToken->type() !== Token::T_EOF
					);
					$declaration = null;
					break;
			}

			if ( $declaration ) {
				$list->add( $declaration );
			}
			$this->consumeToken();
		}

		return $list;
	}

	/**
	 * Consume a declaration
	 *
	 * Expects the current token to be the declaration's name. Returns null,
	 * after recording an 'expected-colon' parse error, when the name is not
	 * followed by a colon.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-declaration
	 * @return Declaration|null
	 */
	protected function consumeDeclaration() {
		$declaration = new Declaration( $this->currentToken );

		// 2.
		$this->consumeTokenAndWhitespace();

		// 3.
		if ( $this->currentToken->type() !== Token::T_COLON ) {
			$this->parseError( 'expected-colon', $this->currentToken );
			return null;
		}
		$this->consumeToken();

		// 4.
		$value = $declaration->getValue();
		// $l1 and $l2 track the indexes within $value of the second-to-last and
		// last non-whitespace component values (-1 while not yet seen).
		$l1 = $l2 = -1;
		while ( $this->currentToken->type() !== Token::T_EOF ) {
			// The draft changes this to ComponentValue instead of Token, which makes more sense.
			$value->add( $this->consumeComponentValue() );
			if ( $this->currentToken->type() !== Token::T_WHITESPACE ) {
				$l1 = $l2;
				$l2 = $value->count() - 1;
			}
			$this->consumeToken();
		}

		// 5.
		$v1 = $l1 >= 0 ? $value[$l1] : null;
		$v2 = $l2 >= 0 ? $value[$l2] : null;
		if ( $v1 instanceof Token &&
			$v1->type() === Token::T_DELIM &&
			$v1->value() === '!' &&
			$v2 instanceof Token &&
			$v2->type() === Token::T_IDENT &&
			!strcasecmp( $v2->value(), 'important' )
		) {
			// Technically it doesn't say to remove any whitespace within/after
			// the "!important" too, but it makes sense to do so.
			while ( isset( $value[$l1] ) ) {
				$value->remove( $l1 );
			}
			$declaration->setImportant( true );
		}

		// 6.
		return $declaration;
	}

	/**
	 * Consume an at-rule
	 *
	 * Expects the current token to be the at-keyword that starts the rule.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-an-at-rule
	 * @return AtRule
	 */
	protected function consumeAtRule() {
		$rule = new AtRule( $this->currentToken );
		$this->consumeToken();
		while ( true ) {
			switch ( $this->currentToken->type() ) {
				case Token::T_SEMICOLON:
					return $rule;

				case Token::T_EOF:
					// Parse error from the editor's draft as of 2017-01-11
					if ( $this->currentToken->typeFlag() !== 'recursion-depth-exceeded' ) {
						$this->parseError( 'unexpected-eof-in-rule', $this->currentToken );
					}
					return $rule;

				case Token::T_LEFT_BRACE:
					$rule->setBlock( $this->consumeSimpleBlock() );
					return $rule;

				default:
					$rule->getPrelude()->add( $this->consumeComponentValue() );
					break;
			}
			$this->consumeToken();
		}
		// @codeCoverageIgnoreStart
	}
	// @codeCoverageIgnoreEnd

	/**
	 * Consume a qualified rule
	 *
	 * Returns null if EOF is reached before the rule's block starts.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-qualified-rule
	 * @return QualifiedRule|null
	 */
	protected function consumeQualifiedRule() {
		$rule = new QualifiedRule( $this->currentToken );
		while ( true ) {
			switch ( $this->currentToken->type() ) {
				case Token::T_EOF:
					if ( $this->currentToken->typeFlag() !== 'recursion-depth-exceeded' ) {
						$this->parseError( 'unexpected-eof-in-rule', $this->currentToken );
					}
					return null;

				case Token::T_LEFT_BRACE:
					$rule->setBlock( $this->consumeSimpleBlock() );
					return $rule;

				default:
					$rule->getPrelude()->add( $this->consumeComponentValue() );
					break;
			}
			$this->consumeToken();
		}
		// @codeCoverageIgnoreStart
	}
	// @codeCoverageIgnoreEnd

	/**
	 * Consume a component value
	 *
	 * Nesting is limited to static::CV_DEPTH_LIMIT levels. Beyond that a
	 * 'recursion-depth-exceeded' parse error is recorded, the remaining input
	 * is discarded, and a specially-flagged EOF token becomes the current token.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-component-value
	 * @return ComponentValue
	 */
	protected function consumeComponentValue() {
		if ( ++$this->cvDepth > static::CV_DEPTH_LIMIT ) {
			$this->parseError( 'recursion-depth-exceeded', $this->currentToken );
			// There's no way to safely recover from this without more recursion.
			// So just eat the rest of the input, then return a
			// specially-flagged EOF so we can avoid 100 "unexpected EOF"
			// errors.
			$position = $this->currentToken->getPosition();
			while ( $this->currentToken->type() !== Token::T_EOF ) {
				$this->consumeToken();
			}
			$this->currentToken = new Token(
				Token::T_EOF,
				[ 'position' => $position, 'typeFlag' => 'recursion-depth-exceeded' ]
			);
		}

		switch ( $this->currentToken->type() ) {
			case Token::T_LEFT_BRACE:
			case Token::T_LEFT_BRACKET:
			case Token::T_LEFT_PAREN:
				$ret = $this->consumeSimpleBlock();
				break;

			case Token::T_FUNCTION:
				$ret = $this->consumeFunction();
				break;

			default:
				$ret = $this->currentToken;
				break;
		}

		$this->cvDepth--;
		return $ret;
	}

	/**
	 * Consume a simple block
	 *
	 * Expects the current token to be the block's opening token. On return the
	 * current token is the matching end token, or EOF if the block was unclosed.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-simple-block
	 * @return SimpleBlock
	 */
	protected function consumeSimpleBlock() {
		$block = new SimpleBlock( $this->currentToken );
		$endTokenType = $block->getEndTokenType();
		$this->consumeToken();
		while ( true ) {
			switch ( $this->currentToken->type() ) {
				case Token::T_EOF:
					// Parse error from the editor's draft as of 2017-01-12
					if ( $this->currentToken->typeFlag() !== 'recursion-depth-exceeded' ) {
						$this->parseError( 'unexpected-eof-in-block', $this->currentToken );
					}
					return $block;

				case $endTokenType:
					return $block;

				default:
					$block->getValue()->add( $this->consumeComponentValue() );
					break;
			}
			$this->consumeToken();
		}
		// @codeCoverageIgnoreStart
	}
	// @codeCoverageIgnoreEnd

	/**
	 * Consume a function
	 *
	 * Expects the current token to be the function token. On return the current
	 * token is the closing parenthesis, or EOF if the function was unclosed.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-function
	 * @return CSSFunction
	 */
	protected function consumeFunction() {
		$function = new CSSFunction( $this->currentToken );
		$this->consumeToken();
		while ( true ) {
			switch ( $this->currentToken->type() ) {
				case Token::T_EOF:
					// Parse error from the editor's draft as of 2017-01-12
					if ( $this->currentToken->typeFlag() !== 'recursion-depth-exceeded' ) {
						$this->parseError( 'unexpected-eof-in-function', $this->currentToken );
					}
					return $function;

				case Token::T_RIGHT_PAREN:
					return $function;

				default:
					$function->getValue()->add( $this->consumeComponentValue() );
					break;
			}
			$this->consumeToken();
		}
		// @codeCoverageIgnoreStart
	}
	// @codeCoverageIgnoreEnd
}