/**
 * XML syntax and type checker.
 *
 * Origin: includes/libs/mime/XmlTypeCheck.php (MediaWiki).
 *
 * Streams a document through XMLReader, tracks whether it is well-formed,
 * records the (namespace-expanded) root element name, and lets callers hook
 * in filter callbacks for elements, processing instructions and the DTD.
 *
 * NOTE(review): this file was recovered from a tag-stripped diff dump. The
 * property declarations, the constructor and DTDHandler signatures, and the
 * `<!...>` portions of the regular expressions in checkDTDIsSafe() and
 * parseDTD() were not fully visible and have been reconstructed -- confirm
 * them against the upstream file before relying on them.
 */
class XmlTypeCheck {
	/**
	 * Whether the document is well-formed XML: null until validation has
	 * decided, then true or false. Schema validity is not checked.
	 * @var bool|null
	 */
	public $wellFormed = null;

	/**
	 * True once any filter callback has returned a truthy value.
	 * @var bool
	 */
	public $filterMatch = false;

	/**
	 * The truthy value returned by the most recent filter callback that hit,
	 * or false if none did.
	 * @var mixed
	 */
	public $filterMatchType = false;

	/**
	 * Name of the document's root element, prefixed with its namespace URI
	 * (if any). Empty string until the first element has been seen.
	 * @var string
	 */
	public $rootElement = '';

	/**
	 * Stack of accumulated (trimmed) text, one entry per currently open element.
	 * @var string[]
	 */
	protected $elementData = [];

	/**
	 * Stack of [ name, attributes ] pairs, one entry per currently open element.
	 * @var array[]
	 */
	protected $elementDataContext = [];

	/**
	 * Number of currently open elements.
	 * @var int
	 */
	protected $stackDepth = 0;

	/**
	 * Element filter callback, or null for none.
	 * NOTE(review): visibility of this property was not visible in the
	 * recovered source -- confirm.
	 * @var callable|null
	 */
	protected $filterCallback = null;

	/**
	 * Additional parsing options; see __construct() for their meaning.
	 * @var array
	 */
	private $parserOptions = [
		'processing_instruction_handler' => '',
		'external_dtd_handler' => '',
		'dtd_handler' => '',
		'require_safe_dtd' => true
	];

	/**
	 * Allow filtering an XML file.
	 *
	 * Filters should return a truthy value (true or a string describing the
	 * problem) to indicate something is wrong with the file.
	 * $this->filterMatch will store if the file failed validation
	 * (true = failed validation). $this->filterMatchType will contain the
	 * value the filter returned. $this->wellFormed will contain whether the
	 * XML file is well-formed.
	 *
	 * @note If multiple filters are hit, only the result of the last one is
	 *  kept in $this->filterMatchType.
	 *
	 * @param string $input a filename or string containing the XML element
	 * @param callable|null $filterCallback (optional)
	 *  Called when each element is closed with the expanded element name,
	 *  the array of attributes and the element's trimmed text contents.
	 *  Should return a truthy value describing the error, or a falsy value.
	 * @param bool $isFile (optional) indicates if the first parameter is a
	 *  filename (default, true) or if it is a string (false)
	 * @param array $options list of additional parsing options:
	 *  processing_instruction_handler: Callback invoked with the target and
	 *    data of every processing instruction
	 *  external_dtd_handler: Callback invoked with the type (PUBLIC/SYSTEM),
	 *    public id and system id of an external DTD subset
	 *  dtd_handler: Callback given the full text of the DOCTYPE declaration
	 *  require_safe_dtd: When true (default), an internal DTD subset that
	 *    does not pass checkDTDIsSafe() marks the document as not well-formed
	 */
	public function __construct( $input, $filterCallback = null, $isFile = true, $options = [] ) {
		$this->filterCallback = $filterCallback;
		$this->parserOptions = array_merge( $this->parserOptions, $options );
		$this->validateFromInput( $input, $isFile );
	}

	/**
	 * Alternative constructor: from filename
	 *
	 * @param string $fname the filename of an XML document
	 * @param callable|null $filterCallback (optional)
	 *  Element filter; receives the expanded element name, the attributes
	 *  and the trimmed text contents. A truthy return value toggles on
	 *  $this->filterMatch and is stored in $this->filterMatchType.
	 * @return XmlTypeCheck
	 */
	public static function newFromFilename( $fname, $filterCallback = null ) {
		return new self( $fname, $filterCallback, true );
	}

	/**
	 * Alternative constructor: from string
	 *
	 * @param string $string a string containing an XML element
	 * @param callable|null $filterCallback (optional)
	 *  Element filter; receives the expanded element name, the attributes
	 *  and the trimmed text contents. A truthy return value toggles on
	 *  $this->filterMatch and is stored in $this->filterMatchType.
	 * @return XmlTypeCheck
	 */
	public static function newFromString( $string, $filterCallback = null ) {
		return new self( $string, $filterCallback, false );
	}

	/**
	 * Get the root element. Simple accessor to $rootElement
	 *
	 * @return string
	 */
	public function getRootElement() {
		return $this->rootElement;
	}

	/**
	 * Open the input with XMLReader and run the validation pass over it.
	 *
	 * @param string $xml a filename or a string of XML, depending on $isFile
	 * @param bool $isFile true if $xml is a filename, false if it is XML text
	 * @throws Exception anything thrown during validation (e.g. by a
	 *  callback) is re-thrown after marking the document as not well-formed
	 */
	private function validateFromInput( $xml, $isFile ) {
		$reader = new XMLReader();
		$flags = LIBXML_NOERROR | LIBXML_NOWARNING;
		if ( $isFile ) {
			$opened = $reader->open( $xml, null, $flags );
		} else {
			$opened = $reader->XML( $xml, null, $flags );
		}
		if ( $opened !== true ) {
			// Couldn't open the XML
			$this->wellFormed = false;
			return;
		}

		// Entities are substituted below, so make sure external ones cannot
		// be fetched while we parse. (This function is deprecated as of
		// PHP 8.0; kept to preserve the original behaviour.)
		$oldDisable = libxml_disable_entity_loader( true );
		try {
			$reader->setParserProperty( XMLReader::SUBST_ENTITIES, true );
			$this->validate( $reader );
		} catch ( Exception $e ) {
			// Calling this malformed, because we didn't parse the whole
			// thing. Maybe just an external entity reference.
			$this->wellFormed = false;
			throw $e;
		} finally {
			// Always release the reader and restore the global libxml
			// setting, whatever kind of throwable interrupted validation.
			$reader->close();
			libxml_disable_entity_loader( $oldDisable );
		}
	}

	/**
	 * Advance the reader by one node, routing any PHP error raised by the
	 * read through XmlErrorHandler().
	 *
	 * @param XMLReader $reader
	 * @return bool the return value of XMLReader::read()
	 */
	private function readNext( XMLReader $reader ) {
		set_error_handler( [ $this, 'XmlErrorHandler' ] );
		try {
			return $reader->read();
		} finally {
			// Never leave our handler installed, even if read() throws.
			restore_error_handler();
		}
	}

	/**
	 * Error handler installed around XMLReader::read(): any error raised
	 * while reading means the document is not well-formed.
	 *
	 * @param int $errno
	 * @param string $errstr
	 */
	public function XmlErrorHandler( $errno, $errstr ) {
		$this->wellFormed = false;
	}

	/**
	 * Walk the whole document, dispatching nodes to the element, processing
	 * instruction and DTD handlers, and decide $this->wellFormed.
	 *
	 * @param XMLReader $reader an opened reader positioned before the first node
	 */
	private function validate( $reader ) {
		// First, move through anything that isn't an element, and
		// handle any processing instructions with the callback
		do {
			if ( !$this->readNext( $reader ) ) {
				// Hit the end of the document before any elements
				$this->wellFormed = false;
				return;
			}
			if ( $reader->nodeType === XMLReader::PI ) {
				$this->processingInstructionHandler( $reader->name, $reader->value );
			}
			if ( $reader->nodeType === XMLReader::DOC_TYPE ) {
				$this->DTDHandler( $reader );
			}
		} while ( $reader->nodeType !== XMLReader::ELEMENT );

		// Process the rest of the document
		do {
			switch ( $reader->nodeType ) {
				case XMLReader::ELEMENT:
					$name = $this->expandNS(
						$reader->name,
						$reader->namespaceURI
					);
					if ( $this->rootElement === '' ) {
						$this->rootElement = $name;
					}
					// Must be read before walking the attributes, which
					// moves the reader off the element node.
					$empty = $reader->isEmptyElement;
					$attrs = $this->getAttributesArray( $reader );
					$this->elementOpen( $name, $attrs );
					if ( $empty ) {
						// <foo/> produces no END_ELEMENT node
						$this->elementClose();
					}
					break;

				case XMLReader::END_ELEMENT:
					$this->elementClose();
					break;

				case XMLReader::WHITESPACE:
				case XMLReader::SIGNIFICANT_WHITESPACE:
				case XMLReader::CDATA:
				case XMLReader::TEXT:
					$this->elementData( $reader->value );
					break;

				case XMLReader::ENTITY_REF:
					// Unexpanded entity (maybe external?),
					// don't send to the filter (xml_parse didn't)
					break;

				case XMLReader::COMMENT:
					// Don't send to the filter (xml_parse didn't)
					break;

				case XMLReader::PI:
					// Processing instructions can happen after the header too
					$this->processingInstructionHandler(
						$reader->name,
						$reader->value
					);
					break;

				case XMLReader::DOC_TYPE:
					// We should never see a doctype after first
					// element.
					$this->wellFormed = false;
					break;

				default:
					// One of DOC, ENTITY, END_ENTITY,
					// NOTATION, or XML_DECLARATION
					// xml_parse didn't send these to the filter, so we won't.
			}
		} while ( $this->readNext( $reader ) );

		if ( $this->stackDepth !== 0 ) {
			// Some element was never closed
			$this->wellFormed = false;
		} elseif ( $this->wellFormed === null ) {
			// Nothing along the way flagged a problem
			$this->wellFormed = true;
		}
	}

	/**
	 * Get all of the attributes for an XMLReader's current node
	 *
	 * @param XMLReader $r
	 * @return array map of namespace-expanded attribute name to value
	 */
	private function getAttributesArray( XMLReader $r ) {
		$attrs = [];
		while ( $r->moveToNextAttribute() ) {
			if ( $r->namespaceURI === 'http://www.w3.org/2000/xmlns/' ) {
				// XMLReader treats xmlns attributes as normal
				// attributes, while xml_parse doesn't
				continue;
			}
			$name = $this->expandNS( $r->name, $r->namespaceURI );
			$attrs[$name] = $r->value;
		}
		return $attrs;
	}

	/**
	 * @param string $name element or attribute name, maybe with a full or short prefix
	 * @param string $namespaceURI the namespaceURI
	 * @return string the local name prefixed with "$namespaceURI:", or $name
	 *  unchanged when there is no namespace
	 */
	private function expandNS( $name, $namespaceURI ) {
		if ( $namespaceURI ) {
			$parts = explode( ':', $name );
			$localname = array_pop( $parts );
			return "$namespaceURI:$localname";
		}
		return $name;
	}

	/**
	 * Push a newly opened element onto the context and data stacks.
	 *
	 * @param string $name expanded element name
	 * @param array $attribs attributes of the element
	 */
	private function elementOpen( $name, $attribs ) {
		$this->elementDataContext[] = [ $name, $attribs ];
		$this->elementData[] = '';
		$this->stackDepth++;
	}

	/**
	 * Pop the innermost open element and run the element filter on it.
	 */
	private function elementClose() {
		list( $name, $attribs ) = array_pop( $this->elementDataContext );
		$data = array_pop( $this->elementData );
		$this->stackDepth--;

		if ( is_callable( $this->filterCallback ) ) {
			$this->recordFilterHit( call_user_func(
				$this->filterCallback,
				$name,
				$attribs,
				$data
			) );
		}
	}

	/**
	 * Collect character data for the innermost open element; the filter
	 * callback is run on it in elementClose().
	 *
	 * @param string $data
	 */
	private function elementData( $data ) {
		if ( $this->stackDepth < 1 ) {
			// No element is open, so there is nothing to attach the text to.
			return;
		}
		$this->elementData[ $this->stackDepth - 1 ] .= trim( $data );
	}

	/**
	 * Run the processing_instruction_handler option, if set.
	 *
	 * @param string $target
	 * @param string $data
	 */
	private function processingInstructionHandler( $target, $data ) {
		$handler = $this->parserOptions['processing_instruction_handler'];
		if ( $handler ) {
			$this->recordFilterHit( call_user_func( $handler, $target, $data ) );
		}
	}

	/**
	 * Handle coming across a DOCTYPE declaration.
	 *
	 * Runs the dtd_handler and external_dtd_handler callbacks (in that
	 * order) and, if require_safe_dtd is set, checks the internal subset.
	 *
	 * @param XMLReader $reader Reader currently positioned on the DOC_TYPE node
	 */
	private function DTDHandler( XMLReader $reader ) {
		$externalCallback = $this->parserOptions['external_dtd_handler'];
		$generalCallback = $this->parserOptions['dtd_handler'];
		$checkIfSafe = $this->parserOptions['require_safe_dtd'];
		if ( !$externalCallback && !$generalCallback && !$checkIfSafe ) {
			// Nobody is interested in the DTD
			return;
		}
		$dtd = $reader->readOuterXML();

		if ( $generalCallback ) {
			$this->recordFilterHit( call_user_func( $generalCallback, $dtd ) );
		}

		$parsedDTD = $this->parseDTD( $dtd );
		if ( $externalCallback && isset( $parsedDTD['type'] ) ) {
			$this->recordFilterHit( call_user_func(
				$externalCallback,
				$parsedDTD['type'],
				isset( $parsedDTD['publicid'] ) ? $parsedDTD['publicid'] : null,
				isset( $parsedDTD['systemid'] ) ? $parsedDTD['systemid'] : null
			) );
		}

		if ( $checkIfSafe
			&& isset( $parsedDTD['internal'] )
			&& !$this->checkDTDIsSafe( $parsedDTD['internal'] )
		) {
			$this->wellFormed = false;
		}
	}

	/**
	 * Record the outcome of a filter callback: a truthy return value is a hit.
	 *
	 * @param mixed $callbackReturn value returned by a filter callback
	 */
	private function recordFilterHit( $callbackReturn ) {
		if ( $callbackReturn ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $callbackReturn;
		}
	}

	/**
	 * Check if the internal subset of the DTD is safe.
	 *
	 * We whitelist an extremely restricted subset of DTD features.
	 *
	 * Safe is defined as consisting only of:
	 *  * Entity definitions with a quoted literal value (so no "system" or
	 *    "parameter" (%) entities) that is either a single reference to
	 *    another entity, or at most 255 characters containing no % and no
	 *    & other than &amp; and the escaped quote character.
	 *  * The ATTLIST declaration for svg's xmlns:xlink attribute, allowed if
	 *    matched exactly for compatibility with graphviz.
	 *  * Comments.
	 *
	 * NOTE(review): the `<!...>` parts of this pattern were reconstructed;
	 * verify against upstream.
	 *
	 * @param string $internalSubset The internal subset of the DTD
	 * @return bool true if safe.
	 */
	private function checkDTDIsSafe( $internalSubset ) {
		$res = preg_match(
			'/^(?:\s*<!ENTITY\s+\S+\s+' .
				'(?:"(?:&[^"%&;]{1,64};|(?:[^"%&]|&amp;|&quot;){0,255})"' .
				'|\'(?:&[^"%&;]{1,64};|(?:[^\'%&]|&amp;|&apos;){0,255})\')\s*>' .
				'|\s*<!--(?:[^-]|-[^-])*-->' .
				'|\s*<!ATTLIST svg xmlns:xlink CDATA #FIXED ' .
				'"http:\/\/www.w3.org\/1999\/xlink">)*\s*$/',
			$internalSubset
		);

		return (bool)$res;
	}

	/**
	 * Parse DTD into parts.
	 *
	 * If there is an error parsing the dtd, sets wellFormed to false.
	 *
	 * NOTE(review): the `<!DOCTYPE` prefix and the named-group labels of this
	 * pattern were reconstructed from the field names handled below; verify
	 * against upstream.
	 *
	 * @param string $dtd full text of the DOCTYPE declaration
	 * @return array Possibly containing keys publicid, systemid, type and internal.
	 */
	private function parseDTD( $dtd ) {
		$m = [];
		$res = preg_match(
			'/^<!DOCTYPE\s*\S+\s*' .
			'(?:(?P<typepublic>PUBLIC)\s*' .
				'(?:"(?P<pubquote>[^"]*)"|\'(?P<pubapos>[^\']*)\')' . // public identifier
				// The quote alternatives must be grouped, otherwise the "|"
				// splits the whole PUBLIC branch in two.
				'\s*(?:"(?P<pubsysquote>[^"]*)"|\'(?P<pubsysapos>[^\']*)\')' . // system identifier
			'|(?P<typesystem>SYSTEM)\s*' .
				'(?:"(?P<sysquote>[^"]*)"|\'(?P<sysapos>[^\']*)\')' .
			')?\s*' .
			'(?:\[\s*(?P<internal>.*)\])?\s*>$/s',
			$dtd,
			$m
		);
		if ( !$res ) {
			$this->wellFormed = false;
			return [];
		}
		$parsed = [];
		foreach ( $m as $field => $value ) {
			// Skip groups that did not participate and the numeric
			// duplicates preg_match adds for every named group.
			if ( $value === '' || is_numeric( $field ) ) {
				continue;
			}
			switch ( $field ) {
				case 'typepublic':
				case 'typesystem':
					$parsed['type'] = $value;
					break;
				case 'pubquote':
				case 'pubapos':
					$parsed['publicid'] = $value;
					break;
				case 'pubsysquote':
				case 'pubsysapos':
				case 'sysquote':
				case 'sysapos':
					$parsed['systemid'] = $value;
					break;
				case 'internal':
					$parsed['internal'] = $value;
					break;
			}
		}
		return $parsed;
	}
}