null, 'external_dtd_handler' => '', 'dtd_handler' => '', 'require_safe_dtd' => true ]; /** * Allow filtering an XML file. * * Filters should return either true or a string to indicate something * is wrong with the file. $this->filterMatch will store if the * file failed validation (true = failed validation). * $this->filterMatchType will contain the validation error. * $this->wellFormed will contain whether the xml file is well-formed. * * @note If multiple filters are hit, only one of them will have the * result stored in $this->filterMatchType. * * @param string $input a filename or string containing the XML element * @param callable|null $filterCallback (optional) * Function to call to do additional custom validity checks from the * SAX element handler event. This gives you access to the element * namespace, name, attributes, and text contents. * Filter should return a truthy value describing the error. * @param bool $isFile (optional) indicates if the first parameter is a * filename (default, true) or if it is a string (false) * @param array $options list of additional parsing options: * processing_instruction_handler: Callback for xml_set_processing_instruction_handler * external_dtd_handler: Callback for the url of external dtd subset * dtd_handler: Callback given the full text of the filterCallback = $filterCallback; $this->parserOptions = array_merge( $this->parserOptions, $options ); $this->validateFromInput( $input, $isFile ); } /** * Alternative constructor: from filename * * @param string $fname the filename of an XML document * @param callable|null $filterCallback (optional) * Function to call to do additional custom validity checks from the * SAX element handler event. This gives you access to the element * namespace, name, and attributes, but not to text contents. 
* Filter should return a truthy value (true or an error string) to flag the file.
	 * @return XmlTypeCheck
	 */
	public static function newFromFilename( $fname, $filterCallback = null ) {
		return new self( $fname, $filterCallback, true );
	}

	/**
	 * Alternative constructor: from string
	 *
	 * @param string $string a string containing an XML element
	 * @param callable|null $filterCallback (optional)
	 *        Function to call to do additional custom validity checks from the
	 *        SAX element handler event. It is invoked once per closed element
	 *        with the element's expanded name, its attributes, and the text
	 *        contents collected for that element.
	 *        Filter should return a truthy value (true or an error string) to
	 *        flag the file; that value ends up in $this->filterMatchType.
	 * @return XmlTypeCheck
	 */
	public static function newFromString( $string, $filterCallback = null ) {
		return new self( $string, $filterCallback, false );
	}

	/**
	 * Get the root element. Simple accessor to $rootElement
	 *
	 * @return string Expanded name of the first element seen by validate()
	 */
	public function getRootElement() {
		return $this->rootElement;
	}

	/**
	 * Open the input with XMLReader and run the validation pass over it.
	 *
	 * $this->wellFormed is set to false when the input cannot be opened, or
	 * when validation is aborted by an exception (which is re-thrown).
	 *
	 * @param string $xml Filename or XML text, depending on $isFile
	 * @param bool $isFile True if $xml is a filename, false if it is XML text
	 * @throws Exception Whatever validate() (or a callback it runs) throws
	 */
	private function validateFromInput( $xml, $isFile ) {
		$reader = new XMLReader();
		$flags = LIBXML_NOERROR | LIBXML_NOWARNING;
		if ( $isFile ) {
			$opened = $reader->open( $xml, null, $flags );
		} else {
			$opened = $reader->XML( $xml, null, $flags );
		}
		if ( $opened !== true ) {
			// Couldn't open the XML
			$this->wellFormed = false;
			return;
		}

		// The entity-loader switch is process-wide state, so the previous
		// value is remembered and always put back in the finally block.
		// NOTE(review): libxml_disable_entity_loader() is deprecated as of
		// PHP 8.0 -- confirm against the PHP versions this file must support.
		$oldDisable = libxml_disable_entity_loader( true );
		try {
			$reader->setParserProperty( XMLReader::SUBST_ENTITIES, true );
			$this->validate( $reader );
		} catch ( Exception $e ) {
			// Calling this malformed, because we didn't parse the whole
			// thing. Maybe just an external entity reference.
			$this->wellFormed = false;
			throw $e;
		} finally {
			// Runs on success, on Exception and on Error alike, so the reader
			// is never left open and the loader flag is never left flipped.
			$reader->close();
			libxml_disable_entity_loader( $oldDisable );
		}
	}

	/**
	 * Advance the reader by one node, marking the document as malformed if
	 * any PHP error is raised while doing so.
	 *
	 * @param XMLReader $reader
	 * @return bool Return value of XMLReader::read()
	 */
	private function readNext( XMLReader $reader ) {
		// The handler receives ( $errno, $errstr, ... ); the details are not
		// needed, only the fact that an error was raised.
		set_error_handler( function ( $errno, $errstr ) {
			$this->wellFormed = false;
		} );
		try {
			return $reader->read();
		} finally {
			// Always uninstall the temporary handler, even if read() throws.
			restore_error_handler();
		}
	}

	/**
	 * Walk the document node by node, dispatching to the element, text,
	 * processing-instruction and doctype handlers, then settle
	 * $this->wellFormed.
	 *
	 * @param XMLReader $reader Freshly opened reader
	 */
	private function validate( XMLReader $reader ) {
		// First, move through anything that isn't an element, and
		// handle any processing instructions with the callback
		do {
			if ( !$this->readNext( $reader ) ) {
				// Hit the end of the document before any elements
				$this->wellFormed = false;
				return;
			}
			if ( $reader->nodeType === XMLReader::PI ) {
				$this->processingInstructionHandler( $reader->name, $reader->value );
			}
			if ( $reader->nodeType === XMLReader::DOC_TYPE ) {
				$this->dtdHandler( $reader );
			}
		} while ( $reader->nodeType !== XMLReader::ELEMENT );

		// Process the rest of the document
		do {
			switch ( $reader->nodeType ) {
				case XMLReader::ELEMENT:
					$name = $this->expandNS(
						$reader->name,
						$reader->namespaceURI
					);
					if ( $this->rootElement === '' ) {
						$this->rootElement = $name;
					}
					// Capture this before getAttributesArray() moves the
					// reader's cursor onto the attribute nodes.
					$empty = $reader->isEmptyElement;
					$attrs = $this->getAttributesArray( $reader );
					$this->elementOpen( $name, $attrs );
					if ( $empty ) {
						// Self-closing element: no END_ELEMENT node will
						// follow, so close it right away.
						$this->elementClose();
					}
					break;

				case XMLReader::END_ELEMENT:
					$this->elementClose();
					break;

				case XMLReader::WHITESPACE:
				case XMLReader::SIGNIFICANT_WHITESPACE:
				case XMLReader::CDATA:
				case XMLReader::TEXT:
					$this->elementData( $reader->value );
					break;

				case XMLReader::ENTITY_REF:
					// Unexpanded entity (maybe external?),
					// don't send to the filter (xml_parse didn't)
					break;

				case XMLReader::COMMENT:
					// Don't send to the filter (xml_parse didn't)
					break;

				case XMLReader::PI:
					// Processing instructions can happen after the header too
					$this->processingInstructionHandler(
						$reader->name,
						$reader->value
					);
					break;

				case XMLReader::DOC_TYPE:
					// We should never see a doctype after first
					// element.
					$this->wellFormed = false;
					break;

				default:
					// One of DOC, ENTITY, END_ENTITY,
					// NOTATION, or XML_DECLARATION
					// xml_parse didn't send these to the filter, so we won't.
			}
		} while ( $this->readNext( $reader ) );

		if ( $this->stackDepth !== 0 ) {
			// Some element was opened but never closed.
			$this->wellFormed = false;
		} elseif ( $this->wellFormed === null ) {
			// Nothing flagged a problem along the way.
			$this->wellFormed = true;
		}
	}

	/**
	 * Get all of the attributes for an XMLReader's current node
	 *
	 * Namespace declarations (xmlns attributes) are left out. The reader's
	 * cursor is moved onto the attributes while iterating.
	 *
	 * @param XMLReader $r
	 * @return array Attribute values keyed by expanded attribute name
	 */
	private function getAttributesArray( XMLReader $r ) {
		$attrs = [];
		while ( $r->moveToNextAttribute() ) {
			if ( $r->namespaceURI === 'http://www.w3.org/2000/xmlns/' ) {
				// XMLReader treats xmlns attributes as normal
				// attributes, while xml_parse doesn't
				continue;
			}
			$name = $this->expandNS( $r->name, $r->namespaceURI );
			$attrs[$name] = $r->value;
		}
		return $attrs;
	}

	/**
	 * Replace a name's prefix (if any) with its full namespace URI.
	 *
	 * @param string $name element or attribute name, maybe with a full or short prefix
	 * @param string $namespaceURI
	 * @return string the name prefixed with namespaceURI, or $name unchanged
	 *  when $namespaceURI is empty
	 */
	private function expandNS( $name, $namespaceURI ) {
		if ( $namespaceURI ) {
			// Keep only the part after the last colon as the local name.
			$parts = explode( ':', $name );
			$localname = array_pop( $parts );
			return "$namespaceURI:$localname";
		}
		return $name;
	}

	/**
	 * Push a newly opened element onto the context and text stacks.
	 *
	 * @param string $name
	 * @param array $attribs
	 */
	private function elementOpen( $name, $attribs ) {
		$this->elementDataContext[] = [ $name, $attribs ];
		$this->elementData[] = '';
		$this->stackDepth++;
	}

	/**
	 * Pop the innermost open element and run the filter callback (if one is
	 * callable) on its name, attributes and collected text.
	 */
	private function elementClose() {
		list( $name, $attribs ) = array_pop( $this->elementDataContext );
		$data = array_pop( $this->elementData );
		$this->stackDepth--;
		$callbackReturn = false;

		if ( is_callable( $this->filterCallback ) ) {
			$callbackReturn = ( $this->filterCallback )( $name, $attribs, $data );
		}
		if ( $callbackReturn ) {
			// Filter hit!
$this->filterMatch = true; $this->filterMatchType = $callbackReturn; } } /** * @param string $data */ private function elementData( $data ) { // Collect any data here, and we'll run the callback in elementClose $this->elementData[ $this->stackDepth - 1 ] .= trim( $data ); } /** * @param string $target * @param string $data */ private function processingInstructionHandler( $target, $data ) { $callbackReturn = false; if ( $this->parserOptions['processing_instruction_handler'] ) { // @phan-suppress-next-line PhanTypeInvalidCallable false positive $callbackReturn = $this->parserOptions['processing_instruction_handler']( $target, $data ); } if ( $callbackReturn ) { // Filter hit! $this->filterMatch = true; $this->filterMatchType = $callbackReturn; } } /** * Handle coming across a parserOptions['external_dtd_handler']; $generalCallback = $this->parserOptions['dtd_handler']; $checkIfSafe = $this->parserOptions['require_safe_dtd']; if ( !$externalCallback && !$generalCallback && !$checkIfSafe ) { return; } $dtd = $reader->readOuterXml(); $callbackReturn = false; if ( $generalCallback ) { $callbackReturn = $generalCallback( $dtd ); } if ( $callbackReturn ) { // Filter hit! $this->filterMatch = true; $this->filterMatchType = $callbackReturn; $callbackReturn = false; } $parsedDTD = $this->parseDTD( $dtd ); if ( $externalCallback && isset( $parsedDTD['type'] ) ) { $callbackReturn = $externalCallback( $parsedDTD['type'], $parsedDTD['publicid'] ?? null, $parsedDTD['systemid'] ?? null ); } if ( $callbackReturn ) { // Filter hit! $this->filterMatch = true; $this->filterMatchType = $callbackReturn; } if ( $checkIfSafe && isset( $parsedDTD['internal'] ) && !$this->checkDTDIsSafe( $parsedDTD['internal'] ) ) { $this->wellFormed = false; } } /** * Check if the internal subset of the DTD is safe. * * We whitelist an extremely restricted subset of DTD features. * * Safe is defined as: * * Only contains entity definitions (e.g. No 255 bytes). 
* * * allowed if matched exactly for compatibility with graphviz * * Comments. * * @param string $internalSubset The internal subset of the DTD * @return bool true if safe. */ private function checkDTDIsSafe( $internalSubset ) { $res = preg_match( '/^(?:\s*' . '|\s*' . '|\s*)*\s*$/', $internalSubset ); return (bool)$res; } /** * Parse DTD into parts. * * If there is an error parsing the dtd, sets wellFormed to false. * * @param string $dtd * @return array Possibly containing keys publicid, systemid, type and internal. */ private function parseDTD( $dtd ) { $m = []; $res = preg_match( '/^PUBLIC)\s*' . '(?:"(?P[^"]*)"|\'(?P[^\']*)\')' . // public identifer '\s*"(?P[^"]*)"|\'(?P[^\']*)\'' . // system identifier '|(?PSYSTEM)\s*' . '(?:"(?P[^"]*)"|\'(?P[^\']*)\')' . ')?\s*' . '(?:\[\s*(?P.*)\])?\s*>$/s', $dtd, $m ); if ( !$res ) { $this->wellFormed = false; return []; } $parsed = []; foreach ( $m as $field => $value ) { if ( $value === '' || is_numeric( $field ) ) { continue; } switch ( $field ) { case 'typepublic': case 'typesystem': $parsed['type'] = $value; break; case 'pubquote': case 'pubapos': $parsed['publicid'] = $value; break; case 'pubsysquote': case 'pubsysapos': case 'sysquote': case 'sysapos': $parsed['systemid'] = $value; break; case 'internal': $parsed['internal'] = $value; break; } } return $parsed; } }