invalid ) { return $text; } foreach ( $funcs as $func ) { $norm->$func(); } return $norm->serialize(); }

	/**
	 * Parse a markup fragment into an XML DOM so that it can be normalized.
	 *
	 * The fragment is wrapped in <html><body>...</body></html> so that it has a
	 * single root element and so that the body node can be located afterwards.
	 * If the XML parser rejects the input, $this->invalid is set to true.
	 *
	 * @param string $text Markup fragment to load
	 */
	protected function __construct( $text ) {
		$this->doc = new DOMDocument( '1.0', 'utf-8' );

		// Note: parsing a supposedly XHTML document with an XML parser is not
		// guaranteed to give accurate results. For example, it may introduce
		// differences in the number of line breaks in <pre> tags.

		// Parse errors are reported via the return value; silence the warnings
		// that loadXML() emits for malformed input.
		Wikimedia\suppressWarnings();
		if ( !$this->doc->loadXML( '<html><body>' . $text . '</body></html>' ) ) {
			$this->invalid = true;
		}
		Wikimedia\restoreWarnings();
		$this->xpath = new DOMXPath( $this->doc );
		// First <body> element, or null if the document failed to load
		$this->body = $this->xpath->query( '//body' )->item( 0 );
	}

	/**
	 * Unwrap every <tbody> element in the document.
	 *
	 * Each child of a <tbody> is moved, in order, to sit directly before the
	 * <tbody> in its parent, and the then-empty <tbody> is deleted.
	 */
	protected function removeTbody() {
		foreach ( $this->xpath->query( '//tbody' ) as $wrapper ) {
			$parent = $wrapper->parentNode;
			// Always take the current first child so document order is preserved
			for ( $node = $wrapper->firstChild; $node; $node = $wrapper->firstChild ) {
				$parent->insertBefore( $wrapper->removeChild( $node ), $wrapper );
			}
			$parent->removeChild( $wrapper );
		}
	}

	/**
	 * The point of this function is to produce a normalized DOM in which
	 * Tidy's output matches the output of RemexHtml. Tidy both trims
	 * and pretty-prints, so this requires fairly aggressive treatment.
	 *
	 * In particular, note that Tidy converts <pre>x</pre> to <pre>\nx\n</pre>,
	 * which theoretically affects display since the second line break is not
	 * ignored by compliant HTML parsers.
	 *
	 * Text nodes whose parent is a <pre> lose at most one leading and one
	 * trailing line break; all other text nodes are fully trimmed. Text nodes
	 * that end up empty are removed from the DOM.
	 */
	protected function trimWhitespace() {
		foreach ( $this->xpath->query( '//text()' ) as $child ) {
			$data = $child->data;
			if ( strtolower( $child->parentNode->nodeName ) === 'pre' ) {
				// Just trim one line break from the start and end.
				// Compare exactly one byte: substr_compare() without a length
				// would compare the whole string and only match a lone "\n".
				if ( strncmp( $data, "\n", 1 ) === 0 ) {
					$data = (string)substr( $data, 1 );
				}
				if ( substr( $data, -1 ) === "\n" ) {
					$data = (string)substr( $data, 0, -1 );
				}
			} else {
				// Trim all whitespace
				$data = trim( $data );
			}
			$child->data = $data;
			if ( $data === '' ) {
				$child->parentNode->removeChild( $child );
			}
		}
	}

	/**
	 * Serialize the XML DOM for comparison purposes. This does not generate HTML.
	 * @return string
	 */
	protected function serialize() {
		// saveXML() on the body node includes the wrapping tags themselves;
		// strip them so only the body's content is returned.
		return strtr(
			$this->doc->saveXML( $this->body ),
			[ '<body>' => '', '</body>' => '' ]
		);
	}
}