|null */
	private $tagsWithChangedMisnestingBehavior = null;

	/** @var string|null */
	private $obsoleteTagsRE = null;

	/**
	 * We are trying to find HTML5 tags that have different behavior compared to HTML4
	 * in some misnesting scenarios around wikitext paragraphs.
	 *
	 * Ex: Input:        <p><small>a</p><p>b</small></p>
	 *     Tidy output:  <p><small>a</small></p><p><small>b</small></p>
	 *     HTML5 output: <p><small>a</small></p><p><small>b</small></p>
	 *
	 * So, all good here.
	 * But, see how output changes when we use <span> instead
	 *
	 * Ex: Input:        <p><span>a</p><p>b</span></p>
	 *     Tidy output:  <p><span>a</span></p><p><span>b</span></p>
	 *     HTML5 output: <p><span>a</span></p><p>b</p>
	 *
	 * The source wikitext is "<span>a\n\nb</span>". The difference persists even
	 * when you have "<span>a\n\n<div>b</div>" or "<span>a\n\n{|\n|x\n|}\nbar".
	 *
	 * This is because Tidy seems to be doing the equivalent of the HTML5 treebuilder's
	 * active formatting element reconstruction step on all *inline* elements.
	 * However, HTML5 parsers only do that on formatting elements. So, we need
	 * to compute which HTML5 tags are subject to this differential behavior.
	 *
	 * We compute that by excluding the following tags from the list of all HTML5 tags
	 * - If our sanitizer doesn't allow them, they will be escaped => ignore them
	 * - HTML4 block tags are excluded (obviously)
	 * - Void tags don't matter since they cannot wrap anything (obviously)
	 * - Active formatting elements have special handling in the HTML5 tree building
	 *   algorithm where they are reconstructed to wrap all originally intended content.
	 *   (ex: <small> above)
	 *
	 * Here is the list of 22 HTML5 tags that are affected:
	 *    ABBR, BDI, BDO, CITE, DATA, DEL, DFN, INS, KBD, MARK,
	 *    Q, RB, RP, RT, RTC, RUBY, SAMP, SPAN, SUB, SUP, TIME, VAR
	 *
	 * https://phabricator.wikimedia.org/T176363#3628173 verifies that this list of
	 * tags all demonstrate this behavior.
	 *
	 * @return array Map of tag name => true
	 */
	private function getTagsWithChangedMisnestingBehavior(): array {
		// The map is built on first use and cached on the instance afterwards.
		if ( $this->tagsWithChangedMisnestingBehavior !== null ) {
			return $this->tagsWithChangedMisnestingBehavior;
		}

		$affected = [];
		foreach ( Consts::$HTML['HTML5Tags'] as $tagName => $unused ) {
			// Tags the sanitizer does not allow get escaped, so they are irrelevant here.
			if ( !isset( Consts::$Sanitizer['AllowedLiteralTags'][$tagName] ) ) {
				continue;
			}

			// Block, formatting and void tags are not subject to the differential behavior.
			if (
				isset( Consts::$HTML['HTML4BlockTags'][$tagName] ) ||
				isset( Consts::$HTML['FormattingTags'][$tagName] ) ||
				isset( Consts::$HTML['VoidTags'][$tagName] )
			) {
				continue;
			}

			$affected[$tagName] = true;
		}

		$this->tagsWithChangedMisnestingBehavior = $affected;
		return $affected;
	}

	/**
	 * Finds a matching node at the "start" of this node.
* @param DOMNode|null $node
	 * @param DOMElement $match
	 * @return DOMElement|null
	 */
	private function leftMostMisnestedDescendent(
		?DOMNode $node, DOMElement $match
	): ?DOMElement {
		// Follow the first-child chain downwards; anything that is not an
		// element (including a missing child) ends the search.
		for ( $cur = $node; $cur instanceof DOMElement; $cur = $cur->firstChild ) {
			if ( DOMUtils::isMarkerMeta( $cur, 'mw:Placeholder/StrippedTag' ) ) {
				// A stripped-tag marker only matches if it records the same tag name.
				$strippedName = DOMDataUtils::getDataParsoid( $cur )->name ?? null;
				if ( $strippedName === $match->nodeName ) {
					return $cur;
				}
				return null;
			}

			if ( $cur->nodeName !== $match->nodeName ) {
				continue;
			}

			$curDp = DOMDataUtils::getDataParsoid( $cur );
			$matchStx = DOMDataUtils::getDataParsoid( $match )->stx ?? null;
			if ( $matchStx !== ( $curDp->stx ?? null ) || empty( $curDp->autoInsertedStart ) ) {
				continue;
			}

			// Same tag, same syntax, auto-inserted start tag. If its end tag was
			// auto-inserted as well, keep following the chain from this node.
			return empty( $curDp->autoInsertedEnd )
				? $cur
				: $this->getMatchingMisnestedNode( $cur, $match );
		}

		return null;
	}

	/**
	 * Locate the node that pairs up with $node after a tree-builder fixup.
	 *
	 * $node carries the 'autoInsertedEnd' flag; its counterpart carries the
	 * 'autoInsertedStart' flag. The two are adjacent in the HTML string, which
	 * in DOM terms means the counterpart is found via $node's next sibling or,
	 * failing that, via the next sibling of an ancestor of $node (searching the
	 * left-most descendants of that sibling).
	 *
	 * @param DOMNode $node
	 * @param DOMElement $match
	 * @return DOMElement|null
	 */
	private function getMatchingMisnestedNode( DOMNode $node, DOMElement $match ): ?DOMElement {
		// The search never climbs past the body.
		if ( DOMUtils::isBody( $node ) ) {
			return null;
		}

		$sibling = DOMUtils::nextNonSepSibling( $node );

		return $sibling
			? $this->leftMostMisnestedDescendent( $sibling, $match )
			: $this->getMatchingMisnestedNode( $node->parentNode, $match );
	}

	/**
	 * Given a tplInfo object, determine whether we are:
	 * - Not processing template content (could be extension or top level page)
	 * - Processing encapsulated content that is produced by a single template.
	 *   If so, return the name of that template.
	 * - Processing encapsulated content that comes from multiple templates.
* If so, return a flag indicating this.
	 *
	 * FIXME: We might potentially be computing this information redundantly
	 * for every lint we find within this template's content. It could probably
	 * be cached in tplInfo after it is computed once.
	 *
	 * @param Env $env
	 * @param stdClass|null $tplInfo Template info.
	 * @return array|null Null when there is no template info or its first node
	 *   is not a transclusion; [ 'name' => ... ] for a single-part transclusion;
	 *   [ 'multiPartTemplateBlock' => true ] otherwise.
	 */
	private function findEnclosingTemplateName( Env $env, ?stdClass $tplInfo ): ?array {
		if ( !$tplInfo ) {
			return null;
		}

		if ( !DOMUtils::hasTypeOf( $tplInfo->first, 'mw:Transclusion' ) ) {
			return null;
		}

		$dmw = DOMDataUtils::getDataMw( $tplInfo->first );
		if ( empty( $dmw->parts ) || count( $dmw->parts ) !== 1 ) {
			return [ 'multiPartTemplateBlock' => true ];
		}

		$p0 = $dmw->parts[0];
		if ( !empty( $p0->template->target->href ) ) { // Could be "function"
			// PORT-FIXME: Should that be SiteConfig::relativeLinkPrefix() rather than './'?
			$name = preg_replace( '#^\./#', '', $p0->template->target->href, 1 );
		} elseif ( !empty( $p0->template ) ) {
			$name = trim( $p0->template->target->wt );
		} else {
			$name = trim( $p0->templatearg->target->wt );
		}

		return [ 'name' => $name ];
	}

	/**
	 * Compute the DSR information for the lint object.
	 * - In the common case, this is simply the DSR value of the node
	 *   that generated the lint. But, occasionally, for some lints,
	 *   we might have to post-process the node's DSR.
	 * - If the lint is found in template content, then the DSR spans
	 *   the transclusion markup in the toplevel page source.
	 *
	 * @param array|null $tplLintInfo
	 * @param stdClass|null $tplInfo
	 * @param DomSourceRange|null $nodeDSR
	 * @param callable|null $updateNodeDSR Optional post-processor applied to
	 *   $nodeDSR when the node's own DSR is used.
	 * @return DomSourceRange|null
	 */
	private function findLintDSR(
		?array $tplLintInfo, ?stdClass $tplInfo, ?DomSourceRange $nodeDSR,
		?callable $updateNodeDSR = null
	): ?DomSourceRange {
		// Inside a template, or when template info exists but the node's DSR is
		// unusable, report the DSR of the first node of the template range.
		if ( $tplLintInfo !== null || ( $tplInfo && !Utils::isValidDSR( $nodeDSR ) ) ) {
			return DOMDataUtils::getDataParsoid( $tplInfo->first )->dsr ?? null;
		}

		return $updateNodeDSR ? $updateNodeDSR( $nodeDSR ) : $nodeDSR;
	}

	/**
	 * Determine if a node has an identical nested tag, i.e. a descendant
	 * element named $name whose end tag was not auto-inserted.
	 *
	 * @param DOMElement $node
	 * @param string $name
	 * @return bool
	 */
	private function hasIdenticalNestedTag( DOMElement $node, string $name ): bool {
		$c = $node->firstChild;
		while ( $c ) {
			if ( $c instanceof DOMElement ) {
				// The flag to test is 'autoInsertedEnd'; a misspelt property name
				// would always be empty and make every same-named child match.
				if (
					$c->nodeName === $name &&
					empty( DOMDataUtils::getDataParsoid( $c )->autoInsertedEnd )
				) {
					return true;
				}

				// NOTE(review): only the first element child is ever descended
				// into; later element siblings are never examined. Confirm
				// that this is intended.
				return $this->hasIdenticalNestedTag( $c, $name );
			}

			$c = $c->nextSibling;
		}

		return false;
	}

	/**
	 * Determine if a node has misnestable content
	 *
	 * @param DOMNode $node
	 * @param string $name
	 * @return bool
	 */
	private function hasMisnestableContent( DOMNode $node, string $name ): bool {
		// For A, TD, TH, H* tags, Tidy doesn't seem to propagate
		// the unclosed tag outside these tags.
		// No need to check for tr/table since content cannot show up there
		if (
			DOMUtils::isBody( $node ) ||
			preg_match( '/^(?:a|td|th|h\d)$/D', $node->nodeName )
		) {
			return false;
		}

		$next = DOMUtils::nextNonSepSibling( $node );
		if ( !$next ) {
			// Nothing follows at this level: look after the parent instead.
			return $this->hasMisnestableContent( $node->parentNode, $name );
		}

		// For a <p> that is not a literal HTML node, inspect its first
		// non-separator child rather than the <p> itself.
		if ( $next->nodeName === 'p' && !WTUtils::isLiteralHTMLNode( $next ) ) {
			$contentNode = DOMUtils::firstNonSepChild( $next );
		} else {
			$contentNode = $next;
		}

		// If the first "content" node we find is a matching
		// stripped tag, we have nothing that can get misnested
		return $contentNode && !(
			$contentNode instanceof DOMElement &&
			DOMUtils::isMarkerMeta( $contentNode, 'mw:Placeholder/StrippedTag' ) &&
			isset( DOMDataUtils::getDataParsoid( $contentNode )->name ) &&
			DOMDataUtils::getDataParsoid( $contentNode )->name === $name
		);
	}

	/**
	 * Indicate whether an end tag is optional for this node
	 *
	 * See https://www.w3.org/TR/html5/syntax.html#optional-tags
	 *
	 * End tags for tr/td/th/li are entirely optional since they
	 * require a parent container and can only be followed by like
	 * kind.
	 *
	 * Caveat:
<li>foo</li><ol>..</ol> and <li>foo<ol>..</ol></li>
    * generate different DOM trees, so explicit
  </li> tag
	 * is required to specify which of the two was intended.
	 *
	 * With that one caveat around nesting, the parse with/without
	 * the end tag is identical. For now, ignoring that caveat
	 * since they aren't likely to show up in our corpus much.
	 *
	 * For the other tags in that w3c spec section, I haven't reasoned
	 * through when exactly they are optional. Not handling that complexity
	 * for now since those are likely uncommon use cases in our corpus.
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	private function endTagOptional( DOMNode $node ): bool {
		$name = $node->nodeName;

		return $name === 'tr' || $name === 'td' || $name === 'th' || $name === 'li';
	}

	/**
	 * Walk up from $node (inclusive) and return the first node whose
	 * name is h1 through h6, or null if there is none.
	 *
	 * @param DOMNode $node
	 * @return DOMNode|null
	 */
	private function getHeadingAncestor( DOMNode $node ): ?DOMNode {
		for ( $cur = $node; $cur; $cur = $cur->parentNode ) {
			if ( preg_match( '/^h[1-6]$/D', $cur->nodeName ) ) {
				return $cur;
			}
		}

		return null;
	}

	/**
	 * For formatting tags, Tidy seems to be doing this "smart" fixup of
	 * unclosed tags by looking for matching unclosed pairs of identical tags
	 * and if the content ends in non-whitespace text, it treats the second
	 * unclosed opening tag as a closing tag. But, a HTML5 parser won't do this.
	 * So, detect this pattern and flag for linter fixup.
	 *
	 * @param DOMNode $c
	 * @param stdClass $dp
	 * @return bool
	 */
	private function matchedOpenTagPairExists( DOMNode $c, stdClass $dp ): bool {
		// The last child must be an element with the same tag name as $c ...
		$lastChild = $c->lastChild;
		if ( !( $lastChild instanceof DOMElement && $lastChild->nodeName === $c->nodeName ) ) {
			return false;
		}

		// ... with an auto-inserted end tag and the same syntax as $c.
		$lastDp = DOMDataUtils::getDataParsoid( $lastChild );
		if ( empty( $lastDp->autoInsertedEnd ) ) {
			return false;
		}
		if ( ( $lastDp->stx ?? null ) !== ( $dp->stx ?? null ) ) {
			return false;
		}

		// Finally, it must be directly preceded by text that does not end in whitespace.
		$before = $lastChild->previousSibling;

		// PORT-FIXME: Do we care about non-ASCII whitespace here?
		return DOMUtils::isText( $before ) && !preg_match( '/\s$/D', $before->nodeValue );
	}

	/**
	 * Log Treebuilder fixups marked by dom.markTreeBuilderFixup.js
	 *
	 * It handles the following scenarios:
	 *
	 * 1.
Unclosed end tags
	 * 2. Unclosed start tags
	 * 3. Stripped tags
	 *
	 * In addition, we have specialized categories for some patterns
	 * where we encounter unclosed end tags.
	 *
	 * 4. misnested-tag
	 * 5. html5-misnesting
	 * 6. multiple-unclosed-formatting-tags
	 * 7. unclosed-quotes-in-heading
	 *
	 * @param Env $env
	 * @param DOMElement $c
	 * @param stdClass $dp
	 * @param stdClass|null $tplInfo
	 */
	private function logTreeBuilderFixup(
		Env $env, DOMElement $c, stdClass $dp, ?stdClass $tplInfo
	): void {
		// This might have been processed as part of
		// misnested-tag category identification.
		if ( !empty( $dp->tmp->linted ) ) {
			return;
		}

		$templateInfo = $this->findEnclosingTemplateName( $env, $tplInfo );

		// During DSR computation, stripped meta tags surrender their width
		// to their previous sibling. The original DSR is recorded in the tmp
		// attribute for that reason, so prefer it when present.
		$dsr = $this->findLintDSR(
			$templateInfo, $tplInfo, $dp->tmp->origDSR ?? $dp->dsr ?? null
		);

		if ( DOMUtils::isMarkerMeta( $c, 'mw:Placeholder/StrippedTag' ) ) {
			$env->recordLint( 'stripped-tag', [
				'dsr' => $dsr,
				'templateInfo' => $templateInfo,
				'params' => [ 'name' => $dp->name ?? null ],
			] );
		}

		$tagName = strtolower( $c->nodeName );
		$isLiteralHtml = WTUtils::hasLiteralHTMLMarker( $dp );

		// Don't bother linting for auto-inserted start/end or self-closing-tag if:
		// 1. c is a void element
		//    Void elements won't have auto-inserted start/end tags
		//    and self-closing versions are valid for them.
		// 2. c is tbody (FIXME: don't remember why we have this exception)
		if ( Utils::isVoidElement( $tagName ) || $tagName === 'tbody' ) {
			return;
		}
		// 3. c is not an HTML element (unless they are i/b quotes)
		if ( !$isLiteralHtml && !DOMUtils::isQuoteElt( $c ) ) {
			return;
		}
		// 4. c doesn't have DSR info and doesn't come from a template either
		if ( $tplInfo === null && $dsr === null ) {
			return;
		}

		// Every lint recorded below starts from this same payload.
		$lint = [
			'dsr' => $dsr,
			'templateInfo' => $templateInfo,
			'params' => [ 'name' => $tagName ],
		];

		if ( !empty( $dp->selfClose ) && $tagName !== 'meta' ) {
			$env->recordLint( 'self-closed-tag', $lint );
			// The other checks won't pass - no need to test them.
			return;
		}

		// Everything from here on is about auto-inserted end tags.
		if ( ( $dp->autoInsertedEnd ?? null ) !== true ) {
			return;
		}
		if ( !$tplInfo && !( ( $dsr->openWidth ?? 0 ) > 0 ) ) {
			return;
		}

		$domName = $c->nodeName;

		// FIXME: This literal html marker check is strictly not required
		// (a) we've already checked that above and know that isQuoteElt is
		//     not one of our tags.
		// (b) none of the tags in the list have native wikitext syntax =>
		//     they will show up as literal html tags.
		// But, in the interest of long-term maintenance in the face of
		// changes (to wikitext or html specs), let us make it explicit.
		//
		// Tidy WTF moment here!
		// I don't know why Tidy does something very different
		// when there is an identical nested tag here.
		//
		// <p><span id='1'>a<span>X</span></p><p>b</span></p>
		// vs.
		// <p><span id='1'>a</p><p>b</span></p>  OR
		// <p><span id='1'>a<del>X</del></p><p>b</span></p>
		//
		// For the first snippet, Tidy only wraps "a" with the id='1' span
		// For the second and third snippets, Tidy wraps "b" with the id='1' span as well.
		//
		// For the corresponding wikitext that generates the above token stream,
		// Parsoid (and Remex) won't wrap 'b' with the id='1' span at all.
		if (
			$isLiteralHtml &&
			isset( $this->getTagsWithChangedMisnestingBehavior()[$domName] ) &&
			$this->hasMisnestableContent( $c, $domName ) &&
			!$this->hasIdenticalNestedTag( $c, $domName )
		) {
			$env->recordLint( 'html5-misnesting', $lint );
			return;
		}

		// Unclosed i/b quotes inside a heading get their own category.
		if ( !$isLiteralHtml && DOMUtils::isQuoteElt( $c ) ) {
			$heading = $this->getHeadingAncestor( $c->parentNode );
			if ( $heading ) {
				$lint['params']['ancestorName'] = strtolower( $heading->nodeName );
				$env->recordLint( 'unclosed-quotes-in-heading', $lint );
				return;
			}
		}

		$partner = $this->getMatchingMisnestedNode( $c, $c );
		if ( $partner ) {
			// Mark the other half of the pair so it is not linted again
			// when this method later runs on it.
			$partnerDp = DOMDataUtils::getDataParsoid( $partner );
			if ( !isset( $partnerDp->tmp ) ) {
				$partnerDp->tmp = new stdClass;
			}
			$partnerDp->tmp->linted = true;
			$env->recordLint( 'misnested-tag', $lint );
			return;
		}

		if ( $this->endTagOptional( $c ) || !empty( $dp->autoInsertedStart ) ) {
			return;
		}

		$lint['params']['inTable'] = DOMUtils::hasAncestorOfName( $c, 'table' );
		$env->recordLint( 'missing-end-tag', $lint );

		if (
			isset( Consts::$HTML['FormattingTags'][$domName] ) &&
			$this->matchedOpenTagPairExists( $c, $dp )
		) {
			$env->recordLint( 'multiple-unclosed-formatting-tags', $lint );
		}
	}

	/**
	 * Log fostered content marked by markFosteredContent.js
	 *
	 * This will log cases like:
	 *
	 * {|
	 * foo
	 * |-
	 * | bar
	 * |}
	 *
	 * Here 'foo' gets fostered out.
*
	 * @param Env $env
	 * @param DOMElement $node
	 * @param stdClass $dp
	 * @param stdClass|null $tplInfo
	 * @return DOMElement|null
	 */
	private function logFosteredContent(
		Env $env, DOMElement $node, stdClass $dp, ?stdClass $tplInfo
	): ?DOMElement {
		// Scan forward through the following siblings for a table, noting
		// whether the last node of the template range is passed on the way.
		$passedTplEnd = false;
		for (
			$candidate = $node->nextSibling;
			$candidate && $candidate->nodeName !== 'table';
			$candidate = $candidate->nextSibling
		) {
			if ( $tplInfo && $candidate === $tplInfo->last ) {
				$passedTplEnd = true;
			}
		}

		if ( !$candidate instanceof DOMElement ) {
			return null;
		}
		if ( $passedTplEnd && $tplInfo ) {
			$tplInfo->clear = true;
		}

		// In pathological cases, we might walk past fostered nodes
		// that carry templating information. This then triggers
		// other errors downstream. So, walk back to that first node
		// and ignore this fostered content error. The new node will
		// trigger fostered content lint error.
		if (
			!$tplInfo &&
			WTUtils::hasParsoidAboutId( $candidate ) &&
			!WTUtils::isFirstEncapsulationWrapperNode( $candidate )
		) {
			$wrapper = WTUtils::findFirstEncapsulationWrapperNode( $candidate );
			if ( $wrapper !== null ) {
				return $wrapper;
			}
			// We got misled by the about id on the table.
			// Let us carry on with regularly scheduled programming.
		}

		$templateInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
		$tableDsr = DOMDataUtils::getDataParsoid( $candidate )->dsr ?? null;
		$env->recordLint( 'fostered', [
			'dsr' => $this->findLintDSR( $templateInfo, $tplInfo, $tableDsr ),
			'templateInfo' => $templateInfo,
		] );

		return $candidate;
	}

	/**
	 * Log obsolete HTML tags, plus the Tidy-specific <font color> link quirk.
	 *
	 * @param Env $env
	 * @param DOMElement $c
	 * @param stdClass $dp
	 * @param stdClass|null $tplInfo
	 */
	private function logObsoleteHTMLTags(
		Env $env, DOMElement $c, stdClass $dp, ?stdClass $tplInfo
	): void {
		// Build the tag-name regexp on first use and cache it on the instance.
		if ( !$this->obsoleteTagsRE ) {
			$alternatives = [];
			foreach ( Consts::$HTML['OlderHTMLTags'] as $tag => $unused ) {
				// Looks like all existing editors let editors add the <big> tag.
				// VE has a button to add <big>, it seems so does the WikiEditor
				// and JS wikitext editor. So, don't flag BIG as an obsolete tag.
				if ( $tag === 'big' ) {
					continue;
				}
				$alternatives[] = preg_quote( $tag, '/' );
			}
			$this->obsoleteTagsRE = '/^(?:' . implode( '|', $alternatives ) . ')$/D';
		}

		// Skip elements whose start AND end tags were both auto-inserted.
		$notFullyAutoInserted =
			empty( $dp->autoInsertedStart ) || empty( $dp->autoInsertedEnd );
		if ( $notFullyAutoInserted && preg_match( $this->obsoleteTagsRE, $c->nodeName ) ) {
			$templateInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
			$env->recordLint( 'obsolete-tag', [
				'dsr' => $this->findLintDSR( $templateInfo, $tplInfo, $dp->dsr ?? null ),
				'templateInfo' => $templateInfo,
				'params' => [ 'name' => $c->nodeName ],
			] );
		}

		if ( $c->nodeName !== 'font' || !$c->hasAttribute( 'color' ) ) {
			return;
		}

		/* ----------------------------------------------------------
		 * Tidy migrates <font> into the link in these cases
		 *     <font>[[Foo]]</font>
		 *     <font>[[Foo]]l</font> (link-trail)
		 *     <font><!--boo-->[[Foo]]</font>
		 *     <font>__NOTOC__[[Foo]]</font>
		 *     <font>[[Category:Foo]][[Foo]]</font>
		 *     <font>{{1x|[[Foo]]}}</font>
		 *
		 * Tidy does not migrate <font> into the link in these cases
		 *     <font> [[Foo]]</font>
		 *     <font>[[Foo]] </font>
		 *     <font>[[Foo]]L</font> (not a link-trail)
		 *     <font>[[Foo]][[Bar]]</font>
		 *     <font>[[Foo]][[Bar]]</font>
		 *
		 * <font> is special.
		 * This behavior is not seen with other formatting tags.
		 *
		 * Remex/parsoid won't do any of this.
		 * This difference in behavior only matters when the font tag
		 * specifies a link colour because the link no longer renders
		 * as blue/red but in the font-specified colour.
		 * ---------------------------------------------------------- */
		if ( $c->firstChild === null ) {
			return;
		}

		// The pattern holds only if every child is either an <a> or something
		// rendering-transparent / a template marker, with at most one
		// <a> or <figure> among the children.
		$linkSeen = false;
		for ( $child = $c->firstChild; $child; $child = $child->nextSibling ) {
			$childName = $child->nodeName;
			if (
				$childName !== 'a' &&
				!WTUtils::isRenderingTransparentNode( $child ) &&
				!WTUtils::isTplMarkerMeta( $child )
			) {
				return;
			}

			if ( $childName === 'a' || $childName === 'figure' ) {
				if ( $linkSeen ) {
					return;
				}
				$linkSeen = true;
			}
		}

		$templateInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
		$env->recordLint( 'tidy-font-bug', [
			'dsr' => $this->findLintDSR( $templateInfo, $tplInfo, $dp->dsr ?? null ),
			'templateInfo' => $templateInfo,
			'params' => [ 'name' => 'font' ]
		] );
	}

	/**
	 * Log bogus (=unrecognized) media options
	 *
	 * See - https://www.mediawiki.org/wiki/Help:Images#Syntax
	 *
	 * @param Env $env
	 * @param DOMNode $c
	 * @param stdClass $dp
	 * @param stdClass|null $tplInfo
	 */
	private function logBogusMediaOptions(
		Env $env, DOMNode $c, stdClass $dp, ?stdClass $tplInfo
	): void {
		if ( !WTUtils::isGeneratedFigure( $c ) || empty( $dp->optList ) ) {
			return;
		}

		// Collect the 'ak' value of every option whose 'ck' is 'bogus'.
		$bogusOptions = [];
		foreach ( $dp->optList as $option ) {
			if ( $option['ck'] === 'bogus' ) {
				$bogusOptions[] = $option['ak'];
			}
		}
		if ( !$bogusOptions ) {
			return;
		}

		$templateInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
		$env->recordLint( 'bogus-image-options', [
			'dsr' => $this->findLintDSR( $templateInfo, $tplInfo, $dp->dsr ?? null ),
			'templateInfo' => $templateInfo,
			'params' => [ 'items' => $bogusOptions ]
		] );
	}

	/**
	 * Log tables Tidy deletes
	 *
	 * In this example below, the second table is in a fosterable position
	 * (inside a <tr>). The tree builder closes the first table at that point
	 * and starts a new table there. We are detecting this pattern because
	 * Tidy does something very different here. It strips the inner table
	 * and retains the outer table. So, for preserving rendering of pages
	 * that are tailored for Tidy, editors have to fix up this wikitext
	 * to strip the inner table (to mimic what Tidy does).
	 *
	 *   {| style='border:1px solid red;'
	 *   |a
	 *   |-
	 *   {| style='border:1px solid blue;'
	 *   |b
	 *   |c
	 *   |}
	 *   |}
	 *
	 * @param Env $env
	 * @param DOMNode $c
	 * @param stdClass $dp
	 * @param stdClass|null $tplInfo
	 */
	private function logDeletableTables(
		Env $env, DOMNode $c, stdClass $dp, ?stdClass $tplInfo
	): void {
		if ( $c->nodeName !== 'table' ) {
			return;
		}

		// Only a table directly preceded (ignoring separators) by another
		// table whose end tag was auto-inserted is of interest.
		$prev = DOMUtils::previousNonSepSibling( $c );
		$prevIsAutoClosedTable =
			$prev instanceof DOMElement &&
			$prev->nodeName === 'table' &&
			!empty( DOMDataUtils::getDataParsoid( $prev )->autoInsertedEnd );
		if ( !$prevIsAutoClosedTable ) {
			return;
		}

		// Identify the dsr-span of the opening tag
		// of the table that needs to be deleted
		$toOpenTagSpan = function ( ?DomSourceRange $nodeDSR ): ?DomSourceRange {
			if ( $nodeDSR === null ) {
				return null;
			}
			$span = clone $nodeDSR;
			if ( !empty( $span->openWidth ) ) {
				$span->end = $span->innerStart();
				$span->openWidth = 0;
				$span->closeWidth = 0;
			}
			return $span;
		};

		$templateInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
		$dsr = $this->findLintDSR(
			$templateInfo, $tplInfo, $dp->dsr ?? null, $toOpenTagSpan
		);
		$env->recordLint( 'deletable-table-tag', [
			'dsr' => $dsr,
			'templateInfo' => $templateInfo,
			'params' => [ 'name' => 'table' ],
		] );
	}

	/**
	 * Return the first child of $node for which $filter returns a truthy
	 * value, or null if no child passes.
	 *
	 * @param DOMNode $node
	 * @param callable $filter
	 * @return DOMNode|null
	 */
	private function findMatchingChild( DOMNode $node, callable $filter ): ?DOMNode {
		for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
			if ( $filter( $child ) ) {
				return $child;
			}
		}

		return null;
	}

	/**
	 * Test if the node has a 'nowrap' CSS rule
	 *
	 * In the general case, this CSS can come from a class,
	 * or from a