|null */
private $tagsWithChangedMisnestingBehavior = null;
/**
 * Cached regular expression matching obsolete tag names.
 * Stays null until logObsoleteHTMLTags() builds it on first use.
 * @var string|null
 */
private $obsoleteTagsRE = null;
/**
* We are trying to find HTML5 tags that have different behavior compared to HTML4
* in some misnesting scenarios around wikitext paragraphs.
*
* Ex: Input:        <p><b>a</p><p>b</b></p>
*     Tidy output:  <p><b>a</b></p><p><b>b</b></p>
*     HTML5 output: <p><b>a</b></p><p><b>b</b></p>
*
* So, all good here.
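* But, see how output changes when we use <span> instead
*
* Ex: Input:        <p><span>a</p><p>b</span></p>
*     Tidy output:  <p><span>a</span></p><p><span>b</span></p>
*     HTML5 output: <p><span>a</span></p><p>b</p>
*
* The source wikitext is "<span>a\n\nb</span>". The difference persists even
* when you have "<span>a\n\n<div>b</div>" or "<span>a\n\n{|\n|x\n|}\nbar".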
*
* This is because Tidy seems to be doing the equivalent of the HTML5 treebuilder's
* active formatting element reconstruction step on all *inline* elements.
* However, HTML5 parsers only do that on formatting elements. So, we need
* to compute which HTML5 tags are subject to this differential behavior.
*
* We compute that by excluding the following tags from the list of all HTML5 tags
* - If our sanitizer doesn't allow them, they will be escaped => ignore them
* - HTML4 block tags are excluded (obviously)
* - Void tags don't matter since they cannot wrap anything (obviously)
* - Active formatting elements have special handling in the HTML5 tree building
* algorithm where they are reconstructed to wrap all originally intended content.
* (ex: <b> above)
*
* Here is the list of 22 HTML5 tags that are affected:
* ABBR, BDI, BDO, CITE, DATA, DEL, DFN, INS, KBD, MARK,
* Q, RB, RP, RT, RTC, RUBY, SAMP, SPAN, SUB, SUP, TIME, VAR
*
* https://phabricator.wikimedia.org/T176363#3628173 verifies that this list of
* tags all demonstrate this behavior.
*
* @return array
* @phan-return array
*/
private function getTagsWithChangedMisnestingBehavior(): array {
	// Memoized on the instance: the result depends only on the static
	// Consts tables, so it is computed at most once.
	if ( $this->tagsWithChangedMisnestingBehavior !== null ) {
		return $this->tagsWithChangedMisnestingBehavior;
	}

	$affected = [];
	foreach ( Consts::$HTML['HTML5Tags'] as $tagName => $unused ) {
		// Only tags the sanitizer accepts as literal tags are candidates.
		if ( !isset( Consts::$Sanitizer['AllowedLiteralTags'][$tagName] ) ) {
			continue;
		}

		// HTML4 block tags, formatting tags and void tags are excluded.
		$excluded = isset( Consts::$HTML['HTML4BlockTags'][$tagName] ) ||
			isset( Consts::$HTML['FormattingTags'][$tagName] ) ||
			isset( Consts::$HTML['VoidTags'][$tagName] );
		if ( !$excluded ) {
			$affected[$tagName] = true;
		}
	}

	$this->tagsWithChangedMisnestingBehavior = $affected;
	return $affected;
}
/**
* Finds a matching node at the "start" of this node.
* @param DOMNode|null $node
* @param DOMElement $match
* @return DOMElement|null
*/
private function leftMostMisnestedDescendent( ?DOMNode $node, DOMElement $match ): ?DOMElement {
	// Descend along the first-child chain. The walk stops as soon as the
	// chain reaches something that is not an element (including null).
	$candidate = $node;
	while ( $candidate instanceof DOMElement ) {
		if ( DOMUtils::isMarkerMeta( $candidate, 'mw:Placeholder/StrippedTag' ) ) {
			// A stripped-tag placeholder settles the search either way:
			// it is the answer only if it records the same tag name.
			$strippedName = DOMDataUtils::getDataParsoid( $candidate )->name ?? null;
			return $strippedName === $match->nodeName ? $candidate : null;
		}

		if ( $candidate->nodeName === $match->nodeName ) {
			$candidateDp = DOMDataUtils::getDataParsoid( $candidate );
			$matchStx = DOMDataUtils::getDataParsoid( $match )->stx ?? null;
			if ( $matchStx === ( $candidateDp->stx ?? null ) &&
				!empty( $candidateDp->autoInsertedStart )
			) {
				// If this candidate's end tag was auto-inserted too, keep
				// following the chain of misnested pieces from it.
				return empty( $candidateDp->autoInsertedEnd )
					? $candidate
					: $this->getMatchingMisnestedNode( $candidate, $match );
			}
		}

		$candidate = $candidate->firstChild;
	}

	return null;
}
/**
* $node has an 'autoInsertedEnd' flag set on it. We are looking for
* its matching node that has an 'autoInsertedStart' flag set on it.
* This happens when the tree-builder fixes up misnested tags.
* This "adjacency" is wrt the HTML string. In a DOM, this can either
* be the next sibling OR, it might be the left-most-descendent
* of $node's parent's sibling (and so on up the ancestor chain).
*
* @param DOMNode $node
* @param DOMElement $match
* @return DOMElement|null
*/
private function getMatchingMisnestedNode( DOMNode $node, DOMElement $match ): ?DOMElement {
	// Reaching <body> means there is nothing left to search.
	if ( DOMUtils::isBody( $node ) ) {
		return null;
	}

	// Either inspect the next non-separator sibling's left-most descendants,
	// or, when there is no such sibling, retry from the parent.
	$sibling = DOMUtils::nextNonSepSibling( $node );
	return $sibling
		? $this->leftMostMisnestedDescendent( $sibling, $match )
		: $this->getMatchingMisnestedNode( $node->parentNode, $match );
}
/**
* Given a tplInfo object, determine whether we are:
* - Not processing template content (could be extension or top level page)
* - Processing encapsulated content that is produced by a single template.
* If so, return the name of that template.
* - Processing encapsulated content that comes from multiple templates.
* If so, return a flag indicating this.
*
* FIXME: We might potentially be computing this information redundantly
* for every lint we find within this template's content. It could probably
* be cached in tplInfo after it is computed once.
*
* @param Env $env
* @param stdClass|null $tplInfo Template info.
* @return array|null
*/
private function findEnclosingTemplateName( Env $env, ?stdClass $tplInfo ): ?array {
	// No template info, or the first node is not a transclusion wrapper:
	// there is no enclosing template to report.
	if ( !$tplInfo || !DOMUtils::hasTypeOf( $tplInfo->first, 'mw:Transclusion' ) ) {
		return null;
	}

	$dataMw = DOMDataUtils::getDataMw( $tplInfo->first );
	$hasSinglePart = !empty( $dataMw->parts ) && count( $dataMw->parts ) === 1;
	if ( !$hasSinglePart ) {
		return [ 'multiPartTemplateBlock' => true ];
	}

	$part = $dataMw->parts[0];
	if ( !empty( $part->template->target->href ) ) { // Could be "function"
		// PORT-FIXME: Should that be SiteConfig::relativeLinkPrefix() rather than './'?
		$templateName = preg_replace( '#^\./#', '', $part->template->target->href, 1 );
	} elseif ( !empty( $part->template ) ) {
		$templateName = trim( $part->template->target->wt );
	} else {
		$templateName = trim( $part->templatearg->target->wt );
	}

	return [ 'name' => $templateName ];
}
/**
* Compute the DSR information for the lint object.
* - In the common case, this is simply the DSR value of the node
* that generated the lint. But, occasionally, for some lints,
* we might have to post-process the node's DSR.
* - If the lint is found in template content, then the DSR spans
* the transclusion markup in the toplevel page source.
*
* @param array|null $tplLintInfo
* @param stdClass|null $tplInfo
* @param DomSourceRange|null $nodeDSR
* @param callable|null $updateNodeDSR
* @return DomSourceRange|null
*/
private function findLintDSR(
	?array $tplLintInfo, ?stdClass $tplInfo, ?DomSourceRange $nodeDSR, ?callable $updateNodeDSR = null
): ?DomSourceRange {
	// The nullable type on $updateNodeDSR is spelled out explicitly:
	// implicit nullability through a "= null" default is deprecated in PHP 8.4.
	if ( $tplLintInfo !== null || ( $tplInfo && !Utils::isValidDSR( $nodeDSR ) ) ) {
		// Use the DSR recorded on the first node of the template info
		// instead of the node's own DSR.
		return DOMDataUtils::getDataParsoid( $tplInfo->first )->dsr ?? null;
	}

	// Otherwise use the node's own DSR, post-processed if a callback was given.
	return $updateNodeDSR ? $updateNodeDSR( $nodeDSR ) : $nodeDSR;
}
/**
* Determine if a node has an identical nested tag (?)
* @param DOMElement $node
* @param string $name
* @return bool
*/
private function hasIdenticalNestedTag( DOMElement $node, string $name ): bool {
	$c = $node->firstChild;
	while ( $c ) {
		if ( $c instanceof DOMElement ) {
			// A nested tag only counts when its end tag was not auto-inserted.
			// This previously tested a misspelled flag ('autoInsertedInd'),
			// which made the check pass for every same-named element.
			if ( $c->nodeName === $name && empty( DOMDataUtils::getDataParsoid( $c )->autoInsertedEnd ) ) {
				return true;
			}

			// NOTE(review): only the first element child at each level is
			// followed; later siblings are never inspected. Presumably
			// intentional - confirm before changing.
			return $this->hasIdenticalNestedTag( $c, $name );
		}

		// Skip non-element nodes (text, comments, ...)
		$c = $c->nextSibling;
	}

	return false;
}
/**
* Determine if a node has misnestable content
* @param DOMNode $node
* @param string $name
* @return bool
*/
private function hasMisnestableContent( DOMNode $node, string $name ): bool {
	// For A, TD, TH, H* tags, Tidy doesn't seem to propagate
	// the unclosed tag outside these tags.
	// No need to check for tr/table since content cannot show up there
	if ( DOMUtils::isBody( $node ) ) {
		return false;
	}
	if ( preg_match( '/^(?:a|td|th|h\d)$/D', $node->nodeName ) ) {
		return false;
	}

	// Nothing follows at this level: look for following content one level up.
	$following = DOMUtils::nextNonSepSibling( $node );
	if ( !$following ) {
		return $this->hasMisnestableContent( $node->parentNode, $name );
	}

	// For a <p> that is not a literal HTML node, look at its first
	// non-separator child rather than at the paragraph itself.
	$isNonLiteralParagraph = $following->nodeName === 'p' &&
		!WTUtils::isLiteralHTMLNode( $following );
	$contentNode = $isNonLiteralParagraph
		? DOMUtils::firstNonSepChild( $following )
		: $following;

	if ( !$contentNode ) {
		return false;
	}

	// If the first "content" node we find is a matching
	// stripped tag, we have nothing that can get misnested
	$isMatchingStrippedTag = $contentNode instanceof DOMElement &&
		DOMUtils::isMarkerMeta( $contentNode, 'mw:Placeholder/StrippedTag' ) &&
		isset( DOMDataUtils::getDataParsoid( $contentNode )->name ) &&
		DOMDataUtils::getDataParsoid( $contentNode )->name === $name;

	return !$isMatchingStrippedTag;
}
/**
* Indicate whether an end tag is optional for this node
*
* See https://www.w3.org/TR/html5/syntax.html#optional-tags
*
* End tags for tr/td/th/li are entirely optional since they
* require a parent container and can only be followed by like
* kind.
*
* Caveat: <li>foo</li><ul>..</ul> and <li>foo<ul>..</ul></li>
* generate different DOM trees, so explicit </li> tag
* is required to specify which of the two was intended.
*
* With that one caveat around nesting, the parse with/without
* the end tag is identical. For now, ignoring that caveat
* since they aren't likely to show up in our corpus much.
*
* For the other tags in that w3c spec section, I haven't reasoned
* through when exactly they are optional. Not handling that complexity
* for now since those are likely uncommon use cases in our corpus.
*
* @param DOMNode $node
* @return bool
*/
private function endTagOptional( DOMNode $node ): bool {
	// Tag names whose end tag is treated as optional, keyed for direct lookup.
	static $optionalEndTags = [
		'tr' => true,
		'td' => true,
		'th' => true,
		'li' => true,
	];

	return isset( $optionalEndTags[$node->nodeName] );
}
/**
* Find the nearest ancestor heading tag
* @param DOMNode $node
* @return DOMNode|null
*/
private function getHeadingAncestor( DOMNode $node ): ?DOMNode {
	// Check $node itself first, then each ancestor in turn; the first
	// node named h1..h6 wins.
	for ( $current = $node; $current; $current = $current->parentNode ) {
		if ( preg_match( '/^h[1-6]$/D', $current->nodeName ) ) {
			return $current;
		}
	}

	// Ran off the top of the tree without meeting a heading.
	return null;
}
/**
* For formatting tags, Tidy seems to be doing this "smart" fixup of
* unclosed tags by looking for matching unclosed pairs of identical tags
* and if the content ends in non-whitespace text, it treats the second
* unclosed opening tag as a closing tag. But, a HTML5 parser won't do this.
* So, detect this pattern and flag for linter fixup.
*
* @param DOMNode $c
* @param stdClass $dp
* @return bool
*/
private function matchedOpenTagPairExists( DOMNode $c, stdClass $dp ): bool {
	// The last child has to be an element with the same tag name as $c ...
	$lastChild = $c->lastChild;
	$isSameTag = $lastChild instanceof DOMElement && $lastChild->nodeName === $c->nodeName;
	if ( !$isSameTag ) {
		return false;
	}

	// ... whose end tag was auto-inserted and whose 'stx' matches $dp's.
	$lastChildDp = DOMDataUtils::getDataParsoid( $lastChild );
	$isUnclosed = !empty( $lastChildDp->autoInsertedEnd );
	if ( !$isUnclosed || ( $lastChildDp->stx ?? null ) !== ( $dp->stx ?? null ) ) {
		return false;
	}

	// Finally, the node right before it must be text that does not
	// end in whitespace.
	$precedingNode = $lastChild->previousSibling;
	// PORT-FIXME: Do we care about non-ASCII whitespace here?
	return DOMUtils::isText( $precedingNode ) &&
		!preg_match( '/\s$/D', $precedingNode->nodeValue );
}
/**
* Log Treebuilder fixups marked by dom.markTreeBuilderFixup.js
*
* It handles the following scenarios:
*
* 1. Unclosed end tags
* 2. Unclosed start tags
* 3. Stripped tags
*
* In addition, we have specialized categories for some patterns
* where we encounter unclosed end tags.
*
* 4. misnested-tag
* 5. html5-misnesting
* 6. multiple-unclosed-formatting-tags
* 7. unclosed-quotes-in-heading
*
* @param Env $env
* @param DOMElement $c
* @param stdClass $dp
* @param stdClass|null $tplInfo
*/
private function logTreeBuilderFixup(
	Env $env, DOMElement $c, stdClass $dp, ?stdClass $tplInfo
): void {
	// This might have been processed as part of
	// misnested-tag category identification.
	if ( !empty( $dp->tmp->linted ) ) {
		return;
	}

	$templateInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	// During DSR computation, stripped meta tags
	// surrender their width to its previous sibling.
	// We record the original DSR in the tmp attribute
	// for that reason.
	$dsr = $this->findLintDSR( $templateInfo, $tplInfo, $dp->tmp->origDSR ?? $dp->dsr ?? null );
	$lintObj = null;
	if ( DOMUtils::isMarkerMeta( $c, 'mw:Placeholder/StrippedTag' ) ) {
		$lintObj = [
			'dsr' => $dsr,
			'templateInfo' => $templateInfo,
			'params' => [ 'name' => $dp->name ?? null ],
		];
		$env->recordLint( 'stripped-tag', $lintObj );
	}

	// Dont bother linting for auto-inserted start/end or self-closing-tag if:
	// 1. c is a void element
	//    Void elements won't have auto-inserted start/end tags
	//    and self-closing versions are valid for them.
	//
	// 2. c is tbody (FIXME: don't remember why we have this exception)
	//
	// 3. c is not an HTML element (unless they are i/b quotes)
	//
	// 4. c doesn't have DSR info and doesn't come from a template either
	$cNodeName = strtolower( $c->nodeName );
	$ancestor = null;
	$isHtmlElement = WTUtils::hasLiteralHTMLMarker( $dp );
	if ( !Utils::isVoidElement( $cNodeName ) &&
		$cNodeName !== 'tbody' &&
		( $isHtmlElement || DOMUtils::isQuoteElt( $c ) ) &&
		( $tplInfo !== null || $dsr !== null )
	) {
		if ( !empty( $dp->selfClose ) && $cNodeName !== 'meta' ) {
			$lintObj = [
				'dsr' => $dsr,
				'templateInfo' => $templateInfo,
				'params' => [ 'name' => $cNodeName ],
			];
			$env->recordLint( 'self-closed-tag', $lintObj );
			// The other checks won't pass - no need to test them.
			return;
		}

		if (
			( $dp->autoInsertedEnd ?? null ) === true &&
			( $tplInfo || ( $dsr->openWidth ?? 0 ) > 0 )
		) {
			$lintObj = [
				'dsr' => $dsr,
				'templateInfo' => $templateInfo,
				'params' => [ 'name' => $cNodeName ],
			];

			// FIXME: This literal html marker check is strictly not required
			// (a) we've already checked that above and know that isQuoteElt is
			//     not one of our tags.
			// (b) none of the tags in the list have native wikitext syntax =>
			//     they will show up as literal html tags.
			// But, in the interest of long-term maintenance in the face of
			// changes (to wikitext or html specs), let us make it explicit.
			if ( $isHtmlElement &&
				isset( $this->getTagsWithChangedMisnestingBehavior()[$c->nodeName] ) &&
				$this->hasMisnestableContent( $c, $c->nodeName ) &&
				// Tidy WTF moment here!
				// I don't know why Tidy does something very different
				// when there is an identical nested tag here.
				//
				// NOTE(review): the markup in the three snippets below was
				// lost from this comment and has been reconstructed from
				// the surrounding description - confirm against upstream.
				//
				// <p><span id='1'>a<span>X</span></p><p>b</span></p>
				// vs.
				// <p><span id='1'>a</p><p>b</span></p>  OR
				// <p><span id='1'>a<del>X</del></p><p>b</span></p>
				//
				// For the first snippet, Tidy only wraps "a" with the id='1' span
				// For the second and third snippets, Tidy wraps "b" with the id='1' span as well.
				//
				// For the corresponding wikitext that generates the above token stream,
				// Parsoid (and Remex) won't wrap 'b' with the id=1' span at all.
				!$this->hasIdenticalNestedTag( $c, $c->nodeName )
			) {
				$env->recordLint( 'html5-misnesting', $lintObj );
			// phpcs:ignore MediaWiki.ControlStructures.AssignmentInControlStructures.AssignmentInControlStructures
			} elseif ( !$isHtmlElement && DOMUtils::isQuoteElt( $c ) &&
				( $ancestor = $this->getHeadingAncestor( $c->parentNode ) )
			) {
				$lintObj['params']['ancestorName'] = strtolower( $ancestor->nodeName );
				$env->recordLint( 'unclosed-quotes-in-heading', $lintObj );
			} else {
				$adjNode = $this->getMatchingMisnestedNode( $c, $c );
				if ( $adjNode ) {
					// Mark the matching node so it is skipped (see the
					// 'linted' check at the top) when it is visited itself.
					$adjDp = DOMDataUtils::getDataParsoid( $adjNode );
					if ( !isset( $adjDp->tmp ) ) {
						$adjDp->tmp = new stdClass;
					}
					$adjDp->tmp->linted = true;
					$env->recordLint( 'misnested-tag', $lintObj );
				} elseif ( !$this->endTagOptional( $c ) && empty( $dp->autoInsertedStart ) ) {
					$lintObj['params']['inTable'] = DOMUtils::hasAncestorOfName( $c, 'table' );
					$env->recordLint( 'missing-end-tag', $lintObj );
					if ( isset( Consts::$HTML['FormattingTags'][$c->nodeName] ) &&
						$this->matchedOpenTagPairExists( $c, $dp )
					) {
						$env->recordLint( 'multiple-unclosed-formatting-tags', $lintObj );
					}
				}
			}
		}
	}
}
/**
* Log fostered content marked by markFosteredContent.js
*
* This will log cases like:
*
* {|
* foo
* |-
* | bar
* |}
*
* Here 'foo' gets fostered out.
*
* @param Env $env
* @param DOMElement $node
* @param stdClass $dp
* @param stdClass|null $tplInfo
* @return DOMElement|null
*/
private function logFosteredContent(
	Env $env, DOMElement $node, stdClass $dp, ?stdClass $tplInfo
): ?DOMElement {
	// Scan the following siblings for a <table>, noting whether the scan
	// walks over the last node of the current template.
	$passedTemplateEnd = false;
	$candidate = $node->nextSibling;
	for ( ; $candidate && $candidate->nodeName !== 'table'; $candidate = $candidate->nextSibling ) {
		if ( $tplInfo && $candidate === $tplInfo->last ) {
			$passedTemplateEnd = true;
		}
	}

	if ( !( $candidate instanceof DOMElement ) ) {
		return null;
	}
	if ( $passedTemplateEnd && $tplInfo ) {
		$tplInfo->clear = true;
	}

	// In pathological cases, we might walk past fostered nodes
	// that carry templating information. This then triggers
	// other errors downstream. So, walk back to that first node
	// and ignore this fostered content error. The new node will
	// trigger fostered content lint error.
	$maybeInsideEncapsulation = !$tplInfo &&
		WTUtils::hasParsoidAboutId( $candidate ) &&
		!WTUtils::isFirstEncapsulationWrapperNode( $candidate );
	if ( $maybeInsideEncapsulation ) {
		$wrapper = WTUtils::findFirstEncapsulationWrapperNode( $candidate );
		if ( $wrapper !== null ) {
			return $wrapper;
		}
		// We got misled by the about id on the table.
		// Let us carry on with regularly scheduled programming.
	}

	$templateInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$tableDSR = DOMDataUtils::getDataParsoid( $candidate )->dsr ?? null;
	$env->recordLint( 'fostered', [
		'dsr' => $this->findLintDSR( $templateInfo, $tplInfo, $tableDSR ),
		'templateInfo' => $templateInfo,
	] );

	return $candidate;
}
/**
* Log obsolete HTML tags
* @param Env $env
* @param DOMElement $c
* @param stdClass $dp
* @param stdClass|null $tplInfo
*/
private function logObsoleteHTMLTags(
	Env $env, DOMElement $c, stdClass $dp, ?stdClass $tplInfo
): void {
	// Build the obsolete-tag matcher on first use and cache it on the instance.
	if ( !$this->obsoleteTagsRE ) {
		$alternatives = [];
		foreach ( Consts::$HTML['OlderHTMLTags'] as $tag => $unused ) {
			// Looks like all existing editors let editors add the <big> tag.
			// VE has a button to add <big>, it seems so does the WikiEditor
			// and JS wikitext editor. So, don't flag BIG as an obsolete tag.
			if ( $tag === 'big' ) {
				continue;
			}
			$alternatives[] = preg_quote( $tag, '/' );
		}
		$this->obsoleteTagsRE = '/^(?:' . implode( '|', $alternatives ) . ')$/D';
	}

	$nodeName = $c->nodeName;

	// Skip elements whose start AND end tags were both auto-inserted.
	$fullyAutoInserted = !empty( $dp->autoInsertedStart ) && !empty( $dp->autoInsertedEnd );
	if ( !$fullyAutoInserted && preg_match( $this->obsoleteTagsRE, $nodeName ) ) {
		$templateInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
		$env->recordLint( 'obsolete-tag', [
			'dsr' => $this->findLintDSR( $templateInfo, $tplInfo, $dp->dsr ?? null ),
			'templateInfo' => $templateInfo,
			'params' => [ 'name' => $nodeName ],
		] );
	}

	if ( $nodeName !== 'font' || !$c->hasAttribute( 'color' ) ) {
		return;
	}

	/* ----------------------------------------------------------
	 * NOTE(review): the <font> markup in the examples below was
	 * lost from this comment and has been reconstructed from the
	 * surrounding prose - confirm against upstream.
	 *
	 * Tidy migrates <font> into the link in these cases
	 *     <font>[[Foo]]</font>
	 *     <font>[[Foo]]l</font> (link-trail)
	 *     <font><!--boo-->[[Foo]]</font>
	 *     <font>__NOTOC__[[Foo]]</font>
	 *     <font>[[Category:Foo]][[Foo]]</font>
	 *     <font>{{1x|[[Foo]]}}</font>
	 *
	 * Tidy does not migrate <font> into the link in these cases
	 *     <font> [[Foo]]</font>
	 *     <font>[[Foo]] </font>
	 *     <font>[[Foo]]L</font> (not a link-trail)
	 *     <font>[[Foo]][[Bar]]</font>
	 *     <font>[[Foo]][[Bar]]</font>
	 *
	 * <font> is special.
	 * This behavior is not seen with other formatting tags.
	 *
	 * Remex/parsoid won't do any of this.
	 * This difference in behavior only matters when the font tag
	 * specifies a link colour because the link no longer renders
	 * as blue/red but in the font-specified colour.
	 * ---------------------------------------------------------- */
	$child = $c->firstChild;
	$tidyFontBug = $child !== null;
	$linkSeen = false;
	while ( $child ) {
		$childName = $child->nodeName;

		// Anything besides a link, a rendering-transparent node or a
		// template marker meta rules the pattern out.
		if ( $childName !== 'a' &&
			!WTUtils::isRenderingTransparentNode( $child ) &&
			!WTUtils::isTplMarkerMeta( $child )
		) {
			$tidyFontBug = false;
			break;
		}

		// At most one link/figure is allowed.
		if ( $childName === 'a' || $childName === 'figure' ) {
			if ( $linkSeen ) {
				$tidyFontBug = false;
				break;
			}
			$linkSeen = true;
		}

		$child = $child->nextSibling;
	}

	if ( !$tidyFontBug ) {
		return;
	}

	$templateInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$env->recordLint( 'tidy-font-bug', [
		'dsr' => $this->findLintDSR( $templateInfo, $tplInfo, $dp->dsr ?? null ),
		'templateInfo' => $templateInfo,
		'params' => [ 'name' => 'font' ]
	] );
}
/**
* Log bogus (=unrecognized) media options
*
* See - https://www.mediawiki.org/wiki/Help:Images#Syntax
*
* @param Env $env
* @param DOMNode $c
* @param stdClass $dp
* @param stdClass|null $tplInfo
*/
private function logBogusMediaOptions(
	Env $env, DOMNode $c, stdClass $dp, ?stdClass $tplInfo
): void {
	// Only generated figures that carry a non-empty option list are of interest.
	if ( !WTUtils::isGeneratedFigure( $c ) || empty( $dp->optList ) ) {
		return;
	}

	// Collect the 'ak' value of every option whose 'ck' is 'bogus'.
	$bogusOptions = [];
	foreach ( $dp->optList as $option ) {
		if ( $option['ck'] === 'bogus' ) {
			$bogusOptions[] = $option['ak'];
		}
	}
	if ( !$bogusOptions ) {
		return;
	}

	$templateInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$lintObj = [
		'dsr' => $this->findLintDSR( $templateInfo, $tplInfo, $dp->dsr ?? null ),
		'templateInfo' => $templateInfo,
		'params' => [ 'items' => $bogusOptions ]
	];
	$env->recordLint( 'bogus-image-options', $lintObj );
}
/**
* Log tables Tidy deletes
*
* In this example below, the second table is in a fosterable position
* (inside a <tr>). The tree builder closes the first table at that point
* and starts a new table there. We are detecting this pattern because
* Tidy does something very different here. It strips the inner table
* and retains the outer table. So, for preserving rendering of pages
* that are tailored for Tidy, editors have to fix up this wikitext
* to strip the inner table (to mimic what Tidy does).
*
* {| style='border:1px solid red;'
* |a
* |-
* {| style='border:1px solid blue;'
* |b
* |c
* |}
* |}
*
* @param Env $env
* @param DOMNode $c
* @param stdClass $dp
* @param stdClass|null $tplInfo
*/
private function logDeletableTables(
	Env $env, DOMNode $c, stdClass $dp, ?stdClass $tplInfo
): void {
	if ( $c->nodeName !== 'table' ) {
		return;
	}

	// The pattern: this table directly follows (ignoring separators)
	// another table whose end tag was auto-inserted.
	$precedingNode = DOMUtils::previousNonSepSibling( $c );
	$followsAutoClosedTable = $precedingNode instanceof DOMElement &&
		$precedingNode->nodeName === 'table' &&
		!empty( DOMDataUtils::getDataParsoid( $precedingNode )->autoInsertedEnd );
	if ( !$followsAutoClosedTable ) {
		return;
	}

	$templateInfo = $this->findEnclosingTemplateName( $env, $tplInfo );

	// Identify the dsr-span of the opening tag
	// of the table that needs to be deleted
	$narrowToOpeningTag = function ( ?DomSourceRange $nodeDSR ): ?DomSourceRange {
		if ( $nodeDSR === null ) {
			return null;
		}
		// Work on a copy so the node's own DSR is left untouched.
		$openTagRange = clone $nodeDSR;
		if ( !empty( $openTagRange->openWidth ) ) {
			$openTagRange->end = $openTagRange->innerStart();
			$openTagRange->openWidth = 0;
			$openTagRange->closeWidth = 0;
		}
		return $openTagRange;
	};

	$env->recordLint( 'deletable-table-tag', [
		'dsr' => $this->findLintDSR( $templateInfo, $tplInfo, $dp->dsr ?? null, $narrowToOpeningTag ),
		'templateInfo' => $templateInfo,
		'params' => [ 'name' => 'table' ],
	] );
}
/**
* Find the first child passing the filter.
* @param DOMNode $node
* @param callable $filter
* @return DOMNode|null
*/
private function findMatchingChild( DOMNode $node, callable $filter ): ?DOMNode {
	// Scan the direct children in document order and stop at the first
	// one the filter accepts.
	for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
		if ( $filter( $child ) ) {
			return $child;
		}
	}

	// No child satisfied the filter (or there were no children).
	return null;
}
/**
* Test if the node has a 'nowrap' CSS rule
*
* In the general case, this CSS can come from a class,
* or from a