tokenizer = new PegTokenizer( $env );
	}

	/**
	 * Placeholder pushed into the collected attribute text wherever a nowiki
	 * span was found. It must be non-empty (so it can be located again when
	 * the nowiki content is spliced back in) and must not contain a pipe
	 * (so the pipe-detecting regexps never match on it).
	 */
	private const NOWIKI_MARKER = '<nowiki-marker>';

	/**
	 * DOM visitor that strips the double td for this test case:
	 * ```
	 * |{{1x|{{!}} Foo}}
	 * ```
	 *
	 * When $node is an essentially empty, non-literal cell that is directly
	 * followed by a non-literal <td> which is (or looks like) a transclusion,
	 * $node is removed and its source is prepended to the data-mw parts of the
	 * following cell.
	 *
	 * @see https://phabricator.wikimedia.org/T52603
	 * @param DOMElement $node
	 * @param Frame $frame
	 * @return bool|DOMNode The following cell if $node was removed, true otherwise.
	 */
	public function stripDoubleTDs( DOMElement $node, Frame $frame ) {
		$nextNode = $node->nextSibling;

		if ( !WTUtils::isLiteralHTMLNode( $node ) &&
			$nextNode instanceof DOMElement &&
			$nextNode->nodeName === 'td' &&
			!WTUtils::isLiteralHTMLNode( $nextNode ) &&
			DOMUtils::nodeEssentiallyEmpty( $node ) && (
				// FIXME: will not be set for nested templates
				DOMUtils::hasTypeOf( $nextNode, 'mw:Transclusion' ) ||
				// Hacky work-around for nested templates
				preg_match( '/^{{.*?}}$/D', DOMDataUtils::getDataParsoid( $nextNode )->src ?? '' )
			)
		) {
			// Update the dsr. Since we are coalescing the first
			// node with the second (or, more precisely, deleting
			// the first node), we have to update the second DSR's
			// starting point and start tag width.
			$nodeDSR = DOMDataUtils::getDataParsoid( $node )->dsr ?? null;
			$nextNodeDP = DOMDataUtils::getDataParsoid( $nextNode );

			if ( $nodeDSR && !empty( $nextNodeDP->dsr ) ) {
				$nextNodeDP->dsr->start = $nodeDSR->start;
			}

			$dataMW = DOMDataUtils::getDataMw( $nextNode );
			$nodeSrc = WTUtils::getWTSource( $frame, $node );
			if ( !isset( $dataMW->parts ) ) {
				$dataMW->parts = [];
			}
			array_unshift( $dataMW->parts, $nodeSrc );

			// Delete the duplicated node.
			$node->parentNode->removeChild( $node );

			// This node was deleted, so don't continue processing on it.
			return $nextNode;
		}

		return true;
	}

	/**
	 * Check whether $node is a span carrying the mw:Transclusion type whose
	 * children are all text or comment nodes.
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public function isSimpleTemplatedSpan( DOMNode $node ): bool {
		return $node->nodeName === 'span' &&
			DOMUtils::hasTypeOf( $node, 'mw:Transclusion' ) &&
			DOMUtils::allChildrenAreTextOrComments( $node );
	}

	/**
	 * Move the transclusion encapsulation (typeof, about, data-mw, pi) from
	 * $child up to the enclosing cell $tdNode, and strip the encapsulation
	 * attributes from $child and its following about-siblings.
	 *
	 * @param Frame $frame
	 * @param DOMElement $child
	 * @param DOMElement $tdNode
	 */
	public function hoistTransclusionInfo(
		Frame $frame, DOMElement $child, DOMElement $tdNode
	): void {
		$aboutId = $child->getAttribute( 'about' );

		// Hoist all transclusion information from the child
		// to the parent tdNode.
		$tdNode->setAttribute( 'typeof', $child->getAttribute( 'typeof' ) );
		$tdNode->setAttribute( 'about', $aboutId );
		$dataMW = DOMDataUtils::getDataMw( $child );
		$parts = $dataMW->parts ?? [];
		$dp = DOMDataUtils::getDataParsoid( $tdNode );
		$childDP = DOMDataUtils::getDataParsoid( $child );

		Assert::invariant( Utils::isValidDSR( $childDP->dsr ?? null ), 'Expected valid DSR' );

		// In `handleTableCellTemplates`, we're creating a cell w/o dsr info.
		if ( !Utils::isValidDSR( $dp->dsr ?? null ) ) {
			$dp->dsr = clone $childDP->dsr;
		}

		// Get the td and content source up to the transclusion start
		if ( $dp->dsr->start < $childDP->dsr->start ) {
			$width = $childDP->dsr->start - $dp->dsr->start;
			array_unshift(
				$parts,
				PHPUtils::safeSubstr( $frame->getSrcText(), $dp->dsr->start, $width )
			);
		}

		// Add wikitext for the table cell content following the
		// transclusion. This is safe as we are currently only
		// handling a single transclusion in the content, which is
		// guaranteed to have a dsr that covers the transclusion
		// itself.
		if ( $childDP->dsr->end < $dp->dsr->end ) {
			$width = $dp->dsr->end - $childDP->dsr->end;
			$parts[] = PHPUtils::safeSubstr( $frame->getSrcText(), $childDP->dsr->end, $width );
		}

		// Save the new data-mw on the tdNode
		DOMDataUtils::setDataMw( $tdNode, (object)[ 'parts' => $parts ] );
		$dp->pi = $childDP->pi ?? [];
		DOMDataUtils::setDataMw( $child, null );

		// tdNode wraps everything now.
		// Remove template encapsulation from here on.
		// This simplifies the problem of analyzing the <td>
		// for additional fixups (|| Boo || Baz) by potentially
		// invoking 'reparseTemplatedAttributes' on split cells
		// with some modifications.
		while ( $child ) {
			if ( $child->nodeName === 'span' && $child->getAttribute( 'about' ) === $aboutId ) {
				// Remove the encapsulation attributes. If there are no more attributes left,
				// the span wrapper is useless and can be removed.
				$child->removeAttribute( 'about' );
				$child->removeAttribute( 'typeof' );
				if ( DOMDataUtils::noAttrs( $child ) ) {
					// Pick the continuation point before the span is unwrapped
					// and detached from the tree.
					$next = $child->firstChild ?: $child->nextSibling;
					DOMUtils::migrateChildren( $child, $tdNode, $child );
					$child->parentNode->removeChild( $child );
					$child = $next;
				} else {
					$child = $child->nextSibling;
				}
			} else {
				$child = $child->nextSibling;
			}
		}
	}

	/**
	 * Build the result
	 *
	 * @param array $buf Collected text fragments; concatenated into 'txt'.
	 * @param array $nowikis Collected nowiki contents, in document order.
	 * @param DOMElement|null $transclusionNode
	 * @return array Array with keys 'txt', 'nowikis' and 'transclusionNode'.
	 */
	private static function buildRes(
		array $buf, array $nowikis, ?DOMElement $transclusionNode
	): array {
		return [
			'txt' => implode( '', $buf ),
			'nowikis' => $nowikis,
			'transclusionNode' => $transclusionNode,
		];
	}

	/**
	 * Collect potential attribute content.
	 *
	 * We expect this to be text nodes without a pipe character followed by one or
	 * more nowiki spans, followed by a template encapsulation with pure-text and
	 * nowiki content. Collection stops when encountering other nodes or a pipe
	 * character.
	 *
	 * Each nowiki span contributes self::NOWIKI_MARKER to the collected text and
	 * its real content to the 'nowikis' list, so callers can splice the content
	 * back in after pattern matching.
	 *
	 * @param Env $env
	 * @param DOMElement $node
	 * @param DOMElement|null $templateWrapper
	 * @return array Result of buildRes(), or [ 'transclusionNode' => null ] when
	 *   more than one transclusion is encountered.
	 */
	public function collectAttributishContent(
		Env $env, DOMElement $node, ?DOMElement $templateWrapper
	): array {
		$buf = [];
		$nowikis = [];
		$transclusionNode = $templateWrapper ?:
			( DOMUtils::hasTypeOf( $node, 'mw:Transclusion' ) ? $node : null );

		$child = $node->firstChild;

		/*
		 * In this loop below, where we are trying to collect text content,
		 * it is safe to use child.textContent since textContent skips over
		 * comments. See this transcript of a node session:
		 *
		 *   > d.body.childNodes[0].outerHTML
		 *   '<span><!--foo-->bar</span>'
		 *   > d.body.childNodes[0].textContent
		 *   'bar'
		 *
		 * PHP parser strips comments during parsing, i.e. they don't impact
		 * how other wikitext constructs are parsed. So, in this code below,
		 * we have to skip over comments.
		 */
		while ( $child ) {
			if ( DOMUtils::isComment( $child ) ) {
				// <!--foo--> are not comments in CSS and PHP parser strips them
			} elseif ( DOMUtils::isText( $child ) ) {
				$buf[] = $child->nodeValue;
			} elseif ( $child->nodeName !== 'span' ) {
				// The idea here is that style attributes can only
				// be text/comment nodes, and nowiki-spans at best.
				// So, if we hit anything else, there is nothing more
				// to do here!
				return self::buildRes( $buf, $nowikis, $transclusionNode );
			} else {
				'@phan-var DOMElement $child';
				/** @var DOMElement $child */
				if ( DOMUtils::hasTypeOf( $child, 'mw:Entity' ) ) {
					$buf[] = $child->textContent;
				} elseif ( DOMUtils::hasTypeOf( $child, 'mw:Nowiki' ) ) {
					// Nowiki span were added to protect otherwise
					// meaningful wikitext chars used in attributes.
					// Save the content.
					$nowikis[] = $child->textContent;
					// And add in a marker to splice out later.
					$buf[] = self::NOWIKI_MARKER;
				} elseif ( $this->isSimpleTemplatedSpan( $child ) ) {
					// And only handle a single nested transclusion for now.
					// TODO: Handle data-mw construction for multi-transclusion content
					// as well, then relax this restriction.
					//
					// If we already had a transclusion node, we return
					// without attempting to fix this up.
					if ( $transclusionNode ) {
						$env->log(
							'error/dom/tdfixup',
							'Unhandled TD-fixup scenario.',
							'Encountered multiple transclusion children of a <td>'
						);
						return [ 'transclusionNode' => null ];
					}

					// We encountered a transclusion wrapper
					$buf[] = $child->textContent;
					$transclusionNode = $child;
				} elseif ( $transclusionNode &&
					DOMUtils::assertElt( $transclusionNode ) &&
					( !$child->hasAttribute( 'typeof' ) ) &&
					$child->getAttribute( 'about' ) === $transclusionNode->getAttribute( 'about' ) &&
					DOMUtils::allChildrenAreTextOrComments( $child )
				) {
					// Continue accumulating only if we hit grouped template content
					$buf[] = $child->textContent;
				} else {
					return self::buildRes( $buf, $nowikis, $transclusionNode );
				}
			}

			// Are we done accumulating? (The last fragment contains a single,
			// non-doubled pipe.)
			if ( count( $buf ) > 0 &&
				preg_match( '/(?:^|[^|])\|(?:[^|]|$)/D', PHPUtils::lastItem( $buf ) )
			) {
				return self::buildRes( $buf, $nowikis, $transclusionNode );
			}

			$child = $child->nextSibling;
		}

		return self::buildRes( $buf, $nowikis, $transclusionNode );
	}

	/**
	 * T46498, second part of T52603
	 *
	 * Handle wikitext like
	 * ```
	 * {|
	 * |{{nom|Bar}}
	 * |}
	 * ```
	 * where nom expands to `style="foo" class="bar"|Bar`. The attributes are
	 * tokenized and stripped from the table contents.
	 *
	 * This method works well for the templates documented in
	 * https://en.wikipedia.org/wiki/Template:Table_cell_templates/doc
	 *
	 * Nevertheless, there are some limitations:
	 * - We assume that attributes don't contain wiki markup (apart from <nowiki>)
	 *   and end up in text or nowiki nodes.
	 * - Only a single table cell is produced / opened by the template that
	 *   contains the attributes. This limitation could be lifted with more
	 *   aggressive re-parsing if really needed in practice.
	 * - There is only a single transclusion in the table cell content. This
	 *   limitation can be lifted with more advanced data-mw construction.
	 *
	 * @param Frame $frame
	 * @param DOMElement $node
	 * @param DOMElement|null $templateWrapper
	 */
	public function reparseTemplatedAttributes(
		Frame $frame, DOMElement $node, ?DOMElement $templateWrapper
	): void {
		$env = $frame->getEnv();

		// Collect attribute content and examine it
		$attributishContent = $this->collectAttributishContent( $env, $node, $templateWrapper );

		// Check for the pipe character in the attributish text.
		if ( !preg_match( '/^[^|]+\|([^|].*)?$/D', $attributishContent['txt'] ?? '' ) ) {
			return;
		}

		// Try to re-parse the attributish text content
		// PORT-CHECK-ME, it was refactored without testing!!!
		if ( preg_match( '/^[^|]+\|/', $attributishContent['txt'] ?? '', $matches ) ) {
			$attributishPrefix = $matches[0];
		} else {
			$attributishPrefix = '';
		}

		// Splice in nowiki content. We added in markers to prevent the
		// above regexps from matching on nowiki-protected chars.
		$markerRE = '/' . preg_quote( self::NOWIKI_MARKER, '/' ) . '/';
		if ( preg_match( $markerRE, $attributishPrefix ) ) {
			$attributishPrefix = preg_replace_callback(
				$markerRE,
				function ( $unused ) use ( &$attributishContent ) {
					// This is a little tricky. We want to use the content from the
					// nowikis to reparse the string to key/val pairs but the rule,
					// single_cell_table_args, will invariably get tripped up on
					// newlines which, to this point, were shuttled through in the
					// nowiki. php's sanitizer will do this replace in attr vals so
					// it's probably a safe assumption ...
					//
					// Fall back to '' if there are more markers than saved
					// nowikis (e.g. literal marker text in the source).
					return preg_replace(
						'/\s+/', ' ', array_shift( $attributishContent['nowikis'] ) ?? ''
					);
				},
				$attributishPrefix
			);
		}

		// re-parse the attributish prefix
		$attributeTokens = $this->tokenizer->tokenizeTableCellAttributes( $attributishPrefix, false );

		// No attributes => nothing more to do!
		if ( !$attributeTokens ) {
			return;
		}

		// Note that `row_syntax_table_args` (the rule used for tokenizing above)
		// returns an array consisting of [table_attributes, spaces, pipe]
		$attrs = $attributeTokens[0];

		// Found attributes; sanitize them
		// and transfer the sanitized attributes to the td node
		Sanitizer::applySanitizedArgs( $env, $node, $attrs );

		// If the transclusion node was embedded within the td node,
		// lift up the about group to the td node.
		$transclusionNode = $attributishContent['transclusionNode'] ?? null;
		if ( $transclusionNode !== null && $node !== $transclusionNode ) {
			$this->hoistTransclusionInfo( $frame, $transclusionNode, $node );
		}

		// Drop nodes that have been consumed by the reparsed attribute content.
		$n = $node->firstChild;
		while ( $n ) {
			if ( preg_match( '/[|]/', $n->textContent ) ) {
				// Remove the consumed prefix from the text node
				$nValue = $n->nodeName === '#text' ? $n->nodeValue : $n->textContent;
				// and convert it into a simple text node
				$textNode = $node->ownerDocument->createTextNode(
					preg_replace( '/^[^|]*[|]/', '', $nValue, 1 )
				);
				$node->replaceChild( $textNode, $n );
				break;
			} else {
				$next = $n->nextSibling;
				// content was consumed by attributes, so just drop it from the cell
				$node->removeChild( $n );
				$n = $next;
			}
		}
	}

	/**
	 * Check whether a cell has a direct text child, or a direct span child
	 * with a Parsoid about id, whose text contains a cell separator character
	 * ('|' for td; '|' or '!' for any other node name).
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public function needsReparsing( DOMNode $node ): bool {
		$testRE = ( $node->nodeName === 'td' ) ? '/[|]/' : '/[!|]/';
		$child = $node->firstChild;
		while ( $child ) {
			if ( DOMUtils::isText( $child ) && preg_match( $testRE, $child->textContent ) ) {
				return true;
			} elseif ( $child->nodeName === 'span' ) {
				if ( WTUtils::hasParsoidAboutId( $child ) &&
					preg_match( $testRE, $child->textContent )
				) {
					return true;
				}
			}
			$child = $child->nextSibling;
		}

		return false;
	}

	/**
	 * Fix up a table cell whose content came (partly) from a template:
	 * reparse templated attributes if the cell had none of its own, then
	 * split the cell at the first `||` (or `!!` for th) found in a text node
	 * or simple templated span.
	 *
	 * @param DOMElement $node
	 * @param Frame $frame
	 * @return bool Always true.
	 */
	public function handleTableCellTemplates( DOMElement $node, Frame $frame ): bool {
		// Don't bother with literal HTML nodes or nodes that don't need reparsing.
		if ( WTUtils::isLiteralHTMLNode( $node ) || !$this->needsReparsing( $node ) ) {
			return true;
		}

		// If the cell didn't have attrs, extract and reparse templated attrs
		$about = null;
		$dp = DOMDataUtils::getDataParsoid( $node );
		$hasAttrs = empty( $dp->tmp->noAttrs );

		if ( !$hasAttrs ) {
			$templateWrapper = DOMUtils::hasTypeOf( $node, 'mw:Transclusion' ) ? $node : null;
			$this->reparseTemplatedAttributes( $frame, $node, $templateWrapper );
		}

		// Now, examine the <td> to see if it hides additional <td>s
		// and split it up if required.
		//
		// DOMTraverser will process the new cell and invoke
		// handleTableCellTemplates on it which ensures that
		// if any addition attribute fixup or splits are required,
		// they will get done.
		$newCell = null;
		$ownerDoc = $node->ownerDocument;
		$child = $node->firstChild;
		while ( $child ) {
			$next = $child->nextSibling;

			if ( $newCell ) {
				// Everything after the split point belongs to the new cell.
				$newCell->appendChild( $child );
			} elseif ( DOMUtils::isText( $child ) || $this->isSimpleTemplatedSpan( $child ) ) {
				$cellName = $node->nodeName;
				$hasSpanWrapper = !DOMUtils::isText( $child );
				$match = null;

				if ( $cellName === 'td' ) {
					preg_match( '/^(.*?[^|])?\|\|([^|].*)?$/D', $child->textContent, $match );
				} else { /* cellName === 'th' */
					// Find the first match of || or !!
					preg_match( '/^(.*?[^|])?\|\|([^|].*)?$/D', $child->textContent, $match1 );
					preg_match( '/^(.*?[^!])?\!\!([^!].*)?$/D', $child->textContent, $match2 );
					if ( $match1 && $match2 ) {
						$match = strlen( $match1[1] ?? '' ) < strlen( $match2[1] ?? '' )
							? $match1
							: $match2;
					} else {
						$match = $match1 ?: $match2;
					}
				}

				if ( $match ) {
					$child->textContent = $match[1] ?? '';

					$newCell = $ownerDoc->createElement( $cellName );
					if ( $hasSpanWrapper ) {
						/**
						 * $hasSpanWrapper, above, ensures $child is a span.
						 *
						 * @var DOMElement $child
						 */
						'@phan-var DOMElement $child';
						// Fix up transclusion wrapping
						$about = $child->getAttribute( 'about' );
						$this->hoistTransclusionInfo( $frame, $child, $node );
					} else {
						// Refetch the about attribute since 'reparseTemplatedAttributes'
						// might have added one to it.
						$about = $node->getAttribute( 'about' );
					}

					// about may not be present if the cell was inside
					// wrapped template content rather than being part
					// of the outermost wrapper.
					if ( $about ) {
						$newCell->setAttribute( 'about', $about );
					}
					$newCell->appendChild( $ownerDoc->createTextNode( $match[2] ?? '' ) );
					$node->parentNode->insertBefore( $newCell, $node->nextSibling );

					// Set data-parsoid noAttrs flag
					$newCellDP = DOMDataUtils::getDataParsoid( $newCell );
					$newCellDP->tmp->noAttrs = true;
				}
			}

			$child = $next;
		}

		return true;
	}
}