true,
		"i" => true,
		"h1" => true,
		"h2" => true,
		"h3" => true,
		"h4" => true,
		"h5" => true,
		"h6" => true,
		"ul" => true,
		"ol" => true,
		"dl" => true,
		"li" => true,
		"dt" => true,
		"dd" => true,
		"table" => true,
		"caption" => true,
		"tr" => true,
		"td" => true,
		"th" => true,
		"hr" => true, // void element
		"br" => true, // void element
		"pre" => true,
	];

	/**
	 * Do $parsoidData->tsr values span the entire DOM subtree rooted at $n?
	 *
	 * The answer is "no" (false) when any of the following holds:
	 * - the tag name is listed in self::$WtTagsWithLimitedTSR
	 *   (tags known to have tag-specific tsr),
	 * - the node has a typeof of mw:Placeholder or mw:LanguageVariant
	 *   (Parsoid-generated constructs),
	 * - the node carries the literal-HTML marker in its data-parsoid
	 *   (html tags with 'stx' set).
	 *
	 * @param DOMElement $n
	 * @param stdClass $parsoidData
	 * @return bool
	 */
	private function tsrSpansTagDOM( DOMElement $n, stdClass $parsoidData ): bool {
		$name = $n->nodeName;
		return !(
			isset( self::$WtTagsWithLimitedTSR[$name] ) ||
			DOMUtils::matchTypeOf( $n, '/^mw:(Placeholder|LanguageVariant)$/D' ) ||
			WTUtils::hasLiteralHTMLMarker( $parsoidData )
		);
	}

	/**
	 * Is the inconsistency between two different ways of computing
	 * start offset ($cs, $s) explainable and acceptable?
	 * If so, we can suppress warnings.
	 *
	 * Note: $cs and $s are accepted for the caller's convenience but the
	 * decision is currently made from $opts and $node alone.
	 *
	 * @param array $opts
	 * @param DOMNode $node
	 * @param int $cs
	 * @param int $s
	 * @return bool
	 */
	private function acceptableInconsistency(
		array $opts, DOMNode $node, int $cs, int $s
	): bool {
		/**
		 * 1. For wikitext URL links, suppress cs-s diff warnings because
		 *    the diffs can come about because of various reasons since the
		 *    canonicalized/decoded href will become the a-link text whose width
		 *    will not match the tsr width of source wikitext
		 *
		 *    (a) urls with encoded chars (ex: 'http://example.com/?foo=bar')
		 *    (b) non-canonical spaces (ex: more than one space between
		 *        'RFC' and '123' instead of 'RFC 123')
		 *
		 * 2. We currently dont have source offsets for attributes.
		 *    So, we get a lot of spurious complaints about cs/s mismatch
		 *    when DSR computation hit the <body> tag on this attribute.
		 *    $opts['attrExpansion'] tell us when we are processing an attribute
		 *    and let us suppress the mismatch warning on the <body> tag.
		 *
		 * 3. Other scenarios .. to be added
		 */
		if (
			$node->nodeName === 'a' &&
			DOMUtils::assertElt( $node ) &&
			(
				WTUtils::usesURLLinkSyntax( $node, null ) ||
				WTUtils::usesMagicLinkSyntax( $node, null )
			)
		) {
			// Case 1: URL link / magic link
			return true;
		}

		// Case 2: attribute expansion that reached the <body> node
		return isset( $opts['attrExpansion'] ) && DOMUtils::isBody( $node );
	}

	/**
	 * Compute wikitext string length that contributes to this
	 * list item's open tag. Closing tag width is always 0 for lists.
	 *
	 * The width is the number of consecutive li/dd ancestors reached by
	 * repeatedly stepping two levels up (item -> list -> enclosing item).
	 *
	 * @param DOMNode $li
	 * @return int
	 */
	private function computeListEltWidth( DOMNode $li ): int {
		if (
			!$li->previousSibling &&
			$li->firstChild &&
			DOMUtils::isList( $li->firstChild )
		) {
			// Special case!!
			// First child of a list that is on a chain
			// of nested lists doesn't get a width.
			return 0;
		}

		// count nest listing depth and assign
		// that to the opening tag width.
		$depth = 0;
		$item = $li;
		while (
			$item !== null &&
			( $item->nodeName === 'li' || $item->nodeName === 'dd' )
		) {
			$depth++;
			// Null-safe walk: a detached item (no parent list, or a list
			// without a parent) simply ends the chain instead of
			// dereferencing null.
			$item = $item->parentNode->parentNode ?? null;
		}

		return $depth;
	}

	/**
	 * Compute wikitext string lengths that contribute to this
	 * anchor's opening (<a>) and closing (</a>) tags.
	 *
	 * @param DOMElement $node
	 * @param stdClass|null $dp
	 * @return int[]|null [ open-tag width, close-tag width ], or null when
	 *   the widths cannot be determined
	 */
	private function computeATagWidth(
		DOMElement $node, ?stdClass $dp
	): ?array {
		/* -------------------------------------------------------------
		 * Tag widths are computed as per this logic here:
		 *
		 * 1. [[Foo|bar]] <-- piped mw:WikiLink
		 *     -> start-tag: "[[Foo|"
		 *     -> content  : "bar"
		 *     -> end-tag  : "]]"
		 *
		 * 2. [[Foo]] <-- non-piped mw:WikiLink
		 *     -> start-tag: "[["
		 *     -> content  : "Foo"
		 *     -> end-tag  : "]]"
		 *
		 * 3. [[{{1x|Foo}}|Foo]] <-- tpl-attr mw:WikiLink
		 *    Dont bother setting tag widths since dp->sa['href'] will be
		 *    the expanded target and won't correspond to original source.
		 *    We dont always have access to the meta-tag that has the source.
		 *
		 * 4. [http://wp.org foo] <-- mw:ExtLink
		 *     -> start-tag: "[http://wp.org "
		 *     -> content  : "foo"
		 *     -> end-tag  : "]"
		 * -------------------------------------------------------------- */
		if ( !$dp ) {
			return null;
		}

		if (
			WTUtils::usesWikiLinkSyntax( $node, $dp ) &&
			!WTUtils::hasExpandedAttrsType( $node )
		) {
			if ( isset( $dp->stx ) && $dp->stx === "piped" ) {
				// this seems like some kind of a phan bug
				$href = $dp->sa['href'] ?? null;
				// Explicit non-empty-string check: a plain truthiness test
				// would wrongly reject the target "0" (as in [[0|zero]]),
				// because the string "0" is falsy in PHP.
				if ( is_string( $href ) && $href !== '' ) {
					// "[[" + href + "|" for the start tag, "]]" for the end tag
					return [ strlen( $href ) + 3, 2 ];
				}
				return null;
			}
			return [ 2, 2 ];
		}

		if ( isset( $dp->tsr ) && WTUtils::usesExtLinkSyntax( $node, $dp ) ) {
			// Start tag runs from the tsr start up to where the link
			// content starts; the end tag is the single "]".
			return [ $dp->extLinkContentOffsets->start - $dp->tsr->start, 1 ];
		}

		if (
			WTUtils::usesURLLinkSyntax( $node, $dp ) ||
			WTUtils::usesMagicLinkSyntax( $node, $dp )
		) {
			return [ 0, 0 ];
		}

		return null;
	}

	/**
	 * Compute wikitext string lengths that contribute to this
	 * node's opening and closing tags.
	 *
	 * @param (int|null)[] $widths [ start-tag width, end-tag width ]; an
	 *   entry is null when that width is not yet known
	 * @param DOMElement $node
	 * @param stdClass $dp data-parsoid of $node
	 * @return (int|null)[] [ start-tag width, end-tag width ]; an entry
	 *   stays null when it could not be determined
	 */
	private function computeTagWidths(
		array $widths, DOMElement $node, stdClass $dp
	): array {
		// Recorded extension-tag offsets take precedence over everything else.
		if ( isset( $dp->extTagOffsets ) ) {
			return [
				$dp->extTagOffsets->openWidth,
				$dp->extTagOffsets->closeWidth
			];
		}

		$stWidth = $widths[0];
		$etWidth = $widths[1];

		if ( WTUtils::hasLiteralHTMLMarker( $dp ) ) {
			if ( !empty( $dp->selfClose ) ) {
				$etWidth = 0;
			}
		} elseif ( DOMUtils::hasTypeOf( $node, 'mw:LanguageVariant' ) ) {
			$stWidth = 2; // -{
			$etWidth = 2; // }-
		} else {
			$nodeName = $node->nodeName;
			// 'tr' tags not in the original source have zero width
			if ( $nodeName === 'tr' && !isset( $dp->startTagSrc ) ) {
				$stWidth = 0;
				$etWidth = 0;
			} else {
				$wtTagWidth = Consts::$WtTagWidths[$nodeName] ?? null;
				if ( $stWidth === null ) {
					// we didn't have a tsr to tell us how wide this tag was.
					if ( $nodeName === 'a' ) {
						DOMUtils::assertElt( $node );
						// For anchors the computed widths replace the table
						// lookup, so they also feed the end-tag fallback below.
						$wtTagWidth = $this->computeATagWidth( $node, $dp );
						$stWidth = $wtTagWidth ? $wtTagWidth[0] : null;
					} elseif ( $nodeName === 'li' || $nodeName === 'dd' ) {
						DOMUtils::assertElt( $node );
						$stWidth = $this->computeListEltWidth( $node );
					} elseif ( $wtTagWidth ) {
						$stWidth = $wtTagWidth[0];
					}
				}

				if ( $etWidth === null && $wtTagWidth ) {
					$etWidth = $wtTagWidth[1];
				}
			}
		}

		return [ $stWidth, $etWidth ];
	}

	/**
	 * Emit a message on the "trace/dsr" log channel.
	 *
	 * The message is the concatenation of all $args: strings are used
	 * verbatim, everything else is JSON-encoded. The message is built inside
	 * a closure handed to $env->log() (presumably so that it is only built
	 * when that channel is actually enabled -- confirm against Env::log).
	 *
	 * @param Env $env
	 * @param mixed ...$args
	 */
	private function trace( Env $env, ...$args ): void {
		$env->log( "trace/dsr", function () use ( $args ) {
			$buf = '';
			foreach ( $args as $arg ) {
				$buf .= is_string( $arg ) ? $arg : PHPUtils::jsonEncode( $arg );
			}
			return $buf;
		} );
	}

	/**
	 * TSR = "Tag Source Range". Start and end offsets giving the location
	 * where the tag showed up in the original source.
	 *
	 * DSR = "DOM Source Range". dsr->start and dsr->end are open and end,
	 * dsr->openWidth and dsr->closeWidth are widths of the container tag.
	 *
	 * TSR is set by the tokenizer. In most cases, it only applies to the
	 * specific tag (opening or closing). However, for self-closing
	 * tags that the tokenizer generates, the TSR values applies to the entire
	 * DOM subtree (opening tag + content + closing tag).
	 *
	 * Ex: So [[foo]] will get tokenized to a SelfClosingTagTk(...) with a TSR
	 * value of [0,7]. The DSR algorithm will then use that info and assign
	 * the a-tag rooted at the foo DOM subtree a DSR value of
	 * [0,7,2,2], where 2 and 2 refer to the opening and closing tag widths.
* * [s,e) -- if defined, start/end position of wikitext source that generated * node's subtree * * @param Frame $frame * @param DOMNode $node node to process * @param int|null $s start position, inclusive * @param int|null $e end position, exclusive * @param int $dsrCorrection * @param array $opts * @return array */ private function computeNodeDSR( Frame $frame, DOMNode $node, ?int $s, ?int $e, int $dsrCorrection, array $opts ): array { $env = $frame->getEnv(); if ( $e === null && !$node->hasChildNodes() ) { $e = $s; } $this->trace( $env, "BEG: ", $node->nodeName, " with [s, e]=", [ $s, $e ] ); $savedEndTagWidth = null; $ce = $e; // Initialize $cs to $ce to handle the zero-children case properly // if this $node has no child content, then the start and end for // the child dom are indeed identical. Alternatively, we could // explicitly code this check before everything and bypass this. $cs = $ce; $rtTestMode = $env->getSiteConfig()->rtTestMode(); $child = $node->lastChild; while ( $child !== null ) { $prevChild = $child->previousSibling; $isMarkerTag = false; $origCE = $ce; $cType = $child->nodeType; $endTagInfo = null; $fosteredNode = false; $cs = null; // In edit mode, StrippedTag marker tags will be removed and wont // be around to miss in the filling gap. So, absorb its width into // the DSR of its previous sibling. Currently, this fix is only for // B and I tags where the fix is clear-cut and obvious. if ( !$rtTestMode ) { $next = $child->nextSibling; if ( $next && ( $next instanceof DOMElement ) ) { $ndp = DOMDataUtils::getDataParsoid( $next ); if ( isset( $ndp->src ) && DOMUtils::hasTypeOf( $next, 'mw:Placeholder/StrippedTag' ) ) { if ( isset( Consts::$WTQuoteTags[$ndp->name] ) && isset( Consts::$WTQuoteTags[$child->nodeName] ) ) { $correction = strlen( $ndp->src ); $ce += $correction; $dsrCorrection = $correction; if ( Utils::isValidDSR( $ndp->dsr ?? 
null ) ) { // Record original DSR for the meta tag // since it will now get corrected to zero width // since child acquires its width-> if ( !$ndp->tmp ) { $ndp->tmp = []; } $ndp->tmp->origDSR = new DomSourceRange( $ndp->dsr->start, $ndp->dsr->end, null, null ); } } } } } $env->log( "trace/dsr", function () use ( $child, $cs, $ce ) { // slow, for debugging only $i = 0; foreach ( $child->parentNode->childNodes as $x ) { if ( $x === $child ) { break; } $i++; } return " CHILD: <" . $child->parentNode->nodeName . ":" . $i . ">=" . ( $child instanceof DOMElement ? '' : ( DOMUtils::isText( $child ) ? '#' : '!' ) ) . ( ( $child instanceof DOMElement ) ? ( $child->nodeName === 'meta' ? DOMCompat::getOuterHTML( $child ) : $child->nodeName ) : PHPUtils::jsonEncode( $child->nodeValue ) ) . " with " . PHPUtils::jsonEncode( [ $cs, $ce ] ); } ); if ( $cType === XML_TEXT_NODE ) { if ( $ce !== null ) { // This code is replicated below. Keep both in sync. $cs = $ce - strlen( $child->textContent ) - WTUtils::indentPreDSRCorrection( $child ); } } elseif ( $cType === XML_COMMENT_NODE ) { '@phan-var \DOMComment $child'; // @var \DOMComment $child if ( $ce !== null ) { // Decode HTML entities & re-encode as wikitext to find length $cs = $ce - WTUtils::decodedCommentLength( $child ); } } elseif ( $cType === XML_ELEMENT_NODE ) { DOMUtils::assertElt( $child ); $dp = DOMDataUtils::getDataParsoid( $child ); $tsr = $dp->tsr ?? null; $oldCE = $tsr ? $tsr->end : null; $propagateRight = false; $stWidth = null; $etWidth = null; $fosteredNode = $dp->fostered ?? false; // In edit-mode, we are making dsr corrections to account for // stripped tags (end tags usually). When stripping happens, // in most common use cases, a corresponding end tag is added // back elsewhere in the DOM. // // So, when an autoInsertedEnd tag is encountered and a matching // dsr-correction is found, make a 1-time correction in the // other direction. 
// // Currently, this fix is only for // B and I tags where the fix is clear-cut and obvious. if ( !$rtTestMode && $ce !== null && !empty( $dp->autoInsertedEnd ) && DOMUtils::isQuoteElt( $child ) ) { $correction = 3 + strlen( $child->nodeName ); if ( $correction === $dsrCorrection ) { $ce -= $correction; $dsrCorrection = 0; } } if ( $child->nodeName === "meta" ) { // Unless they have been foster-parented, // meta marker tags have valid tsr info-> if ( DOMUtils::matchTypeOf( $child, '#^mw:(EndTag|TSRMarker)$#D' ) ) { if ( DOMUtils::hasTypeOf( $child, "mw:EndTag" ) ) { // FIXME: This seems like a different function that is // tacked onto DSR computation, but there is no clean place // to do this one-off thing without doing yet another pass // over the DOM -- maybe we need a 'do-misc-things-pass'. // // Update table-end syntax using info from the meta tag $prev = $child->previousSibling; if ( $prev && $prev->nodeName === "table" ) { DOMUtils::assertElt( $prev ); $prevDP = DOMDataUtils::getDataParsoid( $prev ); if ( !WTUtils::hasLiteralHTMLMarker( $prevDP ) ) { if ( isset( $dp->endTagSrc ) ) { $prevDP->endTagSrc = $dp->endTagSrc; } } } } $isMarkerTag = true; // TSR info will be absent if the tsr-marker came // from a template since template tokens have all // their tsr info-> stripped-> if ( $tsr ) { $endTagInfo = [ 'width' => $tsr->end - $tsr->start, 'nodeName' => $child->getAttribute( "data-etag" ), ]; $cs = $tsr->end; $ce = $tsr->end; $propagateRight = true; } } elseif ( $tsr ) { if ( WTUtils::isTplMarkerMeta( $child ) ) { // If this is a meta-marker tag (for templates, extensions), // we have a new valid '$cs'. This marker also effectively resets tsr // back to the top-level wikitext source range from nested template // source range. $cs = $tsr->start; $ce = $tsr->end; $propagateRight = true; } else { // All other meta-tags: