$value, 'srcOffsets' => new SourceRange( $start, $end ) ];
	}

	/**
	 * Build the token list for a table-syntax element: a start tag, the
	 * supplied content, and either a real end tag or a TSR marker meta tag.
	 *
	 * @param string $tagName Name used for the start tag and for the end tag
	 *   (or recorded in the marker's `data-etag` attribute).
	 * @param string $wtChar Prefix prepended to $attrInfo[1] when recording
	 *   `startTagSrc` for an empty attribute list.
	 * @param mixed $attrInfo Falsy when no attribute block was seen. Otherwise
	 *   indexed as: [0] attribute list for the start tag, [1] source text
	 *   recorded (after $wtChar) when [0] is empty, [2] separator source.
	 * @param SourceRange $tsr Source range stored on the start tag.
	 * @param int $endPos Offset used for the zero-width range of the end
	 *   tag / marker.
	 * @param array $content Tokens placed between the start and end tokens
	 *   (passed straight to array_merge()).
	 * @param bool $addEndTag Emit an EndTagTk instead of the marker meta tag.
	 * @return array
	 */
	public static function buildTableTokens(
		string $tagName, string $wtChar, $attrInfo, SourceRange $tsr,
		int $endPos, $content, bool $addEndTag = false
	): array {
		$dp = (object)[ 'tsr' => $tsr ];
		if ( !$attrInfo ) {
			$a = [];
			if ( $tagName === 'td' || $tagName === 'th' ) {
				// Add a flag that indicates that the tokenizer didn't
				// encounter a "|...|" attribute box. This is useful when
				// deciding which td/th cells need attribute fixups.
				$dp->tmp = PHPUtils::arrayToObject( [ 'noAttrs' => true ] );
			}
		} else {
			$a = $attrInfo[0];
			if ( count( $a ) === 0 ) {
				$dp->startTagSrc = $wtChar . $attrInfo[1];
			}
			if ( ( count( $a ) === 0 && $attrInfo[2] ) || $attrInfo[2] !== '|' ) {
				// Variation from default
				// 1. Separator present with an empty attribute block
				// 2. Not "|"
				$dp->attrSepSrc = $attrInfo[2];
			}
		}

		// The closing token gets a zero-width source range at $endPos.
		$dataAttribs = (object)[ 'tsr' => new SourceRange( $endPos, $endPos ) ];
		if ( $addEndTag ) {
			$endTag = new EndTagTk( $tagName, [], $dataAttribs );
		} else {
			// We rely on our tree builder to close the table cell (td/th) as needed.
			// We cannot close the cell here because cell content can come from
			// multiple parsing contexts and we cannot close the tag in the same
			// parsing context in which the td was opened:
			// Ex: {{1x|{{!}}foo}}{{1x|bar}} has to output a single cell
			// containing "foobar".
			//
			// But, add a marker meta-tag to capture tsr info.
			// SSS FIXME: Unsure if this is actually helpful, but adding it in just in case.
			// Can test later and strip it out if it doesn't make any diff to roundtripping.
			$endTag = new SelfclosingTagTk(
				'meta',
				[
					new KV( 'typeof', 'mw:TSRMarker' ),
					new KV( 'data-etag', $tagName )
				],
				$dataAttribs
			);
		}

		return array_merge( [ new TagTk( $tagName, $a, $dp ) ], $content, [ $endTag ] );
	}

	/**
	 * Build the token for an HTML-syntax tag.
	 *
	 * @param string $name Tag name as written in the source.
	 * @param string $lcName Name given to the token; when it differs from
	 *   $name, $name is preserved in `srcTagName`.
	 * @param array $attribs Attributes passed to the token.
	 * @param mixed $endTag Any non-null value produces an EndTagTk.
	 * @param bool $selfClose Produce a SelfclosingTagTk (only consulted when
	 *   $endTag is null).
	 * @param SourceRange $tsr Source range stored on the token.
	 * @return EndTagTk|SelfclosingTagTk|TagTk
	 */
	public static function buildXMLTag(
		string $name, string $lcName, array $attribs, $endTag,
		bool $selfClose, SourceRange $tsr
	) {
		$da = (object)[ 'tsr' => $tsr, 'stx' => 'html' ];

		if ( $name !== $lcName ) {
			$da->srcTagName = $name;
		}

		if ( $endTag !== null ) {
			return new EndTagTk( $lcName, $attribs, $da );
		}
		if ( $selfClose ) {
			$da->selfClose = true;
			return new SelfclosingTagTk( $lcName, $attribs, $da );
		}
		return new TagTk( $lcName, $attribs, $da );
	}

	/**
	 * Inline breaks, flag-enabled rule which detects end positions for
	 * active higher-level rules in inline and other nested rules.
	 * Those inner rules are then exited, so that the outer rule can
	 * handle the end marker.
	 *
	 * @param string $input Text being tokenized.
	 * @param int $pos Offset of the character to test; must be a valid
	 *   offset into $input.
	 * @param array $stops Flag map. Keys read here (without isset checks):
	 *   arrow, equal, h, templateArg, extTag, tableCellArg, linkdesc, table,
	 *   th, templatedepth, preproc, colon, extlink, semicolon.
	 * @return bool|mixed Truthy when the character at $pos ends an active
	 *   rule. Most cases yield a strict bool; the ';' case returns the raw
	 *   `semicolon` flag value.
	 * @throws \Exception When $input[$pos] is not one of the handled characters.
	 */
	public static function inlineBreaks( string $input, int $pos, array $stops ) {
		$c = $input[$pos];
		$c2 = $input[$pos + 1] ?? '';

		switch ( $c ) {
			case '=':
				if ( $stops['arrow'] && $c2 === '>' ) {
					return true;
				}
				return $stops['equal'] ||
					$stops['h'] && (
						$pos === strlen( $input ) - 1
						// possibly more equals followed by spaces or comments
						|| preg_match(
							'/^=*(?:[ \t]|<\!--(?:(?!-->).)*-->)*(?:[\r\n]|$)/sD',
							substr( $input, $pos + 1 )
						)
					);

			case '|':
				return ( $stops['templateArg'] && !$stops['extTag'] ) ||
					$stops['tableCellArg'] ||
					$stops['linkdesc'] ||
					( $stops['table'] &&
						$pos < strlen( $input ) - 1 &&
						preg_match( '/[}|]/', $input[$pos + 1] ) );

			case '!':
				return $stops['th'] && !$stops['templatedepth'] && $c2 === '!';

			case '{':
				// {{!}} pipe templates..
				// FIXME: Presumably these should mix with and match | above.
				// phpcs:ignore Squiz.WhiteSpace.LanguageConstructSpacing.IncorrectSingle
				return ( $stops['tableCellArg'] && substr( $input, $pos, 5 ) === '{{!}}' ) ||
					( $stops['table'] && substr( $input, $pos, 10 ) === '{{!}}{{!}}' );

			case '}':
				$preproc = $stops['preproc'];
				return ( $c2 === '}' && $preproc === '}}' ) ||
					( $c2 === '-' && $preproc === '}-' );

			case ':':
				return $stops['colon'] &&
					!$stops['extlink'] &&
					!$stops['templatedepth'] &&
					!$stops['linkdesc'] &&
					!( $stops['preproc'] === '}-' );

			case ';':
				return $stops['semicolon'];

			case "\r":
				// Anchor the match at $pos (the "A" modifier together with the
				// offset argument). An unanchored search could be satisfied by
				// a *later* "\r" followed by a cell marker, wrongly reporting a
				// break here. This mirrors the anchored "\n" case below and
				// avoids copying the rest of the input with substr().
				return $stops['table'] &&
					preg_match( '/\r\n?\s*[!|]/A', $input, $m, 0, $pos );

			case "\n":
				// The code below is just a manual / efficient
				// version of this check.
				//
				// stops.table && /^\n\s*[!|]/.test(input.substr(pos));
				//
				// It eliminates a substr on the string and eliminates
				// a potential perf problem since "\n" and the inline_breaks
				// test is common during tokenization.
				if ( !$stops['table'] ) {
					return false;
				}

				// Allow leading whitespace in tables

				// Since we switched on 'c' which is input[pos],
				// we know that input[pos] is "\n".
				// So, the /^\n/ part of the regexp is already satisfied.
				// Look for /\s*[!|]/ below.
				$n = strlen( $input );
				for ( $i = $pos + 1; $i < $n; $i++ ) {
					$d = $input[$i];
					if ( $d === '!' || $d === '|' ) {
						return true;
					} elseif ( !preg_match( '/\s/', $d ) ) {
						return false;
					}
				}
				return false;

			case '[':
				// This is a special case in php's doTableStuff, added in
				// response to T2553.  If it encounters a `[[`, it bails on
				// parsing attributes and interprets it all as content.
				return $stops['tableCellArg'] && $c2 === '[';

			case '-':
				// Same as above: a special case in doTableStuff, added
				// as part of T153140
				return $stops['tableCellArg'] && $c2 === '{';

			case ']':
				if ( $stops['extlink'] ) {
					return true;
				}
				return $stops['preproc'] === ']]' && $c2 === ']';

			default:
				throw new \Exception( 'Unhandled case!' );
		}
	}

	/**
	 * Pop off the end comments, if any.
	 *
	 * Walks $attrs from the end, collecting entries that have a falsy value
	 * and whose key is either a whitespace-only string or an array made up
	 * solely of CommentTk tokens. Leading non-comment entries are then
	 * dropped from the collected buffer so that it starts with a comment.
	 *
	 * NOTE(review): the number of entries removed from $attrs is the size of
	 * the flattened buffer, which equals the number of attributes consumed
	 * only if every array key holds exactly one comment — confirm against
	 * the tokenizer's output.
	 *
	 * @param array &$attrs Attribute list; matching trailing entries are
	 *   removed in place.
	 * @return array|null Null when no trailing comment was found; otherwise
	 *   [ 'buf' => collected strings/comments, 'commentStartPos' => start
	 *   offset of the first comment's tsr ].
	 */
	public static function popComments( array &$attrs ) {
		$buf = [];
		for ( $i = count( $attrs ) - 1; $i > -1; $i-- ) {
			$kv = $attrs[$i];
			if ( is_string( $kv->k ) && !$kv->v && preg_match( '/^\s*$/D', $kv->k ) ) {
				// permit whitespace
				array_unshift( $buf, $kv->k );
			} elseif ( is_array( $kv->k ) && !$kv->v ) {
				// all should be comments
				foreach ( $kv->k as $k ) {
					if ( !( $k instanceof CommentTk ) ) {
						// Leaves both the foreach and the enclosing for loop.
						break 2;
					}
				}
				array_splice( $buf, 0, 0, $kv->k );
			} else {
				break;
			}
		}

		// ensure we found a comment
		while ( count( $buf ) && !( $buf[0] instanceof CommentTk ) ) {
			array_shift( $buf );
		}

		if ( count( $buf ) ) {
			array_splice( $attrs, -count( $buf ), count( $buf ) );
			return [
				'buf' => $buf,
				'commentStartPos' => $buf[0]->dataAttribs->tsr->start
			];
		} else {
			return null;
		}
	}

	/**
	 * Record resource usage for start and self-closing tag tokens whose
	 * name is tracked: listItem, template (counted as 'transclusion'),
	 * td and th (both counted as 'tableCell'). Other tokens are ignored.
	 *
	 * @param Env $env Receives the bumpWt2HtmlResourceUse() call.
	 * @param mixed $token Token to inspect; may be falsy.
	 */
	public static function enforceParserResourceLimits( Env $env, $token ) {
		if ( $token && ( $token instanceof TagTk || $token instanceof SelfclosingTagTk ) ) {
			switch ( $token->getName() ) {
				case 'listItem':
					$env->bumpWt2HtmlResourceUse( 'listItem' );
					break;
				case 'template':
					$env->bumpWt2HtmlResourceUse( 'transclusion' );
					break;
				case 'td':
				case 'th':
					$env->bumpWt2HtmlResourceUse( 'tableCell' );
					break;
			}
		}
	}

	/**
	 * Prefix reserved attribute names with "data-x-".
	 *
	 * Names matching (case-insensitively) about, data-mw*, data-parsoid*,
	 * data-x*, DOMDataUtils::DATA_OBJECT_ATTR_NAME, property, rel or typeof
	 * are rewritten; any other name is returned unchanged.
	 *
	 * @param string $name Attribute name.
	 * @return string
	 */
	public static function protectAttrs( string $name ) {
		// Build the pattern once and cache it on the class.
		if ( self::$protectAttrsRegExp === null ) {
			self::$protectAttrsRegExp = "/^(about|data-mw.*|data-parsoid.*|data-x.*|" .
				DOMDataUtils::DATA_OBJECT_ATTR_NAME .
				'|property|rel|typeof)$/i';
		}
		return preg_replace( self::$protectAttrsRegExp, 'data-x-$1', $name );
	}

	/**
	 * Whether $name is one of includeonly, noinclude or onlyinclude
	 * (exact, case-sensitive comparison).
	 *
	 * @param mixed $name
	 * @return bool
	 */
	public static function isIncludeTag( $name ) {
		return $name === 'includeonly' ||
			$name === 'noinclude' ||
			$name === 'onlyinclude';
	}
}