*/
	private $textContentBuffer;

	/** @var bool */
	private $needTransclusionShadow;

	/**
	 * Build the stage and put it into its initial state.
	 *
	 * @param Env $env
	 * @param array $options Not read by this constructor
	 * @param string $stageId Not read by this constructor
	 * @param PipelineStage|null $prevStage
	 */
	public function __construct(
		Env $env, array $options = [], string $stageId = "", $prevStage = null
	) {
		parent::__construct( $env, $prevStage );

		$this->traceTime = $env->hasTraceFlag( 'time' );

		// Reset variable state and set up the parser
		$this->resetState( [] );
	}

	/**
	 * @inheritDoc
	 */
	public function resetState( array $options ): void {
		// Reset vars
		$this->tagId = 1; // Assigned to start/self-closing tags
		$this->inTransclusion = false;
		$this->bag = new DataBag();

		// Forget the final token of any previous run. finalizeDOM() inspects
		// it, and a stale value on a reused stage would make the
		// "EOFTk was lost" check judge the wrong document.
		$this->lastToken = null;

		/* --------------------------------------------------------------------
		 * Crude tracking of whether we are in a table
		 *
		 * The only requirement for correctness of detecting fostering content
		 * is that as long as there is an unclosed <table> tag, this value
		 * is positive.
		 *
		 * We can ensure that by making sure that independent of how many
		 * excess </table> tags we run into, this value is never negative.
		 *
		 * So, since this.tableDepth >= 0 always, whenever a <table> tag is seen,
		 * this.tableDepth >= 1 always, and our requirement is met.
		 * -------------------------------------------------------------------- */
		$this->tableDepth = 0;

		// We only need one for every run of strings and newline tokens.
		$this->needTransclusionShadow = false;

		$this->domBuilder = new DOMBuilder( [ 'suppressHtmlNamespace' => true ] );
		$treeBuilder = new TreeBuilder( $this->domBuilder );
		$this->dispatcher = new Dispatcher( $treeBuilder );

		// PORT-FIXME: Necessary to setEnableCdataCallback
		$tokenizer = new Tokenizer( $this->dispatcher, '', [ 'ignoreErrors' => true ] );

		// Open the document and the <body> so that tokens fed through
		// processToken() are built inside the body element.
		$this->dispatcher->startDocument( $tokenizer, null, null );
		$this->dispatcher->doctype( 'html', '', '', false, 0, 0 );
		$this->dispatcher->startTag( 'body', new PlainAttributes(), false, 0, 0 );
	}

	/**
	 * Process a chunk of tokens and feed it to the HTML5 tree builder.
	 * This doesn't return anything.
	 *
	 * When the 'time' trace flag is set, the time spent on the chunk is
	 * reported to the environment under 'HTML5 TreeBuilder'.
	 *
	 * @param array $tokens Array of tokens to process
	 */
	public function processChunk( array $tokens ): void {
		$s = null;
		if ( $this->traceTime ) {
			$s = PHPUtils::getStartHRTime();
		}

		// foreach rather than an index loop: works for any array keys and
		// still visits the tokens in their stored order.
		foreach ( $tokens as $token ) {
			$this->processToken( $token );
		}

		if ( $this->traceTime ) {
			$this->env->bumpTimeUse(
				'HTML5 TreeBuilder', PHPUtils::getHRTimeDifferential( $s ), 'HTML5'
			);
		}
	}

	/**
	 * @inheritDoc
	 */
	public function finalizeDOM() {
		// Check if the EOFTk actually made it all the way through, and flag the
		// page where it did not!
		if ( isset( $this->lastToken ) && !( $this->lastToken instanceof EOFTk ) ) {
			$this->env->log(
				'error', 'EOFTk was lost in page',
				$this->env->getPageConfig()->getTitle()
			);
		}

		$doc = $this->domBuilder->getFragment();
		'@phan-var \DOMDocument $doc'; // @var \DOMDocument $doc

		// Special case where we can't call `env.createDocument()`
		$this->env->referenceDataObject( $doc, $this->bag );

		// Preparing the DOM is considered one "unit" with treebuilding,
		// so traversing is done here rather than during post-processing.
		//
		// Necessary when testing the port, since:
		// - de-duplicating data-object-ids must be done before we can store
		// data-attributes to cross language barriers;
		// - the calls to fosterCommentData below are storing data-object-ids,
		// which must be reinserted, again before storing ...
		$seenDataIds = [];
		$t = new DOMTraverser();
		// $seenDataIds is captured by reference so that every handler
		// invocation shares (and can update) the same array.
		$t->addHandler( null, static function ( ...$args ) use ( &$seenDataIds ) {
			return PrepareDOM::handler( $seenDataIds, ...$args );
		} );
		$t->traverse( $this->env, DOMCompat::getBody( $doc ), [], false, null );

		// PORT-FIXME: Are we reusing this? Switch to `init()`
		// $this->resetState([]);

		return $doc;
	}

	/**
	 * Turn a list of KV objects into a name => value map.
	 * If a key occurs more than once, the last occurrence wins.
	 *
	 * @param array $maybeAttribs Objects exposing `k` and `v` properties
	 * @return array
	 */
	private function kvArrToAttr( array $maybeAttribs ): array {
		$map = [];
		foreach ( $maybeAttribs as $kv ) {
			$map[$kv->k] = $kv->v;
		}
		return $map;
	}

	/**
	 * Turn a list of KV objects into a list of [ key, value ] pairs.
	 *
	 * The result is always a 0-based list regardless of the input's keys.
	 * NOTE(review): presumably so the pairs serialize as a JSON array in
	 * fosterCommentData, whose implementation is not visible here.
	 *
	 * @param array $maybeAttribs Objects exposing `k` and `v` properties
	 * @return array
	 */
	private function kvArrToFoster( array $maybeAttribs ): array {
		$pairs = [];
		foreach ( $maybeAttribs as $kv ) {
			$pairs[] = [ $kv->k, $kv->v ];
		}
		return $pairs;
	}

	/**
	 * Keep this in sync with `DOMDataUtils.setNodeData()`
	 *
	 * Stashes $dataAttribs (plus the decoded value of any `data-mw`
	 * attribute) in the data bag and returns the attributes with `data-mw`
	 * removed and a data-object-id attribute appended in its place.
	 *
	 * @param array $attribs KV attributes
	 * @param object $dataAttribs
	 * @return array Re-indexed list of KV attributes
	 */
	public function stashDataAttribs( array $attribs, object $dataAttribs ): array {
		$data = [ 'parsoid' => $dataAttribs ];

		// array_filter() preserves keys, so dropping a data-mw entry from the
		// middle would leave a gap; array_values() restores a proper list.
		$attribs = array_values( array_filter(
			$attribs,
			static function ( $attr ) use ( &$data ) {
				if ( $attr->k === 'data-mw' ) {
					Assert::invariant( !isset( $data['mw'] ), "data-mw already set." );
					$data['mw'] = json_decode( $attr->v );
					return false;
				}
				return true;
			}
		) );

		$docId = $this->bag->stashObject( (object)$data );
		$attribs[] = new KV( DOMDataUtils::DATA_OBJECT_ATTR_NAME, (string)$docId );

		return $attribs;
	}

	/**
	 * Adapt the token format to internal HTML tree builder format, call the actual
	 * html tree builder by emitting the token.
	 *
	 * Besides forwarding the token itself, this also emits:
	 * - a `mw:TransclusionShadow` meta after a run of string tokens seen while
	 *   inside both a transclusion and a table;
	 * - a `mw:FosterBox` table before every table opened outside a transclusion;
	 * - `mw:shadow` comments for start/end tags that were not auto-inserted.
	 *
	 * @param Token|string $token
	 */
	public function processToken( $token ): void {
		if ( $this->pipelineId === 0 ) {
			$this->env->bumpWt2HtmlResourceUse( 'token' );
		}

		// For plain strings these property reads fall through to the defaults.
		$attribs = $token->attribs ?? [];
		$dataAttribs = $token->dataAttribs ?? (object)[ 'tmp' => new stdClass ];

		if ( !isset( $dataAttribs->tmp ) ) {
			$dataAttribs->tmp = new stdClass;
		}

		if ( $this->inTransclusion ) {
			$dataAttribs->tmp->inTransclusion = true;
		}

		// Assign tagId to open/self-closing tags
		if ( $token instanceof TagTk || $token instanceof SelfclosingTagTk ) {
			$dataAttribs->tmp->tagId = $this->tagId++;
		}

		$attribs = $this->stashDataAttribs( $attribs, $dataAttribs );

		$this->env->log( 'trace/html', $this->pipelineId, static function () use ( $token ) {
			return PHPUtils::jsonEncode( $token );
		} );

		// Store the last token
		$this->lastToken = $token;

		$isText = is_string( $token ) || $token instanceof NlTk;

		// If we encountered a non-string non-nl token, we have broken a run of
		// string+nl content. If we need transclusion shadow protection, now's
		// the time to insert it.
		if ( !$isText && $this->needTransclusionShadow ) {
			$this->needTransclusionShadow = false;
			// If inside a table and a transclusion, add a meta tag after every
			// text node so that we can detect fostered content that came from
			// a transclusion.
			$this->env->log( 'debug/html', $this->pipelineId, 'Inserting shadow transclusion meta' );
			$this->dispatcher->startTag(
				'meta',
				new PlainAttributes( [ 'typeof' => 'mw:TransclusionShadow' ] ),
				true, 0, 0
			);
		}

		if ( $isText ) {
			$data = ( $token instanceof NlTk ) ? "\n" : $token;
			// The length handed to the dispatcher is a byte count (strlen).
			$this->dispatcher->characters( $data, 0, strlen( $data ), 0, 0 );
			// NlTks are only fostered when accompanied by non-whitespace.
			// Safe to ignore.
			if ( $this->inTransclusion && $this->tableDepth > 0 && is_string( $token ) ) {
				$this->needTransclusionShadow = true;
			}
		} elseif ( $token instanceof TagTk ) {
			$tName = $token->getName();
			if ( $tName === 'table' ) {
				$this->tableDepth++;
				// Don't add foster box in transclusion
				// Avoids unnecessary insertions, the case where a table
				// doesn't have tsr info, and the messy unbalanced table case,
				// like the navbox
				if ( !$this->inTransclusion ) {
					$this->env->log( 'debug/html', $this->pipelineId, 'Inserting foster box meta' );
					$this->dispatcher->startTag(
						'table',
						new PlainAttributes( [ 'typeof' => 'mw:FosterBox' ] ),
						false, 0, 0
					);
				}
			}

			$this->dispatcher->startTag(
				$tName, new PlainAttributes( $this->kvArrToAttr( $attribs ) ), false, 0, 0
			);

			if ( empty( $dataAttribs->autoInsertedStart ) ) {
				$this->env->log( 'debug/html', $this->pipelineId, 'Inserting shadow meta for', $tName );
				// The shadow gets its own stashed clone of the data attribs.
				$attrs = $this->stashDataAttribs(
					[
						new KV( 'typeof', 'mw:StartTag' ),
						new KV( 'data-stag', "{$tName}:{$dataAttribs->tmp->tagId}" )
					],
					Utils::clone( $dataAttribs )
				);
				$this->dispatcher->comment(
					WTUtils::fosterCommentData( 'mw:shadow', $this->kvArrToFoster( $attrs ), false ),
					0, 0
				);
			}
		} elseif ( $token instanceof SelfclosingTagTk ) {
			$tName = $token->getName();

			// Re-expand an empty-line meta-token into its constituent comment + WS tokens
			if ( TokenUtils::isEmptyLineMetaToken( $token ) ) {
				// Feed the tokens straight to processToken(): going through
				// processChunk() here would start a second timer inside the
				// interval the outer processChunk() is already measuring and
				// count that time twice when the 'time' trace flag is on.
				foreach ( $dataAttribs->tokens as $expandedToken ) {
					$this->processToken( $expandedToken );
				}
				return;
			}

			$wasInserted = false;

			// Convert mw metas to comments to avoid fostering.
			// But <*include*> metas, behavior switch metas
			// should be fostered since they end up generating
			// HTML content at the marker site.
			if ( $tName === 'meta' ) {
				$shouldFoster = TokenUtils::matchTypeOf(
					$token, '#^mw:Includes/(OnlyInclude|IncludeOnly|NoInclude)(/|$)#'
				);
				if ( !$shouldFoster ) {
					$prop = $token->getAttribute( 'property' ) ?: '';
					$shouldFoster = preg_match( '/^(mw:PageProp\/[a-zA-Z]*)\b/', $prop );
				}
				if ( !$shouldFoster ) {
					// transclusions state
					$transType = TokenUtils::matchTypeOf( $token, '#^mw:Transclusion#' );
					if ( $transType ) {
						// typeof starts with mw:Transclusion
						$this->inTransclusion = ( $transType === 'mw:Transclusion' );
					}
					$this->dispatcher->comment(
						WTUtils::fosterCommentData(
							$token->getAttribute( 'typeof' ) ?? '',
							$this->kvArrToFoster( $attribs ),
							false
						),
						0, 0
					);
					$wasInserted = true;
				}
			}

			if ( !$wasInserted ) {
				$this->dispatcher->startTag(
					$tName, new PlainAttributes( $this->kvArrToAttr( $attribs ) ), true, 0, 0
				);
				if ( !Utils::isVoidElement( $tName ) ) {
					// PORT-FIXME: startTag has a self-closed flag?
					// VOID_ELEMENTS are automagically treated as self-closing by
					// the tree builder
					$this->dispatcher->endTag( $tName, 0, 0 );
				}
			}
		} elseif ( $token instanceof EndTagTk ) {
			$tName = $token->getName();
			// Guarded so that excess </table> tags never drive the depth negative.
			if ( $tName === 'table' && $this->tableDepth > 0 ) {
				$this->tableDepth--;
			}
			$this->dispatcher->endTag( $tName, 0, 0 );
			if ( empty( $dataAttribs->autoInsertedEnd ) ) {
				$this->env->log( 'debug/html', $this->pipelineId, 'Inserting shadow meta for', $tName );
				$attrs = array_merge(
					$attribs,
					[ new KV( 'typeof', 'mw:EndTag' ), new KV( 'data-etag', $tName ) ]
				);
				$this->dispatcher->comment(
					WTUtils::fosterCommentData( 'mw:shadow', $this->kvArrToFoster( $attrs ), false ),
					0, 0
				);
			}
		} elseif ( $token instanceof CommentTk ) {
			$this->dispatcher->comment( $token->value, 0, 0 );
		} elseif ( $token instanceof EOFTk ) {
			$this->dispatcher->endDocument( 0 );
		} else {
			// Unknown token kinds are logged and otherwise skipped.
			$errors = [
				'-------- Unhandled token ---------',
				'TYPE: ' . $token->getType(),
				'VAL : ' . PHPUtils::jsonEncode( $token )
			];
			$this->env->log( 'error', implode( "\n", $errors ) );
		}
	}

	/**
	 * @inheritDoc
	 */
	public function process( $input, ?array $opts = null ) {
		'@phan-var array $input'; // @var array $input
		$this->processChunk( $input );
		return $this->finalizeDOM();
	}

	/**
	 * @inheritDoc
	 */
	public function processChunkily( $input, ?array $opts = null ): Generator {
		if ( $this->prevStage ) {
			// Consume the previous stage chunk by chunk, then yield the
			// finished document exactly once.
			foreach ( $this->prevStage->processChunkily( $input, $opts ) as $chunk ) {
				'@phan-var array $chunk'; // @var array $chunk
				$this->processChunk( $chunk );
			}
			yield $this->finalizeDOM();
		} else {
			yield $this->process( $input, $opts );
		}
	}
}