. * @var bool */
	public $escapeText = false;

	/**
	 * Used as fast patch for special protected characters in WikitextEscapeHandlers and
	 * comes from LanguageVariantHandler
	 * @var string|null
	 */
	public $protect;

	/** @var Env */
	private $env;

	/**
	 * Node most recently passed to updateModificationFlags().
	 * @var DOMNode|null
	 */
	private $prevNode;

	/**
	 * Log prefix to use in trace output
	 * @var string
	 */
	private $logPrefix = 'OUT:';

	/**
	 * Every entry of $options that is not an Env instance is cloned (via
	 * Utils::clone) onto the property of the same name.
	 *
	 * @param WikitextSerializer $serializer
	 * @param array $options
	 */
	public function __construct( WikitextSerializer $serializer, array $options = [] ) {
		$this->env = $serializer->env;
		$this->serializer = $serializer;
		$this->extApi = new ParsoidExtensionAPI( $this->env, [ 'html2wt' => [ 'state' => $this ] ] );
		foreach ( $options as $name => $option ) {
			// PORT-FIXME validate
			if ( !( $option instanceof Env ) ) {
				$this->$name = Utils::clone( $option );
			}
		}
		$this->resetCurrLine( null );
		$this->singleLineContext = new SingleLineContext();
		$this->resetSep();
	}

	/**
	 * @note Porting note: this replaces direct access
	 * @return Env
	 */
	public function getEnv(): Env {
		return $this->env;
	}

	/**
	 * Initialize a few boolean flags based on serialization mode.
	 * FIXME: Ideally, this should be private. Requires shuffling around
	 * where SerializerState is constructed so that $selserMode is known
	 * at the time of construction.
	 * @private for use by WikitextSerializer only
	 * @param bool $selserMode Are we running selective serialization?
	 */
	public function initMode( bool $selserMode ): void {
		$this->useWhitespaceHeuristics =
			Semver::satisfies( $this->env->getInputContentVersion(), '>=1.7.0' );
		$this->selserMode = $selserMode;
		$this->rtTestMode = $this->rtTestMode &&
			!$this->selserMode; // Always false in selser mode.
	}

	/**
	 * Appends the separator source and updates the SOL state if necessary.
	 * @param string $src
	 */
	public function appendSep( string $src ): void {
		// Use ?? rather than ?: so that only an unset (null) buffer is replaced
		// by ''; ?: would also discard a buffered separator equal to '0'.
		$this->sep->src = ( $this->sep->src ?? '' ) . $src;
		$this->sepIntroducedSOL( $src );
	}

	/**
	 * Cycle the state after processing a node.
	 * @param DOMNode $node
	 */
	public function updateSep( DOMNode $node ): void {
		$this->sep->lastSourceNode = $node;
	}

	/**
	 * Replace the separator state with a fresh object whose constraints,
	 * source and last source node are all null.
	 */
	private function resetSep(): void {
		$this->sep = PHPUtils::arrayToObject( [
			'constraints' => null,
			'src' => null,
			'lastSourceNode' => null,
		] );
	}

	/**
	 * Reset the current line state.
	 * @param DOMNode|null $node
	 */
	private function resetCurrLine( ?DOMNode $node ): void {
		$this->currLine = (object)[
			'text' => '',
			'chunks' => [],
			'firstNode' => $node
		];
	}

	/**
	 * Process and emit a line of ConstrainedText chunks, adjusting chunk boundaries as necessary.
	 * (Start of line and end of line are always safe for ConstrainedText chunks, so we don't need
	 * to buffer more than the last line.)
	 */
	private function flushLine(): void {
		$this->out .= ConstrainedText::escapeLine( $this->currLine->chunks );
		$this->currLine->chunks = [];
	}

	/**
	 * Extracts a subset of the page source bound by the supplied indices.
	 * @param int $start Start offset, in bytes
	 * @param int $end End offset, in bytes
	 * @return string|null The requested source, or null when the range is
	 *  negative, inverted, or starts beyond the end of the old text
	 */
	public function getOrigSrc( int $start, int $end ): ?string {
		Assert::invariant( $this->selserMode, 'SerializerState::$selserMode must be set' );
		$oldText = $this->selserData->oldText;
		if (
			// A negative $start would make substr() count from the end of the
			// text and silently return unrelated source.
			$start >= 0 &&
			$start <= $end &&
			// FIXME: Having a $start greater than the source length is
			// probably a canary for corruption. Maybe we should be throwing
			// here instead. See T240053
			$start <= strlen( $oldText )
		) {
			$src = substr( $oldText, $start, $end - $start );
			// Before PHP 8.0, substr() returns false (not '') when $start equals
			// the string length; normalize so the ?string return type holds.
			return $src === false ? '' : $src;
		}
		return null;
	}

	/**
	 * Shift the "unmodified" flags by one node: the current flag becomes the
	 * previous one, the current flag is cleared, and $node is remembered as
	 * the previous node.
	 * @param DOMNode $node
	 */
	public function updateModificationFlags( DOMNode $node ): void {
		$this->prevNodeUnmodified = $this->currNodeUnmodified;
		$this->currNodeUnmodified = false;
		$this->prevNode = $node;
	}

	/**
	 * Separators put us in SOL state.
	 * @param string $sep
	 */
	private function sepIntroducedSOL( string $sep ): void {
		// Don't get tripped by newlines in comments! Be wary of nowikis added
		// by makeSepIndentPreSafe on the last line.
if ( substr( preg_replace( Utils::COMMENT_REGEXP, '', $sep ), -1 ) === "\n" ) { // Since we are stashing away newlines for emitting // before the next element, we are in SOL state wrt // the content of that next element. // // FIXME: The only serious caveat is if all these newlines // will get stripped out in the context of any parent node // that suppress newlines (ex:
foo
\nbar
" * Edited HTML : "foo
\nbar
" * Annotated DOM: "foo
\nbar
" * Expected WT : "foo\n\nbar" * ``` * * Note the additional newline between "foo" and "bar" even though originally, * there was just a single newline. * * So, even though the two P tags and the separator between them is * unmodified, it is insufficient to rely on just that. We have to look at * what has happened on the two wikitext lines onto which the two P tags * will get serialized. * * Now, if you check the code for `nextToDeletedBlockNodeInWT`, that code is * not really looking at ALL the nodes before/after the nodes that could * serialize onto the wikitext lines. It is looking at the immediately * adjacent nodes, i.e. it is not necessary to look if a block-tag was * deleted 2 or 5 siblings away. If we had to actually examine all of those, * nodes, this would get very complex, and it would be much simpler to just * discard the original separators => potentially lots of dirty diffs. * * To understand why it is sufficient (for correctness) to examine just * the immediately adjacent nodes, let us look at an additional example. * ``` * Original WT : "aa
c
e
\nf
" * ``` * Note how `` tags interleave in the HTML. This would be * the case always no matter how much inline content showed up between the * block tags in wikitext. If the b-`
a
*b
// -- EDIT -->a
*b
// -- html2wt --> a\n\na
,*b
, and#c
// will be marked unmodified and will be processed below. if ( $this->selserMode && $this->onSOL && $this->currNodeUnmodified // 'node' came from original Parsoid HTML unmodified. So, if its content // needs nowiki-escaping, we know that the reason it didn't parse into // lists/headings/whatever is because it didn't occur at the start of the // line => it had a block-tag in the original wikitext. So if the previous // node was also unmodified (and since it also came from original Parsoid // HTML), we can safely infer that it couldn't have been an inline node or // a P-tag (if it were, the p-wrapping code would have swallowed that content // into 'node'). So, it would have to be some sort of block tag => this.onSOL // couldn't have been true (because we could have serialized 'node' on the // same line as the block tag) => we can save some effort by eliminating // scenarios where 'this.prevNodeUnmodified' is true. && !$this->prevNodeUnmodified && $node->nodeName === 'p' && !WTUtils::isLiteralHTMLNode( $node ) ) { $pChild = DOMUtils::firstNonSepChild( $node ); // If a text node, we have to make sure that the text doesn't // get reparsed as non-text in the wt2html pipeline. if ( $pChild && DOMUtils::isText( $pChild ) ) { $match = $res->match( $this->solWikitextRegexp() ); if ( $match && isset( $match[2] ) ) { if ( preg_match( '/^([\*#:;]|{\||.*=$)/D', $match[2] ) // ! and | chars are harmless outside tables || ( preg_match( '/^[\|!]/', $match[2] ) && $this->wikiTableNesting > 0 ) // indent-pres are suppressed inside|| ( preg_match( '/^ [^\s]/', $match[2] ) && !DOMUtils::hasAncestorOfName( $node, 'blockquote' ) ) ) { $res = ConstrainedText::cast( ( $match[1] ?: '' ) . '' . substr( $match[2], 0, 1 ) . ' ' . 
substr( $match[2], 1 ), $node );
						}
					}
				}
			}
		}

		// Emitting text that has not been escaped
		$this->currLine->text .= $res->text;

		// Output res
		$this->serializer->trace( '--->', $this->logPrefix, function () use ( $res ) {
			return PHPUtils::jsonEncode( $res->text );
		} );
		$this->pushToCurrLine( $res, $node );

		// Update sol flag. Test for newlines followed by optional includeonly or comments
		if ( !$res->match( $this->solRegexp() ) ) {
			$this->onSOL = false;
		}

		// We've emitted something so we're no longer at SOO.
		$this->atStartOfOutput = false;
	}

	/**
	 * Serialize the children of a DOM node, sharing the global serializer state.
	 * Typically called by a DOM-based handler to continue handling its children.
	 * @param DOMElement $node
	 * @param callable|null $wtEscaper ( $state, $text, $opts )
	 *   PORT-FIXME document better; should this be done via WikitextEscapeHandlers somehow?
	 * @param DOMNode|null $firstChild Child to start from; defaults to the
	 *   first child of $node
	 */
	public function serializeChildren(
		DOMElement $node, ?callable $wtEscaper = null, ?DOMNode $firstChild = null
	): void {
		// SSS FIXME: Unsure if this is the right thing always
		if ( $wtEscaper ) {
			$this->wteHandlerStack[] = $wtEscaper;
		}

		try {
			$child = $firstChild ?? $node->firstChild;
			while ( $child !== null ) {
				// We always get the next child to process
				$child = $this->serializer->serializeNode( $child );
			}
		} finally {
			// Pop even when serialization throws, so a failed subtree does not
			// leave its escape handler active for later nodes.
			if ( $wtEscaper ) {
				array_pop( $this->wteHandlerStack );
			}
		}

		// If we serialized children explicitly,
		// we were obviously processing a modified node.
		$this->currNodeUnmodified = false;
	}

	/**
	 * Abstracts some steps taken in `serializeChildrenToString` and `serializeDOM`
	 * @param DOMElement $node
	 * @param callable|null $wtEscaper See {@link serializeChildren()}
	 * @internal For use by WikitextSerializer only
	 */
	public function kickOffSerialize( DOMElement $node, ?callable $wtEscaper = null ): void {
		$this->updateSep( $node );
		// Cleared before updateModificationFlags() so that the "previous node
		// unmodified" flag it derives from this value is false as well.
		$this->currNodeUnmodified = false;
		$this->updateModificationFlags( $node );
		$this->resetCurrLine( $node->firstChild );
		$this->serializeChildren( $node, $wtEscaper );
		// Emit child-parent seps.
		$this->emitSepForNode( $node );
		// We've reached EOF, flush the remaining buffered text.
		$this->flushLine();
	}

	/**
	 * Serialize children to a string
	 *
	 * The output buffer, separator, SOL/start-of-output flags, current line,
	 * log prefix, modification flags and the $inState flag are saved before
	 * the nested serialization and restored afterwards, including when the
	 * nested serialization throws.
	 *
	 * FIXME(arlorla): Shouldn't affect the separator state, but accidents
	 * have been known to happen. T109793 suggests using its own wts / state.
	 *
	 * @param DOMElement $node
	 * @param callable|null $wtEscaper See {@link serializeChildren()}
	 * @param string $inState Name of the context flag to raise while serializing
	 * @return string
	 */
	private function serializeChildrenToString(
		DOMElement $node, ?callable $wtEscaper, string $inState
	): string {
		$states = [ 'inLink', 'inCaption', 'inIndentPre', 'inHTMLPre', 'inPHPBlock', 'inAttribute' ];
		Assert::parameter( in_array( $inState, $states, true ), '$inState', 'Must be one of: '
			. implode( ', ', $states ) );
		// FIXME: Make sure that the separators emitted here conform to the
		// syntactic constraints of syntactic context.
		$oldSep = $this->sep;
		$oldSOL = $this->onSOL;
		$oldOut = $this->out;
		$oldStart = $this->atStartOfOutput;
		$oldCurrLine = $this->currLine;
		$oldLogPrefix = $this->logPrefix;
		// Remember the flag's previous value: when calls for the same context
		// nest, the inner call must not clear the flag for the outer one.
		$oldInState = $this->$inState ?? false;
		// Modification flags
		$oldPrevNodeUnmodified = $this->prevNodeUnmodified;
		$oldCurrNodeUnmodified = $this->currNodeUnmodified;
		$oldPrevNode = $this->prevNode;

		$this->out = '';
		$this->logPrefix = 'OUT(C):';
		$this->resetSep();
		$this->onSOL = false;
		$this->atStartOfOutput = false;
		$this->$inState = true;

		try {
			$this->kickOffSerialize( $node, $wtEscaper );
			$bits = $this->out;
		} finally {
			// restore the state
			$this->out = $oldOut;
			$this->$inState = $oldInState;
			$this->sep = $oldSep;
			$this->onSOL = $oldSOL;
			$this->atStartOfOutput = $oldStart;
			$this->currLine = $oldCurrLine;
			$this->logPrefix = $oldLogPrefix;
			// Modification flags
			$this->prevNodeUnmodified = $oldPrevNodeUnmodified;
			$this->currNodeUnmodified = $oldCurrNodeUnmodified;
			$this->prevNode = $oldPrevNode;
		}

		return $bits;
	}

	/**
	 * Serialize children of a link to a string
	 * @param DOMElement $node
	 * @param callable|null $wtEscaper See {@link serializeChildren()}
	 * @return string
	 */
	public function serializeLinkChildrenToString( DOMElement $node, ?callable $wtEscaper = null ): string {
		return $this->serializeChildrenToString( $node, $wtEscaper, 'inLink' );
	}

	/**
	 * Serialize children of a caption to a string
	 * @param DOMElement $node
	 * @param callable|null $wtEscaper See {@link serializeChildren()}
	 * @return string
	 */
	public function serializeCaptionChildrenToString( DOMElement $node, ?callable $wtEscaper = null ): string {
		return $this->serializeChildrenToString( $node, $wtEscaper, 'inCaption' );
	}

	/**
	 * Serialize children of an indent-pre to a string
	 * @param DOMElement $node
	 * @param callable|null $wtEscaper See {@link serializeChildren()}
	 * @return string
	 */
	public function serializeIndentPreChildrenToString( DOMElement $node, ?callable $wtEscaper = null ): string {
		return $this->serializeChildrenToString( $node, $wtEscaper, 'inIndentPre' );
	}
}