` tags). * * The main entry point is the static function `ConstrainedText::escapeLine()`. */ class ConstrainedText { /** * This adds necessary escapes to a line of chunks. We provide * the `ConstrainedText#escape` function with its left and right * context, and it can determine what escapes are needed. * * The `line` parameter is an array of `ConstrainedText` *chunks* * which make up a line (or part of a line, in some cases of nested * processing). * * @param ConstrainedText[] $line * @return string */ public static function escapeLine( array $line ): string { // The left context will be precise (that is, it is the result // of `ConstrainedText#escape` and will include any escapes // triggered by chunks on the left), but the right context // is just the (unescaped) text property from the chunk. // As we work left to right we will piece together a fully-escaped // string. Be careful not to shoot yourself in the foot -- if the // escaped text is significantly different from the chunk's `text` // property, the preceding chunk may not have made the correct // decisions about emitting an escape suffix. We could solve // this by looping until the state converges (or until we detect // a loop) but for now let's hope that's not necessary. $state = new State( $line ); $safeLeft = ''; for ( $state->pos = 0; $state->pos < count( $line ); $state->pos++ ) { $chunk = $line[$state->pos]; // Process the escapes for this chunk, given escaped previous chunk $state->rightContext = substr( $state->rightContext, strlen( $chunk->text ) ); $thisEscape = $chunk->escape( $state ); $state->leftContext .= ( $thisEscape->prefix ?? '' ) . $thisEscape->text . ( $thisEscape->suffix ?? '' ); if ( $thisEscape->greedy ) { // protect the left context: this will be matched greedily // by this chunk, so there's no chance that a subsequent // token will include this in its prefix. $safeLeft .= $state->leftContext; $state->leftContext = ''; } } // right context should be empty here. return $safeLeft . $state->leftContext; } /** * The wikitext string associated with this chunk. * @var string */ public $text; /** * The DOM Node associated with this chunk. * @var DOMNode */ public $node; /** * The prefix string to add if the start of the chunk doesn't match its * constraints. * @var string */ public $prefix; /** * The suffix string to add if the end of the chunk doesn't match its * constraints. * @var string */ public $suffix; /** * Does this chunk come from selser? * @var bool */ public $selser; /** * Suppress separators? * @var bool */ public $noSep; /** * @param array{text:string,node:DOMNode,prefix?:string,suffix?:string} $args Options. */ public function __construct( array $args ) { $this->text = $args['text']; $this->node = $args['node']; $this->prefix = $args['prefix'] ?? null; $this->suffix = $args['suffix'] ?? null; $this->selser = false; $this->noSep = false; } /** * Ensure that the argument `o`, which is perhaps a string, is a instance of * `ConstrainedText`. * @param string|ConstrainedText $o * @param DOMNode $node * The {@link DOMNode} corresponding to `o`. * @return ConstrainedText */ public static function cast( $o, DOMNode $node ): ConstrainedText { if ( $o instanceof ConstrainedText ) { return $o; } return new ConstrainedText( [ 'text' => $o ?? '', 'node' => $node ] ); } /** * Use the provided `state`, which gives context and access to the entire * list of chunks, to determine the proper escape prefix/suffix. * Returns an object with a `text` property as well as optional * `prefix` and 'suffix' properties giving desired escape strings. * @param State $state Context state * @return Result */ public function escape( State $state ): Result { // default implementation: no escaping, no prefixes or suffixes. return new Result( $this->text, $this->prefix, $this->suffix ); } /** * Simple equality. This enforces type equality * (ie subclasses are not equal). * @param ConstrainedText $ct * @return bool */ public function equals( ConstrainedText $ct ): bool { return $this === $ct || ( get_class( $this ) === self::class && get_class( $ct ) === self::class && $this->text === $ct->text ); } /** * Useful shortcut: execute a regular expression on the raw wikitext. * @param string $re * @return array|null * An array containing the matched results or null if there were no matches. */ public function match( string $re ): ?array { $r = preg_match( $re, $this->text, $m ); if ( $r === false ) { throw new \Error( 'Bad regular expression' ); } return $r === 0 ? null : $m; } /** * SelSer support: when we come across an unmodified node in during * selective serialization, we know we can use the original wikitext * for that node unmodified. *But* there may be boundary conditions * on the left and right sides of the selser'ed text which are going * to require escaping. * * So rather than turning the node into a plain old `ConstrainedText` * chunk, allow subclasses of `ConstrainedText` to register as potential * handlers of selser nodes. A selser'ed magic link, for example, * will then turn into a `MagicLinkText` and thus be able to enforce * the proper boundary constraints. * * @param string $text * @param DOMElement $node * @param stdClass $dataParsoid * @param Env $env * @param array $opts * @return ConstrainedText[] */ public static function fromSelSer( string $text, DOMElement $node, stdClass $dataParsoid, Env $env, array $opts = [] ): array { // Main dispatch point: iterate through registered subclasses, asking // each if they can handle this node (by invoking `fromSelSerImpl`). // We define parent types before subtypes, so search the list backwards // to be sure we check subtypes before parent types. $types = self::$types; for ( $i = count( $types ) - 1; $i >= 0; $i-- ) { $ct = call_user_func( [ $types[$i], 'fromSelSerImpl' ], $text, $node, $dataParsoid, $env, $opts ); if ( !$ct ) { continue; } if ( !is_array( $ct ) ) { $ct = [ $ct ]; } // tag these chunks as coming from selser foreach ( $ct as $t ) { $t->selser = true; } return $ct; } // ConstrainedText::fromSelSerImpl should handle everything which reaches it // so nothing should make it here. throw new \Error( 'Should never happen.' ); } /** * Base case: the given node type does not correspond to a special * `ConstrainedText` subclass. We still have to be careful: the leftmost * (rightmost) children of `node` may still be exposed to our left (right) * context. If so (ie, their DSR bounds coincide) split the selser text * and emit multiple `ConstrainedText` chunks to preserve the proper * boundary conditions. * * @param string $text * @param DOMElement $node * @param stdClass $dataParsoid * @param Env $env * @param array $opts * @return ConstrainedText|ConstrainedText[] */ protected static function fromSelSerImpl( string $text, DOMElement $node, stdClass $dataParsoid, Env $env, array $opts ) { // look at leftmost and rightmost children, it may be that we need // to turn these into ConstrainedText chunks in order to preserve // the proper escape conditions on the prefix/suffix text. $firstChild = DOMUtils::firstNonDeletedChild( $node ); $lastChild = DOMUtils::lastNonDeletedChild( $node ); $firstChildDp = $firstChild instanceof DOMElement ? DOMDataUtils::getDataParsoid( $firstChild ) : null; $lastChildDp = $lastChild instanceof DOMElement ? DOMDataUtils::getDataParsoid( $lastChild ) : null; $prefixChunks = []; $suffixChunks = []; $len = null; $ignorePrefix = $opts['ignorePrefix'] ?? false; $ignoreSuffix = $opts['ignoreSuffix'] ?? false; // check to see if first child's DSR start is the same as this node's // DSR start. If so, the first child is exposed to the (modified) // left-hand context, and so recursively convert it to the proper // list of specialized chunks. if ( !$ignorePrefix && $firstChildDp && Utils::isValidDSR( $firstChildDp->dsr ?? null ) && $dataParsoid->dsr->start === $firstChildDp->dsr->start ) { DOMUtils::assertElt( $firstChild ); // implied by $firstChildDp $len = $firstChildDp->dsr->length(); if ( $len < 0 ) { // T254412: Bad DSR $env->log( "error/html2wt/dsr", "Bad DSR: " . PHPUtils::jsonEncode( $firstChildDp->dsr ), "Node: " . DOMCompat::getOuterHTML( $firstChild ) ); $prefixChunks = []; } else { if ( $len > strlen( $text ) ) { // T254412: Bad DSR $env->log( "error/html2wt/dsr", "Bad DSR: " . PHPUtils::jsonEncode( $firstChildDp->dsr ), "Node: " . DOMCompat::getOuterHTML( $firstChild ) ); $len = strlen( $text ); } $prefixChunks = self::fromSelSer( substr( $text, 0, $len ), $firstChild, $firstChildDp, $env, // this child node's right context will be protected: [ 'ignoreSuffix' => true ] ); $text = substr( $text, $len ); } } // check to see if last child's DSR end is the same as this node's // DSR end. If so, the last child is exposed to the (modified) // right-hand context, and so recursively convert it to the proper // list of specialized chunks. if ( !$ignoreSuffix && $lastChild !== $firstChild && $lastChildDp && Utils::isValidDSR( $lastChildDp->dsr ?? null ) && $dataParsoid->dsr->end === $lastChildDp->dsr->end ) { DOMUtils::assertElt( $lastChild ); // implied by $lastChildDp $len = $lastChildDp->dsr->length(); if ( $len < 0 ) { // T254412: Bad DSR $env->log( "error/html2wt/dsr", "Bad DSR: " . PHPUtils::jsonEncode( $lastChildDp->dsr ), "Node: " . DOMCompat::getOuterHTML( $lastChild ) ); $suffixChunks = []; } else { if ( $len > strlen( $text ) ) { // T254412: Bad DSR $env->log( "error/html2wt/dsr", "Bad DSR: " . PHPUtils::jsonEncode( $lastChildDp->dsr ), "Node: " . DOMCompat::getOuterHTML( $lastChild ) ); $len = strlen( $text ); } $suffixChunks = self::fromSelSer( substr( $text, -$len ), $lastChild, $lastChildDp, $env, // this child node's left context will be protected: [ 'ignorePrefix' => true ] ); $text = substr( $text, 0, -$len ); } } // glue together prefixChunks, whatever's left of `text`, and suffixChunks $chunks = [ self::cast( $text, $node ) ]; $chunks = array_merge( $prefixChunks, $chunks, $suffixChunks ); // top-level chunks only: if ( !( $ignorePrefix || $ignoreSuffix ) ) { // ensure that the first chunk belongs to `node` in order to // emit separators correctly before `node` if ( $chunks[0]->node !== $node ) { array_unshift( $chunks, self::cast( '', $node ) ); } // set 'noSep' flag on all but the first chunk, so we don't get // extra separators from `SSP.emitChunk` foreach ( $chunks as $i => $t ) { if ( $i > 0 ) { $t->noSep = true; } } } return $chunks; } /** * List of types we attempt `fromSelSer` with. This should include all the * concrete subclasses of `ConstrainedText` (`RegExpConstrainedText` is * missing since it is an abstract class). We also include the * `ConstrainedText` class as the first element (even though it is * an abstract base class) as a little bit of a hack: it simplifies * `ConstrainedText.fromSelSer` by factoring some of its work into * `ConstrainedText.fromSelSerImpl`. * @var class-string[] */ private static $types = [ // Base class is first, as a special case self::class, // All concrete subclasses of ConstrainedText WikiLinkText::class, ExtLinkText::class, AutoURLLinkText::class, MagicLinkText::class, LanguageVariantText::class ]; }