$value ) { if ( is_array( $value ) ) { $ret = array_merge( $ret, array_flatten( $value ) ); } else { $ret[$key] = $value; } } return $ret; } /** * MediaWiki-compatible italic/bold handling as a token stream transformation. */ class QuoteTransformer extends TokenHandler { /** Chunks alternate between quote tokens and sequences of non-quote * tokens. The quote tokens are later replaced with the actual tag * token for italic or bold. The first chunk is a non-quote chunk. * @var array */ private $chunks; /** * The current chunk we're accumulating into. * @var array */ private $currentChunk; /** * Last italic / last bold open tag seen. Used to add autoInserted flags * where necessary. * @var array */ private $last; /** * @param TokenTransformManager $manager manager environment * @param array $options options */ public function __construct( TokenTransformManager $manager, array $options ) { parent::__construct( $manager, $options ); $this->reset(); } /** * Reset the buffering of chunks * */ private function reset(): void { // Chunks alternate between quote tokens and sequences of non-quote // tokens. The quote tokens are later replaced with the actual tag // token for italic or bold. The first chunk is a non-quote chunk. $this->chunks = []; // The current chunk we're accumulating into. $this->currentChunk = []; // last italic / last bold open tag seen. Used to add autoInserted flags // where necessary. $this->last = []; $this->onAnyEnabled = false; } /** * Make a copy of the token context * */ private function startNewChunk(): void { $this->chunks[] = $this->currentChunk; $this->currentChunk = []; } /** * Handles mw-quote tokens and td/th tokens * @inheritDoc */ public function onTag( Token $token ) { $tkName = is_string( $token ) ? '' : $token->getName(); if ( $tkName === 'mw-quote' ) { return $this->onQuote( $token ); } elseif ( $tkName === 'td' || $tkName === 'th' ) { return $this->processQuotes( $token ); } else { return $token; } } /** * On encountering a NlTk, processes quotes on the current line * @inheritDoc */ public function onNewline( NlTk $token ) { return $this->processQuotes( $token ); } /** * On encountering an EOFTk, processes quotes on the current line * @inheritDoc */ public function onEnd( EOFTk $token ) { return $this->processQuotes( $token ); } /** * @inheritDoc */ public function onAny( $token ) { $this->manager->env->log( "trace/quote", $this->manager->pipelineId, "ANY |", function () use ( $token ) { return PHPUtils::jsonEncode( $token ); } ); if ( $this->onAnyEnabled ) { $this->currentChunk[] = $token; return []; } else { return $token; } } /** * Handle QUOTE tags. These are collected in italic/bold lists depending on * the length of quote string. Actual analysis and conversion to the * appropriate tag tokens is deferred until the next NEWLINE token triggers * processQuotes. * * @param Token $token token * @return array */ private function onQuote( Token $token ): array { $v = $token->getAttribute( 'value' ); $qlen = strlen( $v ); $this->manager->env->log( "trace/quote", $this->manager->pipelineId, "QUOTE |", function () use ( $token ) { return PHPUtils::jsonEncode( $token ); } ); $this->onAnyEnabled = true; if ( $qlen === 2 || $qlen === 3 || $qlen === 5 ) { $this->startNewChunk(); $this->currentChunk[] = $token; $this->startNewChunk(); } return []; } /** * Handle NEWLINE tokens, which trigger the actual quote analysis on the * collected quote tokens so far. * * @param Token $token token * @return Token|array */ private function processQuotes( Token $token ) { if ( !$this->onAnyEnabled ) { // Nothing to do, quick abort. return $token; } $this->manager->env->log( "trace/quote", $this->manager->pipelineId, "NL |", function () use( $token ) { return PHPUtils::jsonEncode( $token ); } ); if ( ( $token->getName() === 'td' || $token->getName() === 'th' ) && ( $token->dataAttribs->stx ?? '' ) === 'html' ) { return $token; } // count number of bold and italics $numbold = 0; $numitalics = 0; $chunkCount = count( $this->chunks ); for ( $i = 1; $i < $chunkCount; $i += 2 ) { // quote token Assert::invariant( count( $this->chunks[$i] ) === 1, 'Expected a single token in the chunk' ); $qlen = strlen( $this->chunks[$i][0]->getAttribute( "value" ) ); if ( $qlen === 2 || $qlen === 5 ) { $numitalics++; } if ( $qlen === 3 || $qlen === 5 ) { $numbold++; } } // balance out tokens, convert placeholders into tags if ( ( $numitalics % 2 === 1 ) && ( $numbold % 2 === 1 ) ) { $firstsingleletterword = -1; $firstmultiletterword = -1; $firstspace = -1; $chunkCount = count( $this->chunks ); for ( $i = 1; $i < $chunkCount; $i += 2 ) { // only look at bold tags if ( strlen( $this->chunks[$i][0]->getAttribute( "value" ) ) !== 3 ) { continue; } $tok = $this->chunks[$i][0]; $lastCharIsSpace = $tok->getAttribute( 'isSpace_1' ); $secondLastCharIsSpace = $tok->getAttribute( 'isSpace_2' ); if ( $lastCharIsSpace && $firstspace === -1 ) { $firstspace = $i; } elseif ( !$lastCharIsSpace ) { if ( $secondLastCharIsSpace && $firstsingleletterword === -1 ) { $firstsingleletterword = $i; // if firstsingleletterword is set, we don't need // to look at the options options, so we can bail early break; } elseif ( $firstmultiletterword === -1 ) { $firstmultiletterword = $i; } } } // now see if we can convert a bold to an italic and an apostrophe if ( $firstsingleletterword > -1 ) { $this->convertBold( $firstsingleletterword ); } elseif ( $firstmultiletterword > -1 ) { $this->convertBold( $firstmultiletterword ); } elseif ( $firstspace > -1 ) { $this->convertBold( $firstspace ); } else { // (notice that it is possible for all three to be -1 if, for // example, there is only one pentuple-apostrophe in the line) // In this case, do no balancing. } } // convert the quote tokens into tags $this->convertQuotesToTags(); // return all collected tokens including the newline $this->currentChunk[] = $token; $this->startNewChunk(); // PORT-FIXME: Is there a more efficient way of doing this? $res = [ "tokens" => array_flatten( array_merge( [], $this->chunks ) ) ]; $this->manager->env->log( "trace/quote", $this->manager->pipelineId, "----->", function () use ( $res ) { return PHPUtils::jsonEncode( $res["tokens"] ); } ); // prepare for next line $this->reset(); return $res; } /** * Convert a bold token to italic to balance an uneven number of both bold and * italic tags. In the process, one quote needs to be converted back to text. * * @param int $i index into chunks */ private function convertBold( int $i ): void { // this should be a bold tag. Assert::invariant( $i > 0 && count( $this->chunks[$i] ) === 1 && strlen( $this->chunks[$i][0]->getAttribute( "value" ) ) === 3, 'this should be a bold tag' ); // we're going to convert it to a single plain text ' plus an italic tag $this->chunks[$i - 1][] = "'"; $oldbold = $this->chunks[$i][0]; $tsr = $oldbold->dataAttribs->tsr ?? null; if ( $tsr ) { $tsr = new SourceRange( $tsr->start + 1, $tsr->end ); } $newbold = new SelfclosingTagTk( 'mw-quote', [], (object)[ "tsr" => $tsr ] ); $newbold->setAttribute( "value", "''" ); // italic! $this->chunks[$i] = [ $newbold ]; } /** * Convert quote tokens to tags, using the same state machine as the * PHP parser uses. */ private function convertQuotesToTags(): void { $lastboth = -1; $state = ''; $chunkCount = count( $this->chunks ); for ( $i = 1; $i < $chunkCount; $i += 2 ) { Assert::invariant( count( $this->chunks[$i] ) === 1, 'expected count chunks[i] == 1' ); $qlen = strlen( $this->chunks[$i][0]->getAttribute( "value" ) ); if ( $qlen === 2 ) { if ( $state === 'i' ) { $this->quoteToTag( $i, [ new EndTagTk( 'i' ) ] ); $state = ''; } elseif ( $state === 'bi' ) { $this->quoteToTag( $i, [ new EndTagTk( 'i' ) ] ); $state = 'b'; } elseif ( $state === 'ib' ) { // annoying! $this->quoteToTag( $i, [ new EndTagTk( 'b' ), new EndTagTk( 'i' ), new TagTk( 'b' ), ], true ); $state = 'b'; } elseif ( $state === 'both' ) { $this->quoteToTag( $lastboth, [ new TagTk( 'b' ), new TagTk( 'i' ) ] ); $this->quoteToTag( $i, [ new EndTagTk( 'i' ) ] ); $state = 'b'; } else { // state can be 'b' or '' $this->quoteToTag( $i, [ new TagTk( 'i' ) ] ); $state .= 'i'; } } elseif ( $qlen === 3 ) { if ( $state === 'b' ) { $this->quoteToTag( $i, [ new EndTagTk( 'b' ) ] ); $state = ''; } elseif ( $state === 'ib' ) { $this->quoteToTag( $i, [ new EndTagTk( 'b' ) ] ); $state = 'i'; } elseif ( $state === 'bi' ) { // annoying! $this->quoteToTag( $i, [ new EndTagTk( 'i' ), new EndTagTk( 'b' ), new TagTk( 'i' ), ], true ); $state = 'i'; } elseif ( $state === 'both' ) { $this->quoteToTag( $lastboth, [ new TagTk( 'i' ), new TagTk( 'b' ) ] ); $this->quoteToTag( $i, [ new EndTagTk( 'b' ) ] ); $state = 'i'; } else { // state can be 'i' or '' $this->quoteToTag( $i, [ new TagTk( 'b' ) ] ); $state .= 'b'; } } elseif ( $qlen === 5 ) { if ( $state === 'b' ) { $this->quoteToTag( $i, [ new EndTagTk( 'b' ), new TagTk( 'i' ) ] ); $state = 'i'; } elseif ( $state === 'i' ) { $this->quoteToTag( $i, [ new EndTagTk( 'i' ), new TagTk( 'b' ) ] ); $state = 'b'; } elseif ( $state === 'bi' ) { $this->quoteToTag( $i, [ new EndTagTk( 'i' ), new EndTagTk( 'b' ) ] ); $state = ''; } elseif ( $state === 'ib' ) { $this->quoteToTag( $i, [ new EndTagTk( 'b' ), new EndTagTk( 'i' ) ] ); $state = ''; } elseif ( $state === 'both' ) { $this->quoteToTag( $lastboth, [ new TagTk( 'i' ), new TagTk( 'b' ) ] ); $this->quoteToTag( $i, [ new EndTagTk( 'b' ), new EndTagTk( 'i' ) ] ); $state = ''; } else { // state == '' $lastboth = $i; $state = 'both'; } } } // now close all remaining tags. notice that order is important. if ( $state === 'both' ) { $this->quoteToTag( $lastboth, [ new TagTk( 'b' ), new TagTk( 'i' ) ] ); $state = 'bi'; } if ( $state === 'b' || $state === 'ib' ) { $this->currentChunk[] = new EndTagTk( 'b' ); $this->last["b"]->dataAttribs->autoInsertedEnd = true; } if ( $state === 'i' || $state === 'bi' || $state === 'ib' ) { $this->currentChunk[] = new EndTagTk( 'i' ); $this->last["i"]->dataAttribs->autoInsertedEnd = true; } if ( $state === 'bi' ) { $this->currentChunk[] = new EndTagTk( 'b' ); $this->last["b"]->dataAttribs->autoInsertedEnd = true; } } /** * Convert italics/bolds into tags. * * @param int $chunk chunk buffer * @param array $tags token * @param bool $ignoreBogusTwo optional defaults to false */ private function quoteToTag( int $chunk, array $tags, bool $ignoreBogusTwo = false ): void { Assert::invariant( count( $this->chunks[$chunk] ) === 1, 'expected count chunks[i] == 1' ); $result = []; $oldtag = $this->chunks[$chunk][0]; // make tsr $tsr = $oldtag->dataAttribs->tsr ?? null; $startpos = $tsr ? $tsr->start : null; $endpos = $tsr ? $tsr->end : null; $numTags = count( $tags ); for ( $i = 0; $i < $numTags; $i++ ) { if ( $tsr ) { if ( $i === 0 && $ignoreBogusTwo ) { $this->last[$tags[$i]->getName()]->dataAttribs->autoInsertedEnd = true; } elseif ( $i === 2 && $ignoreBogusTwo ) { $tags[$i]->dataAttribs->autoInsertedStart = true; } elseif ( $tags[$i]->getName() === 'b' ) { $tags[$i]->dataAttribs->tsr = new SourceRange( $startpos, $startpos + 3 ); $startpos = $tags[$i]->dataAttribs->tsr->end; } elseif ( $tags[$i]->getName() === 'i' ) { $tags[$i]->dataAttribs->tsr = new SourceRange( $startpos, $startpos + 2 ); $startpos = $tags[$i]->dataAttribs->tsr->end; } } $this->last[$tags[$i]->getName()] = ( $tags[$i]->getType() === "EndTagTk" ) ? null : $tags[$i]; $result[] = $tags[$i]; } if ( $tsr ) { Assert::invariant( $startpos === $endpos, 'Start: $startpos !== end: $endpos' ); } $this->chunks[$chunk] = $result; } }