true ] );
parent::__construct( $manager, $newOptions );
$this->tokenizer = new PegTokenizer( $this->env );
$this->templateHandler = new TemplateHandler( $manager, $options );
$this->reset();
}
private function reset() {
$this->srcOffset = 0;
$this->sol = true;
$this->tokenBuf = [];
$this->wikiTableNesting = 0;
// This marker tries to track the most recent table-cell token (td/th)
// that was converted to string. For those, we want to get rid
// of their corresponding mw:TSRMarker meta tag.
//
// This marker is set when we convert a td/th token to string
//
// This marker is cleared in one of the following scenarios:
// 1. When we clear a mw:TSRMarker corresponding to the token set earlier
// 2. When we change table nesting
// 3. When we hit a tr/td/th/caption token that wasn't converted to string
$this->lastConvertedTableCellToken = null;
}
/**
* @inheritDoc
*/
public function onNewline( NlTk $token ) {
$this->manager->env->log( 'trace/tsp', $this->manager->pipelineId,
function () use ( $token ) {
return PHPUtils::jsonEncode( $token );
}
);
$this->srcOffset = $token->dataAttribs->tsr->end ?? null;
$this->sol = true;
$this->tokenBuf[] = $token;
return [ 'tokens' => [] ];
}
/**
* @inheritDoc
*/
public function onEnd( EOFTk $token ) {
$res = $this->onAny( $token );
$this->reset();
return $res;
}
/**
* Clear start of line info
*/
private function clearSOL() {
// clear tsr and sol flag
$this->srcOffset = null;
$this->sol = false;
}
/**
* @param Token $token
* @return array
*/
private function convertTokenToString( Token $token ): array {
$da = $token->dataAttribs;
$tsr = $da->tsr ?? null;
if ( $tsr && $tsr->end > $tsr->start ) {
// > will only hold if these are valid numbers
$str = $tsr->substr( $this->manager->getFrame()->getSrcText() );
// sol === false ensures that the pipe will not be parsed as a
again
$toks = $this->tokenizer->tokenizeSync( $str, [ 'sol' => false ] );
array_pop( $toks ); // pop EOFTk
// Update tsr
TokenUtils::shiftTokenTSR( $toks, $tsr->start );
$ret = [];
for ( $i = 0; $i < count( $toks ); $i++ ) {
$t = $toks[$i];
if ( !$t ) {
continue;
}
// Reprocess magic words to completion.
// FIXME: This doesn't handle any templates that got retokenized.
// That requires processing this whole thing in a tokens/x-mediawiki
// pipeline which is not possible right now because TSP runs in the
// synchronous 3rd phase. So, not tackling that in this patch.
// This has been broken for the longest time and feels similar to
// https://gerrit.wikimedia.org/r/#/c/105018/
// All of these need uniform handling. To be addressed separately
// if this proves to be a real problem on production pages.
if ( $t instanceof SelfclosingTagTk && $t->getName() === 'template' ) {
$t = $this->templateHandler->processSpecialMagicWord( $this->atTopLevel, $t ) ?? [ $t ];
} else {
$t = [ $t ];
}
$ret = array_merge( $ret, $t );
}
return $ret;
} elseif ( !empty( $da->autoInsertedStart ) && !empty( $da->autoInsertedEnd ) ) {
return [ '' ];
} else {
// SSS FIXME: What about "!!" and "||"??
switch ( $token->getName() ) {
case 'td':
return [ '|' ];
case 'th':
return [ '!' ];
case 'tr':
return [ '|-' ];
case 'caption':
return [ $token instanceof TagTk ? '|+' : '' ];
case 'table':
if ( $token instanceof EndTagTk ) {
return [ '|}' ];
}
}
// No conversion if we get here
return [ $token ];
}
}
/**
* @inheritDoc
*/
public function onAny( $token ) {
$this->manager->env->log( 'trace/tsp', $this->manager->pipelineId,
function () use ( $token ) {
return PHPUtils::jsonEncode( $token );
} );
$tokens = [ $token ];
$tc = TokenUtils::getTokenType( $token );
switch ( $tc ) {
case 'string':
// While we are buffering newlines to suppress them
// in case we see a category, buffer all intervening
// white-space as well.
if ( count( $this->tokenBuf ) > 0 && preg_match( '/^\s*$/D', $token ) ) {
$this->tokenBuf[] = $token;
return [ 'tokens' => [] ];
}
// TRICK #1:
// Attempt to match "{|" after a newline and convert
// it to a table token.
if ( $this->sol ) {
if ( $this->atTopLevel && preg_match( '/^\{\|/', $token ) ) {
// Reparse string with the 'table_start_tag' rule
// and shift tsr of result tokens by source offset
$retoks = $this->tokenizer->tokenizeAs( $token, 'table_start_tag', /* sol */true );
if ( $retoks === false ) {
// XXX: The string begins with table start syntax,
// we really shouldn't be here. Anything else on the
// line would get swallowed up as attributes.
$this->manager->env->log( 'error', 'Failed to tokenize table start tag.' );
$this->clearSOL();
} else {
TokenUtils::shiftTokenTSR( $retoks, $this->srcOffset );
$tokens = $retoks;
$this->wikiTableNesting++;
$this->lastConvertedTableCellToken = null;
}
} elseif ( preg_match( '/^\s*$/D', $token ) ) {
// White-space doesn't change SOL state
// Update srcOffset
$this->srcOffset += strlen( $token );
} else {
$this->clearSOL();
}
} else {
$this->clearSOL();
}
break;
case 'CommentTk':
// Comments don't change SOL state
// Update srcOffset
$this->srcOffset = $token->dataAttribs->tsr->end ?? null;
break;
case 'SelfclosingTagTk':
if ( $token->getName() === 'meta' && ( $token->dataAttribs->stx ?? '' ) !== 'html' ) {
$this->srcOffset = $token->dataAttribs->tsr->end ?? null;
if ( TokenUtils::hasTypeOf( $token, 'mw:TSRMarker' ) &&
$this->lastConvertedTableCellToken !== null &&
$this->lastConvertedTableCellToken->getName() === $token->getAttribute( 'data-etag' )
) {
// Swallow the token and clear the marker
$this->lastConvertedTableCellToken = null;
return [ 'tokens' => [] ];
} elseif (
count( $this->tokenBuf ) > 0 &&
TokenUtils::hasTypeOf( $token, 'mw:Transclusion' )
) {
// If we have buffered newlines, we might very well encounter
// a category link, so continue buffering.
$this->tokenBuf[] = $token;
return [ 'tokens' => [] ];
}
} elseif ( $token->getName() === 'link' &&
$token->getAttribute( 'rel' ) === 'mw:PageProp/Category'
) {
// Replace buffered newline & whitespace tokens with mw:EmptyLine
// meta-tokens. This tunnels them through the rest of the transformations
// without affecting them. During HTML building, they are expanded
// back to newlines / whitespace.
$n = count( $this->tokenBuf );
if ( $n > 0 ) {
$i = 0;
while ( $i < $n &&
!( $this->tokenBuf[$i] instanceof SelfclosingTagTk )
) {
$i++;
}
$toks = [
new SelfclosingTagTk( 'meta',
[ new KV( 'typeof', 'mw:EmptyLine' ) ],
(object)[ 'tokens' => array_slice( $this->tokenBuf, 0, $i ) ]
)
];
if ( $i < $n ) {
$toks[] = $this->tokenBuf[$i];
if ( $i + 1 < $n ) {
$toks[] = new SelfclosingTagTk( 'meta',
[ new KV( 'typeof', 'mw:EmptyLine' ) ],
(object)[ 'tokens' => array_slice( $this->tokenBuf, $i + 1 ) ]
);
}
}
$tokens = array_merge( $toks, $tokens );
$this->tokenBuf = [];
}
$this->clearSOL();
} else {
$this->clearSOL();
}
break;
case 'TagTk':
if ( $this->atTopLevel && !TokenUtils::isHTMLTag( $token ) ) {
if ( $token->getName() === 'table' ) {
$this->lastConvertedTableCellToken = null;
$this->wikiTableNesting++;
} elseif (
in_array(
$token->getName(),
[ 'td', 'th', 'tr', 'caption' ],
true
)
) {
if ( $this->wikiTableNesting === 0 ) {
if ( $token->getName() === 'td' || $token->getName() === 'th' ) {
$this->lastConvertedTableCellToken = $token;
}
$tokens = $this->convertTokenToString( $token );
} else {
$this->lastConvertedTableCellToken = null;
}
}
}
$this->clearSOL();
break;
case 'EndTagTk':
if ( $this->atTopLevel && !TokenUtils::isHTMLTag( $token ) ) {
if ( $this->wikiTableNesting > 0 ) {
if ( $token->getName() === 'table' ) {
$this->lastConvertedTableCellToken = null;
$this->wikiTableNesting--;
}
} elseif ( $token->getName() === 'table' || $token->getName() === 'caption' ) {
// Convert this to "|}"
$tokens = $this->convertTokenToString( $token );
}
}
$this->clearSOL();
break;
default:
break;
}
// Emit buffered newlines (and a transclusion meta-token, if any)
if ( count( $this->tokenBuf ) > 0 ) {
$tokens = array_merge( $this->tokenBuf, $tokens );
$this->tokenBuf = [];
}
return [ 'tokens' => $tokens ];
}
}
|