/**
* Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several
* chunks of tokens (one chunk per top-level block matched) and eventually an
* end event. Tokens map to HTML tags as far as possible, with custom tokens
* used where further processing on the token stream is needed.
*/
{
use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\Config\SiteConfig;
use Wikimedia\Parsoid\Config\WikitextConstants;
use Wikimedia\Parsoid\Core\DomSourceRange;
use Wikimedia\Parsoid\Tokens\CommentTk;
use Wikimedia\Parsoid\Tokens\EndTagTk;
use Wikimedia\Parsoid\Tokens\EOFTk;
use Wikimedia\Parsoid\Tokens\KV;
use Wikimedia\Parsoid\Tokens\KVSourceRange;
use Wikimedia\Parsoid\Tokens\NlTk;
use Wikimedia\Parsoid\Tokens\SelfclosingTagTk;
use Wikimedia\Parsoid\Tokens\SourceRange;
use Wikimedia\Parsoid\Tokens\TagTk;
use Wikimedia\Parsoid\Tokens\Token;
use Wikimedia\Parsoid\Utils\TokenUtils;
use Wikimedia\Parsoid\Utils\Utils;
use Wikimedia\Parsoid\Utils\PHPUtils;
use Wikimedia\Parsoid\Utils\WTUtils;
}
{
/** @var Env Parsing environment, read from $this->options['env'] in initialize() */
private $env;
/** @var SiteConfig Site configuration obtained via $this->env->getSiteConfig() */
private $siteConfig;
/** @var array Options returned by getOptions() on the 'pegTokenizer' option */
private $pipelineOpts;
/** @var int Offset applied to token TSRs in emitChunk(); 0 when the 'pipelineOffset' option is unset */
private $pipelineOffset;
/**
 * Extension tag name map, as returned by SiteConfig::getExtensionTagNameMap().
 * NOTE(review): the exact shape of the map is not visible here — confirm against SiteConfig.
 */
private $extTags;
/**
 * Populate per-parse state from the options handed to the generated parser.
 *
 * Reads the 'env' and 'pegTokenizer' options, plus the optional
 * 'pipelineOffset' option (treated as 0 when absent).
 */
protected function initialize() {
	$opts = $this->options;

	$env = $opts['env'];
	$siteConfig = $env->getSiteConfig();
	$this->env = $env;
	$this->siteConfig = $siteConfig;

	$pegTokenizer = $opts['pegTokenizer'];
	$this->pipelineOpts = $pegTokenizer->getOptions();
	$this->pipelineOffset = $opts['pipelineOffset'] ?? 0;

	$this->extTags = $siteConfig->getExtensionTagNameMap();
}
/**
 * @var int Starts at 0.
 * NOTE(review): not referenced in this part of the file — presumably the
 * offset of a previously processed position; confirm against its users.
 */
private $prevOffset = 0;
/**
 * @var int Starts at 0.
 * NOTE(review): presumably a running count of headings seen so far —
 * confirm against the heading rule.
 */
private $headingIndex = 0;
/**
 * Throw if a grammar invariant does not hold.
 *
 * @param mixed $condition Value checked for truthiness
 * @param string $text Description appended to the exception message
 * @throws \Exception when $condition is falsy
 */
private function assert( $condition, $text ) {
	if ( $condition ) {
		// Invariant holds; nothing to report.
		return;
	}
	throw new \Exception( "Grammar.pegphp assertion failure: $text" );
}
/**
 * Abort from a code path that is never supposed to execute.
 *
 * @throws \Exception always
 */
private function unreachable() {
	$message = "Grammar.pegphp: this should be unreachable";
	throw new \Exception( $message );
}
// Some shorthands for legibility
/**
 * Shorthand for the parser's saved position.
 *
 * tsrOffsets() uses this same value as the start of a source range.
 * NOTE(review): presumably the input offset where the current match
 * began, as maintained by the generated parser — confirm.
 *
 * @return mixed Current value of $this->savedPos
 */
private function startOffset() {
return $this->savedPos;
}
/**
 * Shorthand for the parser's current position.
 *
 * tsrOffsets() uses this same value as the end of a source range.
 * NOTE(review): presumably the input offset just past the text matched
 * so far, as maintained by the generated parser — confirm.
 *
 * @return mixed Current value of $this->currPos
 */
private function endOffset() {
return $this->currPos;
}
/**
 * Build a SourceRange from the parser's saved and current positions.
 *
 * - 'start': zero-width range at the saved position
 * - 'end': zero-width range at the current position
 * - anything else: range from the saved position to the current position
 *
 * @param string $flag Which range to build
 * @return SourceRange
 */
private function tsrOffsets( $flag = 'default' ): SourceRange {
	$from = $this->savedPos;
	$to = $this->currPos;

	// Loose comparison is deliberate: it mirrors how a switch statement
	// matches its case labels.
	if ( $flag == 'start' ) {
		$to = $from;
	} elseif ( $flag == 'end' ) {
		$from = $to;
	}

	return new SourceRange( $from, $to );
}
/**
 * Post-process a chunk of tokens and hand it back to the caller.
 *
 * In order, this:
 *  1. shifts the tsr of every token by the pipeline offset,
 *  2. writes the chunk to the 'trace/peg' log channel, tagged with the
 *     'pipelineId' option ('0' when that option is unset),
 *  3. enforces the parser resource limits for each token.
 *
 * NOTE(review): the return value of shiftTokenTSR() is ignored, so it is
 * presumably expected to update the tokens in place — confirm.
 * NOTE(review): an earlier description of this method said that chunks
 * are emitted to consumers and that the calling expression can then
 * return an empty list; as written, the tokens are returned to the caller.
 *
 * @param array $tokens Tokens making up the chunk
 * @return array The same $tokens value that was passed in
 */
private function emitChunk( $tokens ) {
	// Shift tsr of all tokens by the pipeline offset
	TokenUtils::shiftTokenTSR( $tokens, $this->pipelineOffset );
	$this->env->log( 'trace/peg', $this->options['pipelineId'] ?? '0', '----> ', $tokens );

	// Enforce parsing resource limits
	foreach ( $tokens as $token ) {
		TokenizerUtils::enforceParserResourceLimits( $this->env, $token );
	}

	return $tokens;
}
/* ------------------------------------------------------------------------
* Extension tags should be parsed with higher priority than anything else.
*
* The trick we use is to strip out the content inside a matching tag-pair
* and not tokenize it. The content, if it needs to be parsed (for example,
* for <ref>, <*include*> tags), is parsed in a fresh tokenizer context
* which means any error correction that needs to happen is restricted to
* the scope of the extension content and doesn't spill over to the higher
* level. Ex: ]