/** * Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several * chunks of tokens (one chunk per top-level block matched) and eventually an * end event. Tokens map to HTML tags as far as possible, with custom tokens * used where further processing on the token stream is needed. */ { use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\Config\SiteConfig; use Wikimedia\Parsoid\Config\WikitextConstants; use Wikimedia\Parsoid\Core\DomSourceRange; use Wikimedia\Parsoid\Tokens\CommentTk; use Wikimedia\Parsoid\Tokens\EndTagTk; use Wikimedia\Parsoid\Tokens\EOFTk; use Wikimedia\Parsoid\Tokens\KV; use Wikimedia\Parsoid\Tokens\KVSourceRange; use Wikimedia\Parsoid\Tokens\NlTk; use Wikimedia\Parsoid\Tokens\SelfclosingTagTk; use Wikimedia\Parsoid\Tokens\SourceRange; use Wikimedia\Parsoid\Tokens\TagTk; use Wikimedia\Parsoid\Tokens\Token; use Wikimedia\Parsoid\Utils\TokenUtils; use Wikimedia\Parsoid\Utils\Utils; use Wikimedia\Parsoid\Utils\PHPUtils; use Wikimedia\Parsoid\Utils\WTUtils; } { /** @var Env */ private $env; /** @var SiteConfig */ private $siteConfig; /** @var array */ private $pipelineOpts; /** @var int */ private $pipelineOffset; private $extTags; protected function initialize() { $this->env = $this->options['env']; $this->siteConfig = $this->env->getSiteConfig(); $tokenizer = $this->options['pegTokenizer']; $this->pipelineOpts = $tokenizer->getOptions(); $this->pipelineOffset = $this->options['pipelineOffset'] ?? 
0;
	$this->extTags = $this->siteConfig->getExtensionTagNameMap();
}

// Offset bookkeeping across emitted chunks.
// NOTE(review): not referenced in this chunk of the grammar; presumably
// consumed by productions further down -- confirm before removing.
private $prevOffset = 0;

// Running counter of headings.
// NOTE(review): not referenced in this chunk; likely used to number
// heading tokens in the heading production -- confirm.
private $headingIndex = 0;

/**
 * Grammar-internal invariant check.
 * @param bool $condition Invariant that must hold
 * @param string $text Description reported when the invariant fails
 * @throws \Exception when $condition is false
 */
private function assert( $condition, $text ) {
	if ( !$condition ) {
		throw new \Exception( "Grammar.pegphp assertion failure: $text" );
	}
}

/**
 * Marker for grammar paths that should never be reached.
 * @throws \Exception always
 */
private function unreachable() {
	throw new \Exception( "Grammar.pegphp: this should be unreachable" );
}

// Some shorthands for legibility

/** Offset (in the input string) where the current match started. */
private function startOffset() {
	return $this->savedPos;
}

/** Current parse position, i.e. where the current match ends. */
private function endOffset() {
	return $this->currPos;
}

/**
 * Source range of the current match.
 * @param string $flag 'start' => zero-width range at the match start,
 *  'end' => zero-width range at the current position,
 *  'default' => the full [savedPos, currPos) range.
 * @return SourceRange
 */
private function tsrOffsets( $flag = 'default' ): SourceRange {
	switch ( $flag ) {
		case 'start':
			return new SourceRange( $this->savedPos, $this->savedPos );
		case 'end':
			return new SourceRange( $this->currPos, $this->currPos );
		default:
			return new SourceRange( $this->savedPos, $this->currPos );
	}
}

/*
 * Emit a chunk of tokens to our consumers. Once this has been done, the
 * current expression can return an empty list (true).
 */
private function emitChunk( $tokens ) {
	// Shift tsr of all tokens by the pipeline offset
	TokenUtils::shiftTokenTSR( $tokens, $this->pipelineOffset );
	$this->env->log( 'trace/peg', $this->options['pipelineId'] ?? '0', '----> ', $tokens );
	$i = null;
	$n = count( $tokens );
	// Enforce parsing resource limits
	for ( $i = 0; $i < $n; $i++ ) {
		TokenizerUtils::enforceParserResourceLimits( $this->env, $tokens[ $i ] );
	}
	return $tokens;
}

/* ------------------------------------------------------------------------
 * Extension tags should be parsed with higher priority than anything else.
 *
 * The trick we use is to strip out the content inside a matching tag-pair
 * and not tokenize it. The content, if it needs to be parsed (for example,
 * for <ref>, <*include*> tags), is parsed in a fresh tokenizer context
 * which means any error correction that needs to happen is restricted to
 * the scope of the extension content and doesn't spill over to the higher
 * level. Ex: <math><!--foo</math>.
 * ------------------------------------------------------------------------ */

// NOTE(review): the comment text below had been partially eaten by markup
// stripping in this copy; reconstructed to be self-consistent with the
// `comment` production that follows -- verify against upstream.
// The old parser does a straight str.replace(/<!--((?!-->).)*-->/g, "")
// but, as always, things around here are a little more complicated.
//
// We accept the same comments, but because we emit them as HTML comments
// instead of deleting them, we have to encode the data to ensure that
// we always emit a valid HTML5 comment. See the encodeComment helper
// for further details.
// NOTE(review): the '<!--' literal and the c:$(!"-->" .)* capture had been
// eaten by markup stripping in this copy; restored here because the action
// below reads $c and the rule must consume everything up to '-->' (or eof).
comment = '<!--' c:$(!"-->" .)* ('-->' / eof) {
	$data = WTUtils::encodeComment( $c );
	return [ new CommentTk( $data, (object)[ 'tsr' => $this->tsrOffsets() ] ) ];
}

// Behavior switches. See:
// https://www.mediawiki.org/wiki/Help:Magic_words#Behavior_switches
behavior_switch = bs:$('__' behavior_text '__') {
	if ( $this->siteConfig->isMagicWord( $bs ) ) {
		// Known magic word: emit a placeholder token to be handled
		// downstream by the behavior-switch handler.
		return [ new SelfclosingTagTk(
			'behavior-switch',
			[ new KV( 'word', $bs ) ],
			(object)[ 'tsr' => $this->tsrOffsets(), 'src' => $bs, 'magicSrc' => $bs ]
		) ];
	} else {
		// Not a known magic word: pass the text through unchanged.
		return [ $bs ];
	}
}

// Instead of defining a charset, the old parser's doDoubleUnderscore concats a
// regexp of all the language specific aliases of the behavior switches and
// then does a match and replace. Just be as permissive as possible and let the
// BehaviorSwitchPreprocessor back out of any overreach.
behavior_text = $( !'__' ( text_char / "-" ) )+

/**************************************************************
 * External (bracketed and autolinked) links
 **************************************************************/

// NOTE(review): the <extTag> flag lookaheads below had been eaten by markup
// stripping (compare the intact `<&preproc>` references elsewhere in this
// grammar); restored -- verify against upstream.
autolink = !<extTag>
	// this must be a word boundary, so previous character must be non-word
	! { return Utils::isUniWord(Utils::lastUniChar( $this->input, $this->endOffset() ) ); }
	r:( autourl / autoref / isbn ) { return $r; }

extlink "extlink" = !<extTag> // extlink cannot be nested
	r:(
		"[" p0:( "" { return $this->endOffset(); })
		addr:(url_protocol ipv6urladdr / "")
		target:(extlink_nonipv6url / "")
		p1:( "" { return $this->endOffset(); })
		& {
			// Protocol must be valid and there ought to be at least one
			// post-protocol character. So strip last char off target
			// before testing protocol.
			$flat = TokenizerUtils::flattenString( [ $addr, $target ] );
			if ( is_array( $flat ) ) { // There are templates present, alas.
return count( $flat ) > 0; } return Utils::isProtocolValid( substr( $flat, 0, -1 ), $this->env ); } sp:$( space / unispace )* p2:( "" { return $this->endOffset(); }) content:inlineline? p3:( "" { return $this->endOffset(); }) "]" { $tsr1 = new SourceRange( $p0, $p1 ); $tsr2 = new SourceRange( $p2, $p3 ); return [ new SelfclosingTagTk( 'extlink', [ new KV( 'href', TokenizerUtils::flattenString( [ $addr, $target ] ), $tsr1->expandTsrV() ), new KV( 'mw:content', $content ?? '', $tsr2->expandTsrV() ), new KV( 'spaces', $sp ) ], (object)[ 'tsr' => $this->tsrOffsets(), 'extLinkContentOffsets' => $tsr2, ] ) ]; } ) { return $r; } autoref = ref:('RFC' / 'PMID') sp:space_or_nbsp+ identifier:$[0-9]+ end_of_word { $base_urls = [ 'RFC' => 'https://tools.ietf.org/html/rfc%s', 'PMID' => '//www.ncbi.nlm.nih.gov/pubmed/%s?dopt=Abstract' ]; $tsr = $this->tsrOffsets(); return [ new SelfclosingTagTk( 'extlink', [ new KV( 'href', sprintf( $base_urls[ $ref ], $identifier ) ), new KV( 'mw:content', TokenizerUtils::flattenString( [ $ref, $sp, $identifier ] ), $tsr->expandTsrV() ), new KV( 'typeof', 'mw:ExtLink/' . $ref ) ], (object)[ 'stx' => 'magiclink', 'tsr' => $tsr ] ) ]; } isbn = 'ISBN' sp:space_or_nbsp+ isbn:( [0-9] ((space_or_nbsp_or_dash / "") [0-9])+ ((space_or_nbsp_or_dash / "") [xX] / "") ) isbncode:( end_of_word { // Convert isbn token-and-entity array to stripped string. $stripped = ''; foreach ( TokenizerUtils::flattenStringlist( $isbn ) as $part ) { if ( is_string( $part ) ) { $stripped .= $part; } } return strtoupper( preg_replace( '/[^\dX]/i', '', $stripped ) ); } ) &{ // ISBNs can only be 10 or 13 digits long (with a specific format) return strlen( $isbncode ) === 10 || ( strlen( $isbncode ) === 13 && preg_match( '/^97[89]/', $isbncode ) ); } { $tsr = $this->tsrOffsets(); return [ new SelfclosingTagTk( 'extlink', [ new KV( 'href', 'Special:BookSources/' . 
$isbncode ), new KV( 'mw:content', TokenizerUtils::flattenString( [ 'ISBN', $sp, $isbn ] ), $tsr->expandTsrV() ), new KV( 'typeof', 'mw:WikiLink/ISBN' ) ], (object)[ 'stx' => 'magiclink', 'tsr' => $tsr ] ) ]; } /* Default URL protocols in MediaWiki (see DefaultSettings). Normally * these can be configured dynamically. */ url_protocol = p:$( '//' / [A-Za-z] [-A-Za-z0-9+.]* ':' '//'? ) & { return Utils::isProtocolValid( $p, $this->env ); } { return $p; } // no punctuation, and '{<' to trigger directives no_punctuation_char = [^ \]\[\r\n"'<>\x00-\x20\x7f&\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{] // this is the general url rule // on the PHP side, the path part matches EXT_LINK_URL_CLASS // which is '[^][<>"\\x00-\\x20\\x7F\p{Zs}]' url = proto:url_protocol addr:(ipv6urladdr / "") path:( !inline_breaks c:( no_punctuation_char / comment / tplarg_or_template / ['{] / ! ( "&" ( [lL][tT] / [gG][tT] ) ";" ) r:( & "&" he:htmlentity { return $he; } / "&" ) { return $r; } ) { return $c; } )* // Must be at least one character after the protocol & { return $addr !== '' || count( $path ) > 0; } { return TokenizerUtils::flattenString( array_merge( [ $proto, $addr ], $path ) ); } // this is the somewhat-restricted rule used in autolinks // See Parser::doMagicLinks and Parser.php::makeFreeExternalLink. // The `path` portion matches EXT_LINK_URL_CLASS, as in the general // url rule. As in PHP, we do some fancy fixup to yank out // trailing punctuation, perhaps including parentheses. autourl = ! '//' // protocol-relative autolinks not allowed (T32269) r:( proto:url_protocol addr:(ipv6urladdr / "") path:( !inline_breaks c:( no_punctuation_char / comment / tplarg_or_template / $("'" !"'") // single quotes are ok, double quotes are bad / "{" / ! 
( rhe:raw_htmlentity &{ return $rhe === '<' || $rhe === '>' || $rhe === "\u{A0}"; } ) r:( & "&" he:htmlentity { return $he; } / "&" ) { return $r; } ) { return $c; } )* { // as in Parser.php::makeFreeExternalLink, we're going to // yank trailing punctuation out of this match. $url = TokenizerUtils::flattenStringlist( array_merge( [ $proto, $addr ], $path ) ); // only need to look at last element; HTML entities are strip-proof. $last = PHPUtils::lastItem( $url ); $trim = 0; if ( is_string( $last ) ) { $strip = ',;\.:!?'; if ( array_search( '(', $path ) === false ) { $strip .= ')'; } $trim = strspn( strrev( $last ), $strip ); $url[ count( $url ) - 1 ] = substr( $last, 0, strlen( $last ) - $trim ); } $url = TokenizerUtils::flattenStringlist( $url ); if ( count( $url ) === 1 && is_string( $url[0] ) && strlen( $url[0] ) <= strlen( $proto ) ) { return null; // ensure we haven't stripped everything: T106945 } $this->currPos -= $trim; return $url; } ) &{ return $r !== null; } { $tsr = $this->tsrOffsets(); $res = [ new SelfclosingTagTk( 'urllink', [ new KV( 'href', $r, $tsr->expandTsrV() ) ], (object)[ 'tsr' => $tsr ] ) ]; return $res; } // This is extracted from EXT_LINK_ADDR in Parser.php: a simplified // expression to match an IPv6 address. The IPv4 address and "at least // one character of a host name" portions are punted to the `path` // component of the `autourl` and `url` productions ipv6urladdr = $( "[" [0-9A-Fa-f:.]+ "]" ) /************************************************************** * Templates, -arguments and wikilinks **************************************************************/ /* * Precedence: template arguments win over templates. See * http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence * 4: {{{{·}}}} → {·{{{·}}}·} * 5: {{{{{·}}}}} → {{·{{{·}}}·}} * 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}} * 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·} * This is only if close has > 3 braces; otherwise we just match open * and close as we find them. 
*/ tplarg_or_template = &'{{' templatedepth: &{ // Refuse to recurse beyond `maxDepth` levels. Default in the old parser // is $wgMaxTemplateDepth = 40; This is to prevent crashing from // buggy wikitext with lots of unclosed template calls, as in // eswiki/Usuario:C%C3%A1rdenas/PRUEBAS?oldid=651094 return $templatedepth + 1 < $this->siteConfig->getMaxTemplateDepth(); } t:tplarg_or_template_guarded { return $t; } tplarg_or_template_guarded = &('{{' &('{{{'+ !'{') tplarg) a:(template/broken_template) { return $a; } / a:$('{' &('{{{'+ !'{'))? b:tplarg { return [ $a, $b ]; } / a:$('{' &('{{' !'{'))? b:template { return [ $a, $b ]; } / broken_template tplarg_or_template_or_bust = r:(tplarg_or_template / .)+ { return TokenizerUtils::flattenIfArray( $r ); } template = template_preproc<&preproc="}}"> // The old preprocessor maintains a single stack of "closing token we // are currently looking for", with no backtracking. This means that // once you see `[[ {{` you are looking only for `}}` -- if that template // turns out to be broken you will never pop the `}}` and there is no way // to close the `[[`. Since the PEG tokenizer in Parsoid uses backtracking // and parses in a single pass (instead of PHP's split preprocessor/parser) // we have to be a little more careful when we emulate this behavior. // If we use a rule like: // template = "{{" tplname tplargs* "}}"? // Then we end up having to reinterpret `tplname tplargs*` as a tlb if it // turns out we never find the `}}`, which involves a lot of tedious gluing // tokens back together with fingers crossed we haven't discarded any // significant newlines/whitespace/etc. An alternative would be a rule like: // broken_template = "{{" tlb // but again, `template` is used in many different contexts; `tlb` isn't // necessarily the right one to recursively invoke. 
Instead we get the // broken template off of the PEGjs production stack by returning immediately // after `{{`, but we set the "preproc" reference parameter to false (the // reference parameter feature having been introduced for this sole purpose) // to indicate to the parent rule that we're "still in" the {{ context and // shouldn't ever inlineBreak for any closing tokens above this one. For // example: // [[Foo{{Bar]] // This will match as: // wikilink->text,template->text --> FAILS looking for }} // backtracks, popping "bracket_bracket" and "brace_brace" off preproc stack // wikilink->text,broken_template,text --> FAILS looking for ]] // backtracks, popping "bracket_bracket" and false off preproc stack // broken_wikilink,text,broken_template,text --> OK // with [false, false] left on the preproc stack broken_template = preproc:<&preproc> t:"{{" { $preproc = null; return $t; } template_preproc = "{{" leadWS:$( nl_comment_space* ) target:template_param_value params:( nl_comment_space* "|" r:( p0:("" { return $this->endOffset(); }) v:nl_comment_space* p:("" { return $this->endOffset(); }) &("|" / "}}") { // empty argument $tsr0 = new SourceRange( $p0, $p ); return new KV( '', TokenizerUtils::flattenIfArray( $v ), $tsr0->expandTsrV() ); } / template_param ) { return $r; } )* trailWS:$( nl_comment_space* ) inline_breaks "}}" { // Insert target as first positional attribute, so that it can be // generically expanded. The TemplateHandler then needs to shift it out // again. array_unshift( $params, new KV( TokenizerUtils::flattenIfArray( $target['tokens'] ), '', $target['srcOffsets']->expandTsrK() ) ); $obj = new SelfclosingTagTk( 'template', $params, (object)[ 'tsr' => $this->tsrOffsets(), 'src' => $this->text(), 'tmp' => (object)[ 'leadWS' => $leadWS, 'trailWS' => $trailWS ] ] ); return $obj; } / $('{{' space_or_newline* '}}') tplarg = tplarg_preproc<&preproc="}}"> tplarg_preproc = "{{{" p:("" { return $this->endOffset(); }) target:template_param_value? 
params:( nl_comment_space* "|" r:( p0:("" { return $this->endOffset(); }) v:nl_comment_space* p1:("" { return $this->endOffset(); }) &("|" / "}}}") { // empty argument return [ 'tokens' => $v, 'srcOffsets' => new SourceRange( $p0, $p1 ) ]; } / template_param_value ) { return $r; } )* nl_comment_space* inline_breaks "}}}" { $kvs = []; if ( $target === null ) { $target = [ 'tokens' => '', 'srcOffsets' => new SourceRange( $p, $p ) ]; } // Insert target as first positional attribute, so that it can be // generically expanded. The TemplateHandler then needs to shift it out // again. $kvs[] = new KV( TokenizerUtils::flattenIfArray( $target['tokens'] ), '', $target['srcOffsets']->expandTsrK() ); foreach ( $params as $o ) { $s = $o['srcOffsets']; $kvs[] = new KV( '', TokenizerUtils::flattenIfArray( $o['tokens'] ), $s->expandTsrV() ); } $obj = new SelfclosingTagTk( 'templatearg', $kvs, (object)[ 'tsr' => $this->tsrOffsets(), 'src' => $this->text() ] ); return $obj; } template_param = name:template_param_name val:( kEndPos:("" { return $this->endOffset(); }) optionalSpaceToken "=" vStartPos:("" { return $this->endOffset(); }) optionalSpaceToken tpv:template_param_value? { return [ 'kEndPos' => $kEndPos, 'vStartPos' => $vStartPos, 'value' => $tpv ? $tpv['tokens'] : [] ]; } )? 
{
	if ( $val !== null ) {
		if ( $val['value'] !== null ) {
			// Key and value both present: record key start/end and
			// value start/end offsets on the KV.
			$so = new KVSourceRange(
				$this->startOffset(), $val['kEndPos'],
				$val['vStartPos'], $this->endOffset()
			);
			return new KV(
				$name,
				TokenizerUtils::flattenIfArray( $val['value'] ),
				$so
			);
		} else {
			// FIX: $so was read here without ever being assigned in this
			// branch (undefined-variable notice / null source range at
			// runtime). Build it the same way as the branch above.
			$so = new KVSourceRange(
				$this->startOffset(), $val['kEndPos'],
				$val['vStartPos'], $this->endOffset()
			);
			return new KV( TokenizerUtils::flattenIfArray( $name ), '', $so );
		}
	} else {
		// No '=' was seen: the entire match is a positional value.
		$so = new SourceRange( $this->startOffset(), $this->endOffset() );
		return new KV( '', TokenizerUtils::flattenIfArray( $name ), $so->expandTsrV() );
	}
}
// empty parameter
/ & [|}] {
	$so = new SourceRange( $this->startOffset(), $this->endOffset() );
	return new KV( '', '', $so->expandTsrV() );
}

template_param_name = template_param_text
	/ (&'=' { return ''; })

template_param_value = tpt:template_param_text {
	return [ 'tokens' => $tpt, 'srcOffsets' => $this->tsrOffsets() ];
}

template_param_text = il:(nested_block / newlineToken)+ {
	// il is guaranteed to be an array -- so, tu.flattenIfArray will
	// always return an array
	$r = TokenizerUtils::flattenIfArray( $il );
	if ( count( $r ) === 1 && is_string( $r[0] ) ) {
		$r = $r[0];
	}
	return $r;
}

//// Language converter block markup of language variants: -{ ... }-
// Note that "rightmost opening" precedence rule (see
// https://www.mediawiki.org/wiki/Preprocessor_ABNF ) means
// that neither -{{ nor -{{{ are parsed as a -{ token, although
// -{{{{ is (since {{{ has precedence over {{).
lang_variant_or_tpl =
	&('-{' &('{{{'+ !'{') tplarg) a:lang_variant { return $a; }
	/ a:$('-' &('{{{'+ !'{')) b:tplarg { return [ $a, $b ]; }
	/ a:$('-' &('{{' '{{{'* !'{')) b:template { return [ $a, $b ]; }
	/ &'-{' a:lang_variant { return $a; }

broken_lang_variant = r:"-{" preproc:<&preproc> {
	// Clear the preproc reference param: we are "still inside" the -{
	// context, so no closing token above this one should inlineBreak.
	$preproc = null;
	return $r;
}

lang_variant = lang_variant_preproc<&preproc="}-"> / broken_lang_variant

lang_variant_preproc = lv0:("-{" { return $this->startOffset(); })
	f:(
		&{ return $this->env->langConverterEnabled(); }
		ff:opt_lang_variant_flags {
			// if flags contains 'R', then don't treat ; or : specially inside.
if ( isset( $ff['flags'] ) ) { $ff['raw'] = isset( $ff['flags']['R'] ) || isset( $ff['flags']['N'] ); } elseif ( isset( $ff['variants'] ) ) { $ff['raw'] = true; } return $ff; } / &{ return !$this->env->langConverterEnabled(); } "" { // if language converter not enabled, don't try to parse inside. return [ 'raw' => true ]; } ) ts:( &{ return $f['raw']; } lv:lang_variant_text { return [ [ 'text' => $lv ] ]; } / &{ return !$f['raw']; } lv:lang_variant_option_list { return $lv; } ) inline_breaks lv1:("}-" { return $this->endOffset(); }) { if ( !$this->env->langConverterEnabled() ) { return [ '-{', $ts[0]['text']['tokens'], '}-' ]; } $lvsrc = substr( $this->input, $lv0, $lv1 - $lv0 ); $attribs = []; foreach ( $ts as &$t ) { // move token strings into KV attributes so that they are // properly expanded by early stages of the token pipeline foreach ( [ 'text', 'from', 'to' ] as $fld ) { if ( !isset( $t[$fld] ) ) { continue; } $name = 'mw:lv' . count( $attribs ); // Note that AttributeExpander will expect the tokens array to be // flattened. We do that in lang_variant_text / lang_variant_nowiki $attribs[] = new KV( $name, $t[$fld]['tokens'], $t[$fld]['srcOffsets']->expandTsrV() ); $t[$fld] = $name; } } unset( $t ); $flags = isset( $f['flags'] ) ? array_keys( $f['flags'] ) : []; sort( $flags ); $variants = isset( $f['variants'] ) ? array_keys( $f['variants'] ) : []; sort( $variants ); return [ new SelfclosingTagTk( 'language-variant', $attribs, (object)[ 'tsr' => new SourceRange( $lv0, $lv1 ), 'src' => $lvsrc, 'flags' => $flags, 'variants' => $variants, 'original' => $f['original'], 'flagSp' => $f['sp'], 'texts' => $ts ] ) ]; } opt_lang_variant_flags = f:( ff:lang_variant_flags "|" { return $ff; } )? 
{ // Collect & separate flags and variants into a hashtable (by key) and ordered list $flags = []; $variants = []; $flagList = []; $flagSpace = []; $variantList = []; $variantSpace = []; $useVariants = false; if ( $f !== null ) { // lang_variant_flags returns arrays in reverse order. $spPtr = count( $f['sp'] ) - 1; for ( $i = count( $f['flags'] ) - 1; $i >= 0; $i--) { $item = $f['flags'][$i]; if ( isset( $item['flag'] ) ) { $flagSpace[] = $f['sp'][$spPtr--]; $flags[$item['flag']] = true; $flagList[] = $item['flag']; $flagSpace[] = $f['sp'][$spPtr--]; } if ( isset( $item['variant'] ) ) { $variantSpace[] = $f['sp'][$spPtr--]; $variants[$item['variant']] = true; $variantList[] = $item['variant']; $variantSpace[] = $f['sp'][$spPtr--]; } } if ( $spPtr >= 0 ) { // handle space after a trailing semicolon $flagSpace[] = $f['sp'][$spPtr]; $variantSpace[] = $f['sp'][$spPtr]; } } // Parse flags (this logic is from core/languages/ConverterRule.php // in the parseFlags() function) if ( count( $flags ) === 0 && count( $variants ) === 0 ) { $flags['$S'] = true; } elseif ( isset( $flags['R'] ) ) { $flags = [ 'R' => true ]; // remove other flags } elseif ( isset( $flags['N'] ) ) { $flags = [ 'N' => true ]; // remove other flags } elseif ( isset( $flags['-'] ) ) { $flags = [ '-' => true ]; // remove other flags } elseif ( isset( $flags['T'] ) && count( $flags ) === 1 ) { $flags['H'] = true; } elseif ( isset( $flags['H'] ) ) { // Replace A flag, and remove other flags except T and D $nf = [ '$+' => true, 'H' => true ]; if ( isset( $flags['T'] ) ) { $nf['T'] = true; } if ( isset( $flags['D'] ) ) { $nf['D'] = true; } $flags = $nf; } elseif ( count( $variants ) > 0 ) { $useVariants = true; } else { if ( isset( $flags['A'] ) ) { $flags['$+'] = true; $flags['$S'] = true; } if ( isset( $flags['D'] ) ) { unset( $flags['$S'] ); } } if ( $useVariants ) { return [ 'variants' => $variants, 'original' => $variantList, 'sp' => $variantSpace ]; } else { return [ 'flags' => $flags, 'original' => 
$flagList, 'sp' => $flagSpace ]; } } lang_variant_flags = sp1:$(space_or_newline*) f:lang_variant_flag sp2:$(space_or_newline*) more:( ";" lang_variant_flags? )? { $r = ( $more && $more[1] ) ? $more[1] : [ 'sp' => [], 'flags' => [] ]; // Note that sp and flags are in reverse order, since we're using // right recursion and want to push instead of unshift. $r['sp'][] = $sp2; $r['sp'][] = $sp1; $r['flags'][] = $f; return $r; } / sp:$(space_or_newline*) { return [ 'sp' => [ $sp ], 'flags' => [] ]; } lang_variant_flag = f:[-+A-Z] { return [ 'flag' => $f ]; } / v:lang_variant_name { return [ 'variant' => $v ]; } / b:$(!space_or_newline !nowiki [^{}|;])+ { return [ 'bogus' => $b ]; /* bad flag */} // language variant name, like zh, zh-cn, etc. lang_variant_name = $([a-z] [-a-zA-Z]+) // Escaped otherwise-unrepresentable language names // Primarily for supporting html2html round trips; PHP doesn't support // using nowikis here (yet!) / nowiki_text lang_variant_option_list = o:lang_variant_option rest:( ";" oo:lang_variant_option { return $oo; })* tr:( ";" $bogus_lang_variant_option )* // optional trailing crap { array_unshift( $rest, $o ); // if the last bogus option is just spaces, keep them; otherwise // drop all this bogus stuff on the ground if ( count($tr) > 0 ) { $last = $tr[count($tr)-1]; if (preg_match('/^\s*$/Du', $last[1])) { $rest[] = [ 'semi' => true, 'sp' => $last[1] ]; } } return $rest; } / lvtext:lang_variant_text { return [ [ 'text' => $lvtext ] ]; } bogus_lang_variant_option = lang_variant_text? 
// A single mapping inside a language-variant block: either a two-way
// "lang:text" rule or a one-way "from=>lang:text" rule.
lang_variant_option
	= sp1:$(space_or_newline*) lang:lang_variant_name
	sp2:$(space_or_newline*) ":"
	sp3:$(space_or_newline*)
	lvtext:(lang_variant_nowiki / lang_variant_text_no_semi) {
		// "lang : text" form: the text is used in both directions.
		return [ 'twoway' => true, 'lang' => $lang, 'text' => $lvtext,
			'sp' => [ $sp1, $sp2, $sp3 ] ];
	}
	/ sp1:$(space_or_newline*)
	from:(lang_variant_nowiki / lang_variant_text_no_semi_or_arrow)
	"=>" sp2:$(space_or_newline*)
	lang:lang_variant_name
	sp3:$(space_or_newline*) ":"
	sp4:$(space_or_newline*)
	to:(lang_variant_nowiki / lang_variant_text_no_semi) {
		// "from => lang : to" form: explicit one-way conversion.
		return [ 'oneway' => true, 'from' => $from, 'lang' => $lang,
			'to' => $to, 'sp' => [ $sp1, $sp2, $sp3, $sp4 ] ];
	}

// html2wt support: If a language name or conversion string can't be
// represented w/o breaking wikitext, just wrap it in a <nowiki>.
// PHP doesn't support this (yet), but Parsoid does.
lang_variant_nowiki = n:nowiki_text sp:$space_or_newline* {
	// Exclude the trailing whitespace from the recorded source range.
	$tsr = $this->tsrOffsets();
	$tsr->end -= strlen( $sp );
	return [
		'tokens' => [ $n ],
		'srcOffsets' => $tsr,
	];
}

// Flattened inline content (pipes allowed) plus its source range.
lang_variant_text = tokens:(inlineline / "|" )* {
	return [
		'tokens' => TokenizerUtils::flattenStringlist( $tokens ),
		'srcOffsets' => $this->tsrOffsets(),
	];
}

// NOTE(review): despite their names, these two aliases add no restriction
// as written -- rule parameters (e.g. <semi>) appear to have been stripped
// from this copy by markup mangling; confirm against upstream before
// relying on ';' / '=>' actually being excluded here.
lang_variant_text_no_semi = lang_variant_text

lang_variant_text_no_semi_or_arrow = lang_variant_text_no_semi

// Zero or more "|content" segments of a wikilink; each segment is
// returned as a 'mw:maybeContent' KV attribute carrying its raw source.
wikilink_content = (
	pipe startPos:("" { return $this->endOffset(); }) lt:link_text? {
		$tsr = new SourceRange( $startPos, $this->endOffset() );
		$maybeContent = new KV( 'mw:maybeContent', $lt ?? [], $tsr->expandTsrV() );
		$maybeContent->vsrc = substr( $this->input, $startPos, $this->endOffset() - $startPos );
		return $maybeContent;
	}
)*

wikilink = wikilink_preproc<&preproc="]]">
	/ broken_wikilink

// `broken-link` (see [[:mw:Preprocessor_ABNF]]), but careful because the
// second bracket could start an extlink. Set preproc to false as a reference
// parameter in the parent since we haven't seen a double-close bracket.
// (See full explanation above broken_template production.)
broken_wikilink = &"[[" preproc:<&preproc> &{ $preproc = null; return true; } a:("[" (extlink / "[")) { return $a; } wikilink_preproc = "[[" spos:("" { return $this->endOffset(); }) target:wikilink_preprocessor_text? tpos:("" { return $this->endOffset(); }) lcs:wikilink_content inline_breaks "]]" { $pipeTrick = count( $lcs ) === 1 && count( $lcs[0]->v ) === 0; $textTokens = []; if ( $target === null || $pipeTrick ) { $textTokens[] = '[['; if ( $target ) { $textTokens[] = $target; } foreach ( $lcs as $a ) { // a is a mw:maybeContent attribute $textTokens[] = '|'; if ( count( $a->v ) > 0 ) { $textTokens[] = $a->v; } } $textTokens[] = ']]'; return $textTokens; } $obj = new SelfclosingTagTk( 'wikilink' ); $tsr = new SourceRange( $spos, $tpos ); $hrefKV = new KV( 'href', $target, $tsr->expandTsrV() ); $hrefKV->vsrc = $tsr->substr( $this->input ); // XXX: Point to object with path, revision and input information // obj.source = input; $obj->attribs[] = $hrefKV; $obj->attribs = array_merge( $obj->attribs, $lcs ); $obj->dataAttribs = (object)[ 'tsr' => $this->tsrOffsets(), 'src' => $this->text() ]; return [ $obj ]; } // Tables are allowed inside image captions. // Suppress the equal flag temporarily in this rule to consume the '=' here. link_text = link_text_parameterized link_text_parameterized = c:( // This group is similar to "block_line" but "list_item" // is omitted since `doBlockLevels` happens after // `replaceInternalLinks2`, where newlines are stripped. (sol (heading / hr / full_table_in_link_caption)) / urltext / ( !inline_breaks r:( inline_element / '[' text_char+ ']' $(&(!']' / ']]')) / . ) { return $r; } ) )+ { return TokenizerUtils::flattenStringlist( $c ); } /* Generic quote rule for italic and bold, further processed in a token * stream transformation in doQuotes. Relies on NlTk tokens being emitted * for each line of text to balance quotes per line. 
* * We are not using a simple pair rule here as we need to support mis-nested * bolds/italics and MediaWiki's special heuristics for apostrophes, which are * all not context free. */ quote = quotes:$("''" "'"*) { // sequences of four or more than five quotes are assumed to start // with some number of plain-text apostrophes. $plainticks = 0; $result = []; if ( strlen( $quotes ) === 4 ) { $plainticks = 1; } elseif ( strlen( $quotes ) > 5 ) { $plainticks = strlen( $quotes ) - 5; } if ( $plainticks > 0 ) { $result[] = substr( $quotes, 0, $plainticks ); } // mw-quote token will be consumed in token transforms $tsr = $this->tsrOffsets(); $tsr->start += $plainticks; $mwq = new SelfclosingTagTk( 'mw-quote', [ new KV( 'value', substr( $quotes, $plainticks ) ) ], (object)[ 'tsr' => $tsr ] ); if ( strlen( $quotes ) > 2 ) { $mwq->addAttribute( 'isSpace_1', $tsr->start > 0 && substr( $this->input, $tsr->start - 1, 1 ) === ' '); $mwq->addAttribute( 'isSpace_2', $tsr->start > 1 && substr( $this->input, $tsr->start - 2, 1 ) === ' '); } $result[] = $mwq; return $result; } /*********************************************************** * Pre and xmlish tags ***********************************************************/ extension_tag = ! extToken:xmlish_tag // Account for `maybeExtensionTag` returning unmatched start / end tags &{ return $extToken[0]->getName() === 'extension'; } { return $extToken[0]; } nowiki = extToken:extension_tag &{ return $extToken->getAttribute( 'name' ) === 'nowiki'; } { return $extToken; } // Used by lang_variant productions to protect special language names or // conversion strings. nowiki_text = extToken:nowiki { $txt = Utils::extractExtBody( $extToken ); return Utils::decodeWtEntities( $txt ); } /* Generic XML-like tags * * These also cover extensions (including Cite), which will hook into the * token stream for further processing. 
The content of extension tags is
 * parsed as regular inline, but the source positions of the tag are added
 * to allow reconstructing the unparsed text from the input. */

// See http://www.w3.org/TR/html5/syntax.html#tag-open-state and
// following paragraphs.
tag_name_chars = [^\t\n\v />\0]
tag_name = $([A-Za-z] tag_name_chars*)

// This rule is used in carefully crafted places of xmlish tag tokenizing with
// the inclusion of solidus to match where the spec would ignore those
// characters. In particular, it does not belong in between attribute name
// and value.
space_or_newline_or_solidus
  = space_or_newline
  / (s:"/" !">" { return $s; })

// Tokenize an xmlish (HTML or extension) tag starting at "<".
// NOTE(review): both alternatives below are identical, so the second can
// never match; this looks like stripped wikipeg rule parameters (upstream
// passes different <isBlock>/<extTag> arguments to each alternative) --
// confirm against the original Grammar.pegphp.
xmlish_tag
  = "<" tag:(xmlish_tag_opened / xmlish_tag_opened) {
    return $tag;
  }

// The remainder of an xmlish tag after the initial "<": optional end-tag
// solidus, tag name, attributes, optional self-closing solidus, and ">".
// NOTE(review): the bare "extTag:"/"isBlock:" labels below have no matched
// expression; they look like stripped wikipeg parameter references
// (<extTag>, <isBlock>) -- confirm against the original grammar.
xmlish_tag_opened
  = end:"/"?
    name: tag_name
    extTag:
    isBlock:
    & {
        // Extension tags are validated by name alone; other xmlish tags are
        // checked against the known HTML tag set (and block-ness if required).
        if ( $extTag ) {
            return $this->isExtTag( $name );
        } else {
            return $this->isXMLTag( $name, $isBlock );
        }
    }
    // By the time we get to `doTableStuff` in the old parser, we've already
    // safely encoded element attributes. See 55313f4e in core.
    attribs:generic_newline_attributes
    space_or_newline_or_solidus* // No need to preserve this -- canonicalize on RT via dirty diff
    selfclose:"/"?
    space* // not preserved - canonicalized on RT via dirty diff
    ">" {
        $lcName = mb_strtolower( $name );

        // Extension tags don't necessarily have the same semantics as html tags,
        // so don't treat them as void elements.
        $isVoidElt = Utils::isVoidElement( $lcName ) && !$extTag;

        // Support end-tag syntax on br: a closing br is treated as an
        // opening one (the $end flag is dropped here).
        if ( $lcName === 'br' && $end ) {
            $end = null;
        }

        $tsr = $this->tsrOffsets();
        $tsr->start--; // For "<" matched at the start of xmlish_tag rule

        $res = TokenizerUtils::buildXMLTag(
            $name, $lcName, $attribs, $end, !!$selfclose || $isVoidElt, $tsr
        );

        // change up data-attribs in one scenario
        // void-elts that aren't self-closed ==> useful for accurate RT-ing
        if ( !$selfclose && $isVoidElt ) {
            unset( $res->dataAttribs->selfClose );
            $res->dataAttribs->noClose = true;
        }

        // maybeExtensionTag may expand the token into several tokens;
        // normalize the result to an array either way.
        $met = $this->maybeExtensionTag( $res );
        return ( is_array( $met ) ) ? $met : [ $met ];
    }

/*
 * A variant of xmlish_tag, but also checks if the tag name is a block-level
 * tag as defined in
 * http://www.w3.org/TR/html5/syntax.html#tag-open-state and
 * following paragraphs.
 */
// NOTE(review): as with xmlish_tag above, the two identical alternatives are
// suspicious -- upstream parameterizes them with <isBlock=true> -- confirm.
block_tag
  = "<" tag:(xmlish_tag_opened / xmlish_tag_opened) {
    return $tag;
  }

// A generic attribute that can span multiple lines.
// Returns a KV with key/value source ranges recorded for round-tripping.
generic_newline_attribute
  = space_or_newline_or_solidus*
    namePos0:("" { return $this->endOffset(); })
    name:generic_attribute_name
    namePos:("" { return $this->endOffset(); })
    vd:(space_or_newline* "=" v:generic_att_value? { return $v; })? {
        // NB: Keep in sync w/ table_attibute
        $res = null;
        // Encapsulate protected attributes.
        if ( is_string( $name ) ) {
            $name = TokenizerUtils::protectAttrs( $name );
        }
        $nameSO = new SourceRange( $namePos0, $namePos );
        if ( $vd !== null ) {
            $res = new KV( $name, $vd['value'], $nameSO->join( $vd['srcOffsets'] ) );
            $res->vsrc = $vd['srcOffsets']->substr( $this->input );
        } else {
            $res = new KV( $name, '', $nameSO->expandTsrK() );
        }
        if ( is_array( $name ) ) {
            // Name contained tokens (not a plain string); record raw source.
            $res->ksrc = $nameSO->substr( $this->input );
        }
        return $res;
    }

// A single-line attribute.
table_attribute
  = s:optionalSpaceToken
    namePos0:("" { return $this->endOffset(); })
    name:table_attribute_name
    namePos:("" { return $this->endOffset(); })
    vd:(optionalSpaceToken "=" v:table_att_value? { return $v; })? {
        // NB: Keep in sync w/ generic_newline_attribute
        $res = null;
        // Encapsulate protected attributes.
        if ( gettype( $name ) === 'string' ) {
            $name = TokenizerUtils::protectAttrs( $name );
        }
        $nameSO = new SourceRange( $namePos0, $namePos );
        if ( $vd !== null ) {
            $res = new KV( $name, $vd['value'], $nameSO->join( $vd['srcOffsets'] ) );
            $res->vsrc = $vd['srcOffsets']->substr( $this->input );
        } else {
            $res = new KV( $name, '', $nameSO->expandTsrK() );
        }
        if ( is_array( $name ) ) {
            // Name contained tokens (not a plain string); record raw source.
            $res->ksrc = $nameSO->substr( $this->input );
        }
        return $res;
    }

// The old parser's Sanitizer::removeHTMLtags explodes on < so that it can't
// be found anywhere in xmlish tags. This is a divergence from html5 tokenizing
// which happily permits it in attribute positions. Extension tags being the
// exception, since they're stripped beforehand.
// NOTE(review): as written, this is a zero-width lookahead stringified to ''.
// Upstream appears to guard with a (stripped) <extTag> parameter predicate
// before consuming the "<" -- confirm against the original grammar.
less_than = $( & "<" )

// The arrangement of chars is to emphasize the split between what's disallowed
// by html5 and what's necessary to give directive a chance.
// See: http://www.w3.org/TR/html5/syntax.html#attributes-0
generic_attribute_name
  = q:$(["'=]?) // From #before-attribute-name-state, < is omitted for directive
    r:( $[^ \t\r\n\0/=><&{}\-!|]+
      / !inline_breaks
        // \0/=> is the html5 attribute name set we do not want.
        t:( directive / less_than / $( !( space_or_newline / [\0/=><] ) . ) ) {
            return $t;
        }
    )*
    // Require at least one char of name (possibly just the quote/= prefix).
    & { return count( $r ) > 0 || $q !== ''; } {
        array_unshift( $r, $q );
        return TokenizerUtils::flattenString( $r );
    }

// Also accept these chars in a wikitext table or tr attribute name position.
// They are normally not matched by the table_attribute_name.
broken_table_attribute_name_char = c:[\0/=>] { return new KV( $c, '' ); }

// Same as generic_attribute_name, except for accepting tags and wikilinks.
// (That doesn't make sense (ie. match the old parser) in the generic case.)
// We also give a chance to break on \[ (see T2553).
table_attribute_name
  = q:$(["'=]?) // From #before-attribute-name-state, < is omitted for directive
    r:( $[^ \t\r\n\0/=><&{}\-!|\[]+
      / !inline_breaks
        // \0/=> is the html5 attribute name set we do not want.
        t:( $wikilink
          / directive
          // Accept insane tags-inside-attributes as attribute names.
          // The sanitizer will strip and shadow them for roundtripping.
          // Example: generated with..
          // NOTE(review): the example markup in the comment above appears to
          // have been stripped along with other tag-like text -- confirm.
          / &xmlish_tag ill:inlineline { return $ill; }
          / $( !( space_or_newline / [\0/=>] ) . )
        ) {
            return $t;
        }
    )*
    // Require at least one char of name (possibly just the quote/= prefix).
    & { return count( $r ) > 0 || $q !== ''; } {
        array_unshift( $r, $q );
        return TokenizerUtils::flattenString( $r );
    }

// Attribute value, quoted variants can span multiple lines.
// Missing end quote: accept /> look-ahead as heuristic.
// These need to be kept in sync with the attribute_preprocessor_text_*
generic_att_value
  = s:$(space_or_newline* "'") t:attribute_preprocessor_text_single? q:$("'" / &('/'? '>')) {
        return TokenizerUtils::getAttrVal(
            $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q )
        );
    }
  / s:$(space_or_newline* '"') t:attribute_preprocessor_text_double? q:$('"' / &('/'? '>')) {
        return TokenizerUtils::getAttrVal(
            $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q )
        );
    }
  / s:$space_or_newline* t:attribute_preprocessor_text &(space_or_newline / eof / '/'? '>') {
        return TokenizerUtils::getAttrVal(
            $t, $this->startOffset() + strlen( $s ), $this->endOffset()
        );
    }

// Attribute value, restricted to a single line.
// Missing end quote: accept |, !!, \r, and \n look-ahead as heuristic.
// These need to be kept in sync with the table_attribute_preprocessor_text_*
table_att_value
  = s:$(space* "'") t:table_attribute_preprocessor_text_single? q:$("'" / &('!!' / [|\r\n])) {
        return TokenizerUtils::getAttrVal(
            $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q )
        );
    }
  / s:$(space* '"') t:table_attribute_preprocessor_text_double? q:$('"' / &('!!' / [|\r\n])) {
        return TokenizerUtils::getAttrVal(
            $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q )
        );
    }
  / s:$space* t:table_attribute_preprocessor_text &(space_or_newline/ eof / '!!' / '|') {
        return TokenizerUtils::getAttrVal(
            $t, $this->startOffset() + strlen( $s ), $this->endOffset()
        );
    }

/*********************************************************
 * Lists
 *********************************************************/
list_item = dtdd / hacky_dl_uses / li

li
  = bullets:list_char+
    c:inlineline?
    // The inline_break is to check if we've hit a template end delimiter.
    &(eolf / inline_breaks) {
        // Leave bullets as an array -- list handler expects this
        $tsr = $this->tsrOffsets( 'start' );
        $tsr->end += count( $bullets );
        $li = new TagTk( 'listItem',
            [ new KV( 'bullets', $bullets, $tsr->expandTsrV() ) ],
            (object)[ 'tsr' => $tsr ] );
        return array_merge( [ $li ], $c ?: [] );
    }

/*
 * This rule is required to support wikitext of this form
 * ::{|border="1"|foo|bar|baz|}
 * where the leading colons are used to indent the entire table.
 * This hack was added back in 2006 in commit
 * a0746946312b0f1eda30a2c793f5f7052e8e5f3a based on a patch by Carl
 * Fürstenberg.
 */
hacky_dl_uses
  = bullets:":"+
    tbl:(table_line (sol table_line)*)
    line:inlineline?
    &comment_space_eolf {
        // Leave bullets as an array -- list handler expects this
        $tsr = $this->tsrOffsets( 'start' );
        $tsr->end += count( $bullets );
        $li = new TagTk( 'listItem',
            [ new KV( 'bullets', $bullets, $tsr->expandTsrV() ) ],
            (object)[ 'tsr' => $tsr ] );
        return TokenizerUtils::flattenIfArray( [ $li, $tbl, $line ?: [] ] );
    }

// A ";term : definition" line: emits two listItem tokens, one for the
// dt (';' bullet) and one for the dd (':' bullet, stx=row).
dtdd
  = bullets:(!(";" !list_char) lc:list_char { return $lc; })*
    ";"
    c:inlineline_break_on_colon?
    cpos:(":" { return $this->endOffset(); })
    d:inlineline?
    &eolf {
        // Leave bullets as an array -- list handler expects this
        // TSR: +1 for the leading ";"
        $numBullets = count( $bullets ) + 1;
        $tsr = $this->tsrOffsets( 'start' );
        $tsr->end += $numBullets;
        $li1Bullets = $bullets;
        $li1Bullets[] = ';';
        $li1 = new TagTk( 'listItem',
            [ new KV( 'bullets', $li1Bullets, $tsr->expandTsrV() ) ],
            (object)[ 'tsr' => $tsr ] );
        // TSR: -1 for the intermediate ":"
        $li2Bullets = $bullets;
        $li2Bullets[] = ':';
        $tsr2 = new SourceRange( $cpos - 1, $cpos );
        $li2 = new TagTk( 'listItem',
            [ new KV( 'bullets', $li2Bullets, $tsr2->expandTsrV() ) ],
            (object)[ 'tsr' => $tsr2, 'stx' => 'row' ] );
        return array_merge( [ $li1 ], $c ?: [], [ $li2 ], $d ?: [] );
    }

list_char = [*#:;]

// NOTE(review): upstream parameterizes this rule (inlineline<colon>) so that
// inline content breaks on ":"; the parameter appears stripped -- confirm.
inlineline_break_on_colon = inlineline

/******************************************************************************
 * Tables
 * ------
 * Table rules are geared to support independent parsing of fragments in
 * templates (the common table start / row / table end use case). The tokens
 * produced by these fragments then match up to a table while building the
 * DOM tree. For similar reasons, table rows do not emit explicit end tag
 * tokens.
 *
 * The separate table_line rule is faster than moving those rules
 * directly to block_lines.
 *
 * Notes about the full_table_in_link_caption rule
 * -----------------------------------------------------
 * However, for link-tables, we have introduced a stricter parse wherein
 * we require table-start and table-end tags to not come from a template.
 * In addition, this new rule doesn't accept fosterable-content in
 * the table unlike the more lax (sol table_line)+ rule.
 *
 * This is the best we can do at this time since we cannot distinguish
 * between table rows and image options entirely in the tokenizer.
 *
 * Consider the following examples:
 *
 * Example 1:
 *
 * [[Image:Foo.jpg|left|30px|Example 1
 * {{This-template-returns-a-table-start-tag}}
 * |foo
 * {{This-template-returns-a-table-end-tag}}
 * ]]
 *
 * Example 2:
 *
 * [[Image:Foo.jpg|left|30px|Example 1
 * {{1x|a}}
 * |foo
 * {{1x|b}}
 * ]]
 *
 * So, we cannot know a priori (without preprocessing or fully expanding
 * all templates) if "|foo" in the two examples is a table cell or an image
 * option. This is a limitation of our tokenizer-based approach compared to
 * the preprocessing-based approach of the old parser.
 *
 * Given this limitation, we are okay forcing a full-table context in
 * link captions (if necessary, we can relax the fosterable-content requirement
 * but that is broken wikitext anyway, so we can force that edge-case wikitext
 * to get fixed by rejecting it).
 ******************************************************************************/

full_table_in_link_caption
  = (! inline_breaks / & '{{!}}' )
    // Note that "linkdesc" is suppressed here to provide a nested parsing
    // context in which to parse the table. Otherwise, we may break on
    // on pipes in the `table_start_tag` and `table_row_tag` attributes.
    // However, as a result, this can be more permissive than the old
    // implementation, but likelier to match the users intent.
    r: full_table_in_link_caption_parameterized {
        return $r;
    }

full_table_in_link_caption_parameterized
  = table_start_tag optionalNewlines
    // Accept multiple end tags since a nested table may have been
    // opened in the table content line.
    (
        (sol (table_content_line / tplarg_or_template) optionalNewlines)*
        sol table_end_tag
    )+

// This rule assumes start-of-line position!
table_line
  = (! inline_breaks / & '{{!}}' )
    tl:(
        table_start_tag optionalNewlines
      / table_content_line optionalNewlines
      / table_end_tag
    ) {
        return $tl;
    }

table_content_line
  = (space / comment)* (
        table_heading_tags
      / table_row_tag
      / table_data_tags
      / table_caption_tag
    )

table_start_tag "table_start_tag"
  = sc:(space / comment)*
    startPos:("" { return $this->endOffset(); })
    b:"{" p:pipe
    // ok to normalize away stray |} on rt (see T59360)
    ta:(table_attributes / &{ $this->unreachable(); })
    tsEndPos:("" { return $this->endOffset(); })
    s2:space* {
        // Trailing comments are popped off the attribute list and re-emitted
        // after the table token; the tsr then ends where the comments began.
        $coms = TokenizerUtils::popComments( $ta );
        if ( $coms ) {
            $tsEndPos = $coms['commentStartPos'];
        }
        $da = (object)[ 'tsr' => new SourceRange( $startPos, $tsEndPos ) ];
        if ( $p !== '|' ) {
            // Variation from default
            $da->startTagSrc = $b . $p;
        }
        return array_merge(
            $sc,
            [ new TagTk( 'table', $ta, $da ) ],
            $coms ? $coms['buf'] : [],
            $s2 );
    }

// FIXME: Not sure if we want to support it, but this should allow columns.
// NOTE(review): the bare "!" below originally negated a (stripped) wikipeg
// parameter reference to avoid recursion via nested_block_in_table -- confirm.
table_caption_tag
  = // avoid recursion via nested_block_in_table
    ! p:pipe "+"
    args:row_syntax_table_args?
    tagEndPos:("" { return $this->endOffset(); })
    c:nested_block_in_table* {
        $tsr = new SourceRange( $this->startOffset(), $tagEndPos );
        return TokenizerUtils::buildTableTokens(
            'caption', '|+', $args, $tsr, $this->endOffset(), $c, true );
    }

// NOTE(review): same stripped-parameter concern on the bare "!" as above.
table_row_tag
  = // avoid recursion via nested_block_in_table
    ! p:pipe dashes:$"-"+
    a:(table_attributes / &{ $this->unreachable(); })
    tagEndPos:("" { return $this->endOffset(); }) {
        $coms = TokenizerUtils::popComments( $a );
        if ( $coms ) {
            $tagEndPos = $coms['commentStartPos'];
        }
        $da = (object)[
            'tsr' => new SourceRange( $this->startOffset(), $tagEndPos ),
            'startTagSrc' => $p . $dashes
        ];
        // We rely on our tree builder to close the row as needed. This is
        // needed to support building tables from fragment templates with
        // individual cells or rows.
        $trToken = new TagTk( 'tr', $a, $da );
        return array_merge( [ $trToken ], $coms ? $coms['buf'] : [] );
    }

// Zero or more additional cells on a data row ("||" or "|"-separated).
tds
  = (
        pp:( pipe_pipe / p:pipe & row_syntax_table_args { return $p; } )
        tdt:table_data_tag {
            // Avoid modifying cached dataAttribs object
            $tdt[0] = clone $tdt[0];
            $da = $tdt[0]->dataAttribs = clone $tdt[0]->dataAttribs;
            $da->tsr = clone $da->tsr;
            $da->stx = 'row';
            $da->tsr->start -= strlen( $pp ); // include "||"
            if ( $pp !== '||' || ( isset( $da->startTagSrc ) && $da->startTagSrc !== $pp ) ) {
                // Variation from default
                $da->startTagSrc = $pp .
                    ( isset( $da->startTagSrc ) ? substr( $da->startTagSrc, 1 ) : '' );
            }
            return $tdt;
        }
    )*

// NOTE(review): same stripped-parameter concern on the bare "!" as above.
table_data_tags
  = // avoid recursion via nested_block_in_table
    ! p:pipe
    ![+-]
    td:table_data_tag
    tagEndPos:("" { return $this->endOffset(); })
    tds:tds {
        // Avoid modifying a cached result
        $td[0] = clone $td[0];
        $da = $td[0]->dataAttribs = clone $td[0]->dataAttribs;
        $da->tsr = clone $da->tsr;
        $da->tsr->start -= strlen( $p ); // include "|"
        if ( $p !== '|' ) {
            // Variation from default
            $da->startTagSrc = $p;
        }
        return array_merge( $td, $tds );
    }

table_data_tag
  = ! "}"
    arg:row_syntax_table_args?
    // use inline_breaks to break on tr etc
    tagEndPos:("" { return $this->endOffset(); })
    td:nested_block_in_table* {
        $tsr = new SourceRange( $this->startOffset(), $tagEndPos );
        return TokenizerUtils::buildTableTokens(
            'td', '|', $arg, $tsr, $this->endOffset(), $td );
    }

table_heading_tags = table_heading_tags_parameterized<&th>

table_heading_tags_parameterized
  = "!"
    thTag:table_heading_tag
    thTags:( pp:("!!" / pipe_pipe) tht:table_heading_tag {
        // Avoid modifying a cached result
        $tht[0] = clone $tht[0];
        $da = $tht[0]->dataAttribs = clone $tht[0]->dataAttribs;
        $da->tsr = clone $da->tsr;
        $da->stx = 'row';
        $da->tsr->start -= strlen( $pp ); // include "!!" or "||"
        if ( $pp !== '!!' || ( isset( $da->startTagSrc ) && $da->startTagSrc !== $pp ) ) {
            // Variation from default
            $da->startTagSrc = $pp .
                ( isset( $da->startTagSrc ) ? substr( $da->startTagSrc, 1 ) : '' );
        }
        return $tht;
    } )* {
        $thTag[0] = clone $thTag[0];
        $da = $thTag[0]->dataAttribs = clone $thTag[0]->dataAttribs;
        $da->tsr = clone $da->tsr;
        $da->tsr->start--; // include "!"
        array_unshift( $thTags, $thTag );
        return $thTags;
    }

table_heading_tag
  = arg:row_syntax_table_args?
    tagEndPos:("" { return $this->endOffset(); })
    c:( th:<&th> d:nested_block_in_table {
        if ( $th !== false && strpos( $this->text(), "\n" ) !== false ) {
            // There's been a newline. Remove the break and continue
            // tokenizing nested_block_in_tables.
            $th = false;
        }
        return $d;
    } )* {
        $tsr = new SourceRange( $this->startOffset(), $tagEndPos );
        return TokenizerUtils::buildTableTokens(
            'th', '!', $arg, $tsr, $this->endOffset(), $c );
    }

table_end_tag
  = sc:(space / comment)*
    startPos:("" { return $this->endOffset(); })
    p:pipe b:"}" {
        $tblEnd = new EndTagTk( 'table', [], (object)[
            'tsr' => new SourceRange( $startPos, $this->endOffset() ),
        ] );
        if ( $p !== '|' ) {
            // p+"" is triggering some bug in pegJS
            // I cannot even use that expression in the comment!
            $tblEnd->dataAttribs->endTagSrc = $p . $b;
        }
        array_push( $sc, $tblEnd );
        return $sc;
    }

/**
 * Table parameters separated from the content by a single pipe. Does *not*
 * match if followed by double pipe (row-based syntax).
 */
row_syntax_table_args
  = as:table_attributes s:optional_spaces p:pipe !pipe {
        return [ $as, $s, $p ];
    }

/*******************************************************************
 * Text variants and other general rules
 *******************************************************************/

/* All chars that cannot start syntactic structures in the middle of a line
 * XXX: ] and other end delimiters should probably only be activated inside
 * structures to avoid unnecessarily leaving the text rule on plain
 * content.
 *
 * TODO: Much of this is should really be context-dependent (syntactic
 * flags). The wikilink_preprocessor_text rule is an example where
 * text_char is not quite right and had to be augmented. Try to minimize /
 * clarify this carefully!
 *
 * This character class is inlined into urltext. Changes here may also need to
 * be reflected there.
 */
text_char = [^-'<[{\n\r:;\]}|!=]

/* Legend
 * ' quotes (italic/bold)
 * < start of xmlish_tag
 * [ start of links
 * { start of parser functions, transclusion and template args
 * \n all sort of block-level markup at start of line
 * \r ditto
 * A-Za-z autolinks (http(s), nttp(s), mailto, ISBN, PMID, RFC)
 *
 * _ behavior switches (e.g., '__NOTOC__') (XXX: not URL related)
 * ! and | table cell delimiters, might be better to specialize those
 * = headings - also specialize those!
 *
 * The following chars are also included for now, but only apply in some
 * contexts and should probably be enabled only in those:
 * : separate definition in ; term : definition
 * ] end of link
 * } end of parser func/transclusion/template arg
 * - start of lang_variant -{ ... }-
 * ; separator in lang_variant
 */
urltext
  = ( & [A-Za-z] al:autolink { return $al; }
    / & "&" he:htmlentity { return $he; }
    / & ('__') bs:behavior_switch { return $bs; }
    // About 96% of text_char calls originated here, so inline it for efficiency
    / [^-'<[{\n\r:;\]}|!=]
  )+

raw_htmlentity
  = m:$("&" [#0-9a-zA-Z]+ ";") {
        return Utils::decodeWtEntities( $m );
    }

htmlentity
  = cc:raw_htmlentity {
        // if this is an invalid entity, don't tag it with 'mw:Entity'
        if ( mb_strlen( $cc ) > 1 /* decoded entity would be 1 character */ ) {
            return $cc;
        }
        return [
            // If this changes, the nowiki extension's toDOM will need to follow suit
            new TagTk( 'span',
                [ new KV( 'typeof', 'mw:Entity' ) ],
                (object)[ 'src' => $this->text(), 'srcContent' => $cc, 'tsr' => $this->tsrOffsets( 'start' ) ] ),
            $cc,
            new EndTagTk( 'span', [], (object)[ 'tsr' => $this->tsrOffsets( 'end' ) ] )
        ];
    }

spaces = $[ \t]+
optional_spaces = $[ \t]*
space = [ \t]

optionalSpaceToken
  = s:optional_spaces {
        if ( $s !== '' ) {
            return [ $s ];
        } else {
            return [];
        }
    }

/* This rule corresponds to \s in the PHP preg_* functions,
 * which is used frequently in the old parser. The inclusion of
 * form feed (but not other whitespace, like vertical tab) is a quirk
 * of Perl, which PHP inherited via the PCRE (Perl-Compatible Regular
 * Expressions) library.
 */
space_or_newline = [ \t\n\r\x0c]

/* This rule corresponds to \b in the PHP preg_* functions,
 * after a word character. That is, it's a zero-width lookahead that
 * the next character is not a word character.
 */
end_of_word = eof / ![A-Za-z0-9_]

// Unicode "separator, space" category. It covers the \u0020 space as well
// as \u3000 IDEOGRAPHIC SPACE (see bug 19052). In PHP this is \p{Zs}.
// Keep this up-to-date with the characters tagged ;Zs; in
// http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
unispace = [ \u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]

// Non-newline whitespace, including non-breaking spaces. Used for magic links.
space_or_nbsp
  = space // includes \t
  / unispace
  / & "&" he:htmlentity &{ return is_array( $he ) && $he[ 1 ] === "\u{A0}"; } {
        return $he;
    }

// Used within ISBN magic links
space_or_nbsp_or_dash = space_or_nbsp / "-"

// Extra newlines followed by at least another newline. Usually used to
// compress surplus newlines into a meta tag, so that they don't trigger
// paragraphs.
optionalNewlines
  = spc:$([\n\r\t ] &[\n\r])* {
        if ( strlen( $spc ) ) {
            return [ $spc ];
        } else {
            return [];
        }
    }

comment_or_includes = (comment / include_limits)*

// Start-of-line context: either an empty line (with optional comments) or a
// plain sol prefix, followed by any comments / include-limit tags.
sol = (empty_line_with_comments / sol_prefix) comment_or_includes

sol_prefix
  = newlineToken
  / & {
        // Use the sol flag only at the start of the input
        // Flag should always be an actual boolean (not falsy or undefined)
        $this->assert( is_bool( $this->options['sol'] ), 'sol should be boolean' );
        return $this->endOffset() === 0 && $this->options['sol'];
    } {
        return [];
    }

// A line that contains only comments (and whitespace); emitted as a
// mw:EmptyLine meta token so it doesn't trigger a paragraph.
empty_line_with_comments
  = sp:sol_prefix
    p:("" { return $this->endOffset(); })
    c:(space* comment (space / comment)* newline)+ {
        return [
            $sp,
            new SelfclosingTagTk( 'meta',
                [ new KV( 'typeof', 'mw:EmptyLine' ) ],
                (object)[
                    'tokens' => TokenizerUtils::flattenIfArray( $c ),
                    'tsr' => new SourceRange( $p, $this->endOffset() ),
                ] )
        ];
    }

comment_space = comment / space
nl_comment_space = newlineToken / comment_space

/**
 * noinclude / includeonly / onlyinclude rules. These are normally
 * handled by the xmlish_tag rule, except where generic tags are not
 * allowed- for example in directives, which are allowed in various attribute
 * names and -values.
 *
 * Example test case:
 * {|
 * |-
 * foo
 *
 * |Hello
 * |}
 *
 * NOTE(review): the include tags that originally wrapped parts of this
 * example appear to have been stripped from the comment -- confirm against
 * the original grammar.
 */
// NOTE(review): the dangling "sol_il:" label below looks like a stripped
// wikipeg parameter reference (<sol_il>) -- confirm.
include_limits
  = & ("<" "/"? n:("includeonly"i / "noinclude"i / "onlyinclude"i ) )
    il:xmlish_tag
    sol_il:
    & {
        $il = $il[0];
        $lname = mb_strtolower( $il->getName() );
        if ( !TokenizerUtils::isIncludeTag( $lname ) ) {
            return false;
        }
        // Preserve SOL where necessary (for onlyinclude and noinclude)
        // Note that this only works because we encounter <*include*> tags in
        // the toplevel content and we rely on the php preprocessor to expand
        // templates, so we shouldn't ever be tokenizing inInclude.
        // Last line should be empty (except for comments)
        if ( $lname !== 'includeonly' && $sol_il && $il instanceof TagTk ) {
            $dp = $il->dataAttribs;
            $inclContent = $dp->extTagOffsets->stripTags( $dp->src );
            $nlpos = strrpos( $inclContent, "\n" );
            $last = $nlpos === false ? $inclContent : substr( $inclContent, $nlpos + 1 );
            // Accept a (possibly empty) run of HTML comments on the last
            // line.  The previous pattern here, '/^()*$/D', had the
            // comment-matching group stripped out of it and so matched only
            // a completely empty last line, rejecting SOL preservation
            // whenever trailing comments were present.
            if ( !preg_match( '/^(<!--([^-]|-(?!->))*-->)*$/D', $last ) ) {
                return false;
            }
        }
        return true;
    } {
        return $il;
    }

// Start of file
sof = & { return $this->endOffset() === 0 && !$this->pipelineOffset; }

// End of file
eof = & { return $this->endOffset() === $this->inputLength; }

newline = '\n' / '\r\n'
newlineToken = newline { return [ new NlTk( $this->tsrOffsets() ) ]; }
eolf = newline / eof
comment_space_eolf = (space+ / comment)* eolf

// 'Preprocessor' directive- higher-level things that can occur in otherwise
// plain-text content.
directive
  = comment
  / extension_tag
  / tplarg_or_template
  / & "-{" v:lang_variant_or_tpl { return $v; }
  / & "&" e:htmlentity { return $e; }
  / include_limits

wikilink_preprocessor_text
  = r:( t:$[^<[{\n\r\t|!\]}{ &\-]+
      // XXX gwicke: any more chars we need to allow here?
      / !inline_breaks wr:( directive / $( !"]]" ( text_char / [!<\-\}\]\n\r] ) ) ) {
            return $wr;
        }
    )+ {
        return TokenizerUtils::flattenStringlist( $r );
    }

// added special separator character class inline: separates url from
// description / text
extlink_nonipv6url
  = // Prevent breaking on pipes when we're in a link description.
    // See the test, 'Images with the "|" character in the comment'.
    extlink_nonipv6url_parameterized

extlink_nonipv6url_parameterized
  = r:(
        $[^<[{\n\r|!\]}\-\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+
      / !inline_breaks s:( directive / [&|{\-!}=] ) { return $s; }
      / $(['] ![']) // single quotes are ok, double quotes are bad
    )+ {
        return TokenizerUtils::flattenString( $r );
    }

// Attribute values with preprocessor support
// n.b. / is a permissible char in the three rules below.
// We only break on />, enforced by the negated expression.
// Hence, it isn't included in the stop set.
// The stop set is space_or_newline and > which matches generic_att_value.
// Unquoted attribute value text (multi-line variant); stops before whitespace
// and ">", but allows templates/comments/entities via `directive`.
attribute_preprocessor_text
  = r:( $[^{}&<\-|/ \t\n\r\x0c>]+
      / !inline_breaks !'/>' s:( directive / less_than / [{}&\-|/] ) { return $s; }
    )+ {
        return TokenizerUtils::flattenString( $r );
    }

// The stop set is '> which matches generic_att_value.
attribute_preprocessor_text_single
  = r:( $[^{}&<\-|/'>]+
      / !inline_breaks !'/>' s:( directive / less_than / [{}&\-|/] ) { return $s; }
    )* {
        return TokenizerUtils::flattenString( $r );
    }

// The stop set is "> which matches generic_att_value.
attribute_preprocessor_text_double
  = r:( $[^{}&<\-|/">]+
      / !inline_breaks !'/>' s:( directive / less_than / [{}&\-|/] ) { return $s; }
    )* {
        return TokenizerUtils::flattenString( $r );
    }

// Variants with the entire attribute on a single line
// n.b. ! is a permissible char in the three rules below.
// We only break on !! in th, enforced by the inline break.
// Hence, it isn't included in the stop set.
// [ is also permissible but we give a chance to break
// for the [[ special case in the old parser's doTableStuff (See T2553).
// The stop set is space_or_newline and | which matches table_att_value.
table_attribute_preprocessor_text
  = r:( $[^{}&<\-!\[ \t\n\r\x0c|]+
      / !inline_breaks s:( directive / [{}&<\-!\[] ) { return $s; }
    )+ {
        return TokenizerUtils::flattenString( $r );
    }

// The stop set is '\r\n| which matches table_att_value.
table_attribute_preprocessor_text_single
  = r:( $[^{}&<\-!\['\r\n|]+
      / !inline_breaks s:( directive / [{}&<\-!\[] ) { return $s; }
    )* {
        return TokenizerUtils::flattenString( $r );
    }

// The stop set is "\r\n| which matches table_att_value.
table_attribute_preprocessor_text_double
  = r:( $[^{}&<\-!\["\r\n|]+
      / !inline_breaks s:( directive / [{}&<\-!\[] ) { return $s; }
    )* {
        return TokenizerUtils::flattenString( $r );
    }

// Special-case support for those pipe templates
pipe = "|" / "{{!}}"

// SSS FIXME: what about |{{!}} and {{!}}|
pipe_pipe = "||" / "{{!}}{{!}}"