/** * Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several * chunks of tokens (one chunk per top-level block matched) and eventually an * end event. Tokens map to HTML tags as far as possible, with custom tokens * used where further processing on the token stream is needed. */ { use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\Config\SiteConfig; use Wikimedia\Parsoid\Config\WikitextConstants; use Wikimedia\Parsoid\Core\DomSourceRange; use Wikimedia\Parsoid\Tokens\CommentTk; use Wikimedia\Parsoid\Tokens\EndTagTk; use Wikimedia\Parsoid\Tokens\EOFTk; use Wikimedia\Parsoid\Tokens\KV; use Wikimedia\Parsoid\Tokens\KVSourceRange; use Wikimedia\Parsoid\Tokens\NlTk; use Wikimedia\Parsoid\Tokens\SelfclosingTagTk; use Wikimedia\Parsoid\Tokens\SourceRange; use Wikimedia\Parsoid\Tokens\TagTk; use Wikimedia\Parsoid\Tokens\Token; use Wikimedia\Parsoid\Utils\TokenUtils; use Wikimedia\Parsoid\Utils\Utils; use Wikimedia\Parsoid\Utils\PHPUtils; use Wikimedia\Parsoid\Utils\WTUtils; } { /** @var Env */ private $env; /** @var SiteConfig */ private $siteConfig; /** @var array */ private $pipelineOpts; /** @var int */ private $pipelineOffset; private $extTags; protected function initialize() { $this->env = $this->options['env']; $this->siteConfig = $this->env->getSiteConfig(); $tokenizer = $this->options['pegTokenizer']; $this->pipelineOpts = $tokenizer->getOptions(); $this->pipelineOffset = $this->options['pipelineOffset'] ?? 
0;
	$this->extTags = $this->siteConfig->getExtensionTagNameMap();
}

// Offset bookkeeping across emitted chunks.
// NOTE(review): not referenced in this chunk of the grammar; presumably
// consumed by productions further down -- confirm before removing.
private $prevOffset = 0;

// Running counter of headings.
// NOTE(review): not referenced in this chunk; likely used to number
// heading tokens in the heading production -- confirm.
private $headingIndex = 0;

/**
 * Grammar-internal invariant check.
 * @param bool $condition Invariant that must hold
 * @param string $text Description reported when the invariant fails
 * @throws \Exception when $condition is false
 */
private function assert( $condition, $text ) {
	if ( !$condition ) {
		throw new \Exception( "Grammar.pegphp assertion failure: $text" );
	}
}

/**
 * Marker for grammar paths that should never be reached.
 * @throws \Exception always
 */
private function unreachable() {
	throw new \Exception( "Grammar.pegphp: this should be unreachable" );
}

// Some shorthands for legibility

/** Offset (in the input string) where the current match started. */
private function startOffset() {
	return $this->savedPos;
}

/** Current parse position, i.e. where the current match ends. */
private function endOffset() {
	return $this->currPos;
}

/**
 * Source range of the current match.
 * @param string $flag 'start' => zero-width range at the match start,
 *  'end' => zero-width range at the current position,
 *  'default' => the full [savedPos, currPos) range.
 * @return SourceRange
 */
private function tsrOffsets( $flag = 'default' ): SourceRange {
	switch ( $flag ) {
		case 'start':
			return new SourceRange( $this->savedPos, $this->savedPos );
		case 'end':
			return new SourceRange( $this->currPos, $this->currPos );
		default:
			return new SourceRange( $this->savedPos, $this->currPos );
	}
}

/*
 * Emit a chunk of tokens to our consumers. Once this has been done, the
 * current expression can return an empty list (true).
 */
private function emitChunk( $tokens ) {
	// Shift tsr of all tokens by the pipeline offset
	TokenUtils::shiftTokenTSR( $tokens, $this->pipelineOffset );
	$this->env->log( 'trace/peg', $this->options['pipelineId'] ?? '0', '----> ', $tokens );
	$i = null;
	$n = count( $tokens );
	// Enforce parsing resource limits
	for ( $i = 0; $i < $n; $i++ ) {
		TokenizerUtils::enforceParserResourceLimits( $this->env, $tokens[ $i ] );
	}
	return $tokens;
}

/* ------------------------------------------------------------------------
 * Extension tags should be parsed with higher priority than anything else.
 *
 * The trick we use is to strip out the content inside a matching tag-pair
 * and not tokenize it. The content, if it needs to be parsed (for example,
 * for <ref>, <*include*> tags), is parsed in a fresh tokenizer context
 * which means any error correction that needs to happen is restricted to
 * the scope of the extension content and doesn't spill over to the higher
 * level. Ex: <math><!--foo</math>.
 * ------------------------------------------------------------------------ */

// NOTE(review): the comment text below had been partially eaten by markup
// stripping in this copy; reconstructed to be self-consistent with the
// `comment` production that follows -- verify against upstream.
// The old parser does a straight str.replace(/<!--((?!-->).)*-->/g, "")
// but, as always, things around here are a little more complicated.
//
// We accept the same comments, but because we emit them as HTML comments
// instead of deleting them, we have to encode the data to ensure that
// we always emit a valid HTML5 comment. See the encodeComment helper
// for further details.
// NOTE(review): the '<!--' literal and the c:$(!"-->" .)* capture had been
// eaten by markup stripping in this copy; restored here because the action
// below reads $c and the rule must consume everything up to '-->' (or eof).
comment = '<!--' c:$(!"-->" .)* ('-->' / eof) {
	$data = WTUtils::encodeComment( $c );
	return [ new CommentTk( $data, (object)[ 'tsr' => $this->tsrOffsets() ] ) ];
}

// Behavior switches. See:
// https://www.mediawiki.org/wiki/Help:Magic_words#Behavior_switches
behavior_switch = bs:$('__' behavior_text '__') {
	if ( $this->siteConfig->isMagicWord( $bs ) ) {
		// Known magic word: emit a placeholder token to be handled
		// downstream by the behavior-switch handler.
		return [ new SelfclosingTagTk(
			'behavior-switch',
			[ new KV( 'word', $bs ) ],
			(object)[ 'tsr' => $this->tsrOffsets(), 'src' => $bs, 'magicSrc' => $bs ]
		) ];
	} else {
		// Not a known magic word: pass the text through unchanged.
		return [ $bs ];
	}
}

// Instead of defining a charset, the old parser's doDoubleUnderscore concats a
// regexp of all the language specific aliases of the behavior switches and
// then does a match and replace. Just be as permissive as possible and let the
// BehaviorSwitchPreprocessor back out of any overreach.
behavior_text = $( !'__' ( text_char / "-" ) )+

/**************************************************************
 * External (bracketed and autolinked) links
 **************************************************************/

// NOTE(review): the <extTag> flag lookaheads below had been eaten by markup
// stripping (compare the intact `<&preproc>` references elsewhere in this
// grammar); restored -- verify against upstream.
autolink = !<extTag>
	// this must be a word boundary, so previous character must be non-word
	! { return Utils::isUniWord(Utils::lastUniChar( $this->input, $this->endOffset() ) ); }
	r:( autourl / autoref / isbn ) { return $r; }

extlink "extlink" = !<extTag> // extlink cannot be nested
	r:(
		"[" p0:( "" { return $this->endOffset(); })
		addr:(url_protocol ipv6urladdr / "")
		target:(extlink_nonipv6url / "")
		p1:( "" { return $this->endOffset(); })
		& {
			// Protocol must be valid and there ought to be at least one
			// post-protocol character. So strip last char off target
			// before testing protocol.
			$flat = TokenizerUtils::flattenString( [ $addr, $target ] );
			if ( is_array( $flat ) ) { // There are templates present, alas.
return count( $flat ) > 0; } return Utils::isProtocolValid( substr( $flat, 0, -1 ), $this->env ); } sp:$( space / unispace )* p2:( "" { return $this->endOffset(); }) content:inlineline? p3:( "" { return $this->endOffset(); }) "]" { $tsr1 = new SourceRange( $p0, $p1 ); $tsr2 = new SourceRange( $p2, $p3 ); return [ new SelfclosingTagTk( 'extlink', [ new KV( 'href', TokenizerUtils::flattenString( [ $addr, $target ] ), $tsr1->expandTsrV() ), new KV( 'mw:content', $content ?? '', $tsr2->expandTsrV() ), new KV( 'spaces', $sp ) ], (object)[ 'tsr' => $this->tsrOffsets(), 'extLinkContentOffsets' => $tsr2, ] ) ]; } ) { return $r; } autoref = ref:('RFC' / 'PMID') sp:space_or_nbsp+ identifier:$[0-9]+ end_of_word { $base_urls = [ 'RFC' => 'https://tools.ietf.org/html/rfc%s', 'PMID' => '//www.ncbi.nlm.nih.gov/pubmed/%s?dopt=Abstract' ]; $tsr = $this->tsrOffsets(); return [ new SelfclosingTagTk( 'extlink', [ new KV( 'href', sprintf( $base_urls[ $ref ], $identifier ) ), new KV( 'mw:content', TokenizerUtils::flattenString( [ $ref, $sp, $identifier ] ), $tsr->expandTsrV() ), new KV( 'typeof', 'mw:ExtLink/' . $ref ) ], (object)[ 'stx' => 'magiclink', 'tsr' => $tsr ] ) ]; } isbn = 'ISBN' sp:space_or_nbsp+ isbn:( [0-9] ((space_or_nbsp_or_dash / "") [0-9])+ ((space_or_nbsp_or_dash / "") [xX] / "") ) isbncode:( end_of_word { // Convert isbn token-and-entity array to stripped string. $stripped = ''; foreach ( TokenizerUtils::flattenStringlist( $isbn ) as $part ) { if ( is_string( $part ) ) { $stripped .= $part; } } return strtoupper( preg_replace( '/[^\dX]/i', '', $stripped ) ); } ) &{ // ISBNs can only be 10 or 13 digits long (with a specific format) return strlen( $isbncode ) === 10 || ( strlen( $isbncode ) === 13 && preg_match( '/^97[89]/', $isbncode ) ); } { $tsr = $this->tsrOffsets(); return [ new SelfclosingTagTk( 'extlink', [ new KV( 'href', 'Special:BookSources/' . 
$isbncode ), new KV( 'mw:content', TokenizerUtils::flattenString( [ 'ISBN', $sp, $isbn ] ), $tsr->expandTsrV() ), new KV( 'typeof', 'mw:WikiLink/ISBN' ) ], (object)[ 'stx' => 'magiclink', 'tsr' => $tsr ] ) ]; } /* Default URL protocols in MediaWiki (see DefaultSettings). Normally * these can be configured dynamically. */ url_protocol = p:$( '//' / [A-Za-z] [-A-Za-z0-9+.]* ':' '//'? ) & { return Utils::isProtocolValid( $p, $this->env ); } { return $p; } // no punctuation, and '{<' to trigger directives no_punctuation_char = [^ \]\[\r\n"'<>\x00-\x20\x7f&\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{] // this is the general url rule // on the PHP side, the path part matches EXT_LINK_URL_CLASS // which is '[^][<>"\\x00-\\x20\\x7F\p{Zs}]' url = proto:url_protocol addr:(ipv6urladdr / "") path:( !inline_breaks c:( no_punctuation_char / comment / tplarg_or_template / ['{] / ! ( "&" ( [lL][tT] / [gG][tT] ) ";" ) r:( & "&" he:htmlentity { return $he; } / "&" ) { return $r; } ) { return $c; } )* // Must be at least one character after the protocol & { return $addr !== '' || count( $path ) > 0; } { return TokenizerUtils::flattenString( array_merge( [ $proto, $addr ], $path ) ); } // this is the somewhat-restricted rule used in autolinks // See Parser::doMagicLinks and Parser.php::makeFreeExternalLink. // The `path` portion matches EXT_LINK_URL_CLASS, as in the general // url rule. As in PHP, we do some fancy fixup to yank out // trailing punctuation, perhaps including parentheses. autourl = ! '//' // protocol-relative autolinks not allowed (T32269) r:( proto:url_protocol addr:(ipv6urladdr / "") path:( !inline_breaks c:( no_punctuation_char / comment / tplarg_or_template / $("'" !"'") // single quotes are ok, double quotes are bad / "{" / ! 
( rhe:raw_htmlentity &{ return $rhe === '<' || $rhe === '>' || $rhe === "\u{A0}"; } ) r:( & "&" he:htmlentity { return $he; } / "&" ) { return $r; } ) { return $c; } )* { // as in Parser.php::makeFreeExternalLink, we're going to // yank trailing punctuation out of this match. $url = TokenizerUtils::flattenStringlist( array_merge( [ $proto, $addr ], $path ) ); // only need to look at last element; HTML entities are strip-proof. $last = PHPUtils::lastItem( $url ); $trim = 0; if ( is_string( $last ) ) { $strip = ',;\.:!?'; if ( array_search( '(', $path ) === false ) { $strip .= ')'; } $trim = strspn( strrev( $last ), $strip ); $url[ count( $url ) - 1 ] = substr( $last, 0, strlen( $last ) - $trim ); } $url = TokenizerUtils::flattenStringlist( $url ); if ( count( $url ) === 1 && is_string( $url[0] ) && strlen( $url[0] ) <= strlen( $proto ) ) { return null; // ensure we haven't stripped everything: T106945 } $this->currPos -= $trim; return $url; } ) &{ return $r !== null; } { $tsr = $this->tsrOffsets(); $res = [ new SelfclosingTagTk( 'urllink', [ new KV( 'href', $r, $tsr->expandTsrV() ) ], (object)[ 'tsr' => $tsr ] ) ]; return $res; } // This is extracted from EXT_LINK_ADDR in Parser.php: a simplified // expression to match an IPv6 address. The IPv4 address and "at least // one character of a host name" portions are punted to the `path` // component of the `autourl` and `url` productions ipv6urladdr = $( "[" [0-9A-Fa-f:.]+ "]" ) /************************************************************** * Templates, -arguments and wikilinks **************************************************************/ /* * Precedence: template arguments win over templates. See * http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence * 4: {{{{·}}}} → {·{{{·}}}·} * 5: {{{{{·}}}}} → {{·{{{·}}}·}} * 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}} * 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·} * This is only if close has > 3 braces; otherwise we just match open * and close as we find them. 
*/ tplarg_or_template = &'{{' templatedepth: &{ // Refuse to recurse beyond `maxDepth` levels. Default in the old parser // is $wgMaxTemplateDepth = 40; This is to prevent crashing from // buggy wikitext with lots of unclosed template calls, as in // eswiki/Usuario:C%C3%A1rdenas/PRUEBAS?oldid=651094 return $templatedepth + 1 < $this->siteConfig->getMaxTemplateDepth(); } t:tplarg_or_template_guarded { return $t; } tplarg_or_template_guarded = &('{{' &('{{{'+ !'{') tplarg) a:(template/broken_template) { return $a; } / a:$('{' &('{{{'+ !'{'))? b:tplarg { return [ $a, $b ]; } / a:$('{' &('{{' !'{'))? b:template { return [ $a, $b ]; } / broken_template tplarg_or_template_or_bust = r:(tplarg_or_template / .)+ { return TokenizerUtils::flattenIfArray( $r ); } template = template_preproc<&preproc="}}"> // The old preprocessor maintains a single stack of "closing token we // are currently looking for", with no backtracking. This means that // once you see `[[ {{` you are looking only for `}}` -- if that template // turns out to be broken you will never pop the `}}` and there is no way // to close the `[[`. Since the PEG tokenizer in Parsoid uses backtracking // and parses in a single pass (instead of PHP's split preprocessor/parser) // we have to be a little more careful when we emulate this behavior. // If we use a rule like: // template = "{{" tplname tplargs* "}}"? // Then we end up having to reinterpret `tplname tplargs*` as a tlb if it // turns out we never find the `}}`, which involves a lot of tedious gluing // tokens back together with fingers crossed we haven't discarded any // significant newlines/whitespace/etc. An alternative would be a rule like: // broken_template = "{{" tlb // but again, `template` is used in many different contexts; `tlb` isn't // necessarily the right one to recursively invoke. 
Instead we get the // broken template off of the PEGjs production stack by returning immediately // after `{{`, but we set the "preproc" reference parameter to false (the // reference parameter feature having been introduced for this sole purpose) // to indicate to the parent rule that we're "still in" the {{ context and // shouldn't ever inlineBreak for any closing tokens above this one. For // example: // [[Foo{{Bar]] // This will match as: // wikilink->text,template->text --> FAILS looking for }} // backtracks, popping "bracket_bracket" and "brace_brace" off preproc stack // wikilink->text,broken_template,text --> FAILS looking for ]] // backtracks, popping "bracket_bracket" and false off preproc stack // broken_wikilink,text,broken_template,text --> OK // with [false, false] left on the preproc stack broken_template = preproc:<&preproc> t:"{{" { $preproc = null; return $t; } template_preproc = "{{" leadWS:$( nl_comment_space* ) target:template_param_value params:( nl_comment_space* "|" r:( p0:("" { return $this->endOffset(); }) v:nl_comment_space* p:("" { return $this->endOffset(); }) &("|" / "}}") { // empty argument $tsr0 = new SourceRange( $p0, $p ); return new KV( '', TokenizerUtils::flattenIfArray( $v ), $tsr0->expandTsrV() ); } / template_param ) { return $r; } )* trailWS:$( nl_comment_space* ) inline_breaks "}}" { // Insert target as first positional attribute, so that it can be // generically expanded. The TemplateHandler then needs to shift it out // again. array_unshift( $params, new KV( TokenizerUtils::flattenIfArray( $target['tokens'] ), '', $target['srcOffsets']->expandTsrK() ) ); $obj = new SelfclosingTagTk( 'template', $params, (object)[ 'tsr' => $this->tsrOffsets(), 'src' => $this->text(), 'tmp' => (object)[ 'leadWS' => $leadWS, 'trailWS' => $trailWS ] ] ); return $obj; } / $('{{' space_or_newline* '}}') tplarg = tplarg_preproc<&preproc="}}"> tplarg_preproc = "{{{" p:("" { return $this->endOffset(); }) target:template_param_value? 
params:( nl_comment_space* "|" r:( p0:("" { return $this->endOffset(); }) v:nl_comment_space* p1:("" { return $this->endOffset(); }) &("|" / "}}}") { // empty argument return [ 'tokens' => $v, 'srcOffsets' => new SourceRange( $p0, $p1 ) ]; } / template_param_value ) { return $r; } )* nl_comment_space* inline_breaks "}}}" { $kvs = []; if ( $target === null ) { $target = [ 'tokens' => '', 'srcOffsets' => new SourceRange( $p, $p ) ]; } // Insert target as first positional attribute, so that it can be // generically expanded. The TemplateHandler then needs to shift it out // again. $kvs[] = new KV( TokenizerUtils::flattenIfArray( $target['tokens'] ), '', $target['srcOffsets']->expandTsrK() ); foreach ( $params as $o ) { $s = $o['srcOffsets']; $kvs[] = new KV( '', TokenizerUtils::flattenIfArray( $o['tokens'] ), $s->expandTsrV() ); } $obj = new SelfclosingTagTk( 'templatearg', $kvs, (object)[ 'tsr' => $this->tsrOffsets(), 'src' => $this->text() ] ); return $obj; } template_param = name:template_param_name val:( kEndPos:("" { return $this->endOffset(); }) optionalSpaceToken "=" vStartPos:("" { return $this->endOffset(); }) optionalSpaceToken tpv:template_param_value? { return [ 'kEndPos' => $kEndPos, 'vStartPos' => $vStartPos, 'value' => $tpv ? $tpv['tokens'] : [] ]; } )? 
{
	if ( $val !== null ) {
		if ( $val['value'] !== null ) {
			// Key and value both present: record key start/end and
			// value start/end offsets on the KV.
			$so = new KVSourceRange(
				$this->startOffset(), $val['kEndPos'],
				$val['vStartPos'], $this->endOffset()
			);
			return new KV(
				$name,
				TokenizerUtils::flattenIfArray( $val['value'] ),
				$so
			);
		} else {
			// FIX: $so was read here without ever being assigned in this
			// branch (undefined-variable notice / null source range at
			// runtime). Build it the same way as the branch above.
			$so = new KVSourceRange(
				$this->startOffset(), $val['kEndPos'],
				$val['vStartPos'], $this->endOffset()
			);
			return new KV( TokenizerUtils::flattenIfArray( $name ), '', $so );
		}
	} else {
		// No '=' was seen: the entire match is a positional value.
		$so = new SourceRange( $this->startOffset(), $this->endOffset() );
		return new KV( '', TokenizerUtils::flattenIfArray( $name ), $so->expandTsrV() );
	}
}
// empty parameter
/ & [|}] {
	$so = new SourceRange( $this->startOffset(), $this->endOffset() );
	return new KV( '', '', $so->expandTsrV() );
}

template_param_name = template_param_text
	/ (&'=' { return ''; })

template_param_value = tpt:template_param_text {
	return [ 'tokens' => $tpt, 'srcOffsets' => $this->tsrOffsets() ];
}

template_param_text = il:(nested_block / newlineToken)+ {
	// il is guaranteed to be an array -- so, tu.flattenIfArray will
	// always return an array
	$r = TokenizerUtils::flattenIfArray( $il );
	if ( count( $r ) === 1 && is_string( $r[0] ) ) {
		$r = $r[0];
	}
	return $r;
}

//// Language converter block markup of language variants: -{ ... }-
// Note that "rightmost opening" precedence rule (see
// https://www.mediawiki.org/wiki/Preprocessor_ABNF ) means
// that neither -{{ nor -{{{ are parsed as a -{ token, although
// -{{{{ is (since {{{ has precedence over {{).
lang_variant_or_tpl =
	&('-{' &('{{{'+ !'{') tplarg) a:lang_variant { return $a; }
	/ a:$('-' &('{{{'+ !'{')) b:tplarg { return [ $a, $b ]; }
	/ a:$('-' &('{{' '{{{'* !'{')) b:template { return [ $a, $b ]; }
	/ &'-{' a:lang_variant { return $a; }

broken_lang_variant = r:"-{" preproc:<&preproc> {
	// Clear the preproc reference param: we are "still inside" the -{
	// context, so no closing token above this one should inlineBreak.
	$preproc = null;
	return $r;
}

lang_variant = lang_variant_preproc<&preproc="}-"> / broken_lang_variant

lang_variant_preproc = lv0:("-{" { return $this->startOffset(); })
	f:(
		&{ return $this->env->langConverterEnabled(); }
		ff:opt_lang_variant_flags {
			// if flags contains 'R', then don't treat ; or : specially inside.
if ( isset( $ff['flags'] ) ) { $ff['raw'] = isset( $ff['flags']['R'] ) || isset( $ff['flags']['N'] ); } elseif ( isset( $ff['variants'] ) ) { $ff['raw'] = true; } return $ff; } / &{ return !$this->env->langConverterEnabled(); } "" { // if language converter not enabled, don't try to parse inside. return [ 'raw' => true ]; } ) ts:( &{ return $f['raw']; } lv:lang_variant_text { return [ [ 'text' => $lv ] ]; } / &{ return !$f['raw']; } lv:lang_variant_option_list { return $lv; } ) inline_breaks lv1:("}-" { return $this->endOffset(); }) { if ( !$this->env->langConverterEnabled() ) { return [ '-{', $ts[0]['text']['tokens'], '}-' ]; } $lvsrc = substr( $this->input, $lv0, $lv1 - $lv0 ); $attribs = []; foreach ( $ts as &$t ) { // move token strings into KV attributes so that they are // properly expanded by early stages of the token pipeline foreach ( [ 'text', 'from', 'to' ] as $fld ) { if ( !isset( $t[$fld] ) ) { continue; } $name = 'mw:lv' . count( $attribs ); // Note that AttributeExpander will expect the tokens array to be // flattened. We do that in lang_variant_text / lang_variant_nowiki $attribs[] = new KV( $name, $t[$fld]['tokens'], $t[$fld]['srcOffsets']->expandTsrV() ); $t[$fld] = $name; } } unset( $t ); $flags = isset( $f['flags'] ) ? array_keys( $f['flags'] ) : []; sort( $flags ); $variants = isset( $f['variants'] ) ? array_keys( $f['variants'] ) : []; sort( $variants ); return [ new SelfclosingTagTk( 'language-variant', $attribs, (object)[ 'tsr' => new SourceRange( $lv0, $lv1 ), 'src' => $lvsrc, 'flags' => $flags, 'variants' => $variants, 'original' => $f['original'], 'flagSp' => $f['sp'], 'texts' => $ts ] ) ]; } opt_lang_variant_flags = f:( ff:lang_variant_flags "|" { return $ff; } )? 
{ // Collect & separate flags and variants into a hashtable (by key) and ordered list $flags = []; $variants = []; $flagList = []; $flagSpace = []; $variantList = []; $variantSpace = []; $useVariants = false; if ( $f !== null ) { // lang_variant_flags returns arrays in reverse order. $spPtr = count( $f['sp'] ) - 1; for ( $i = count( $f['flags'] ) - 1; $i >= 0; $i--) { $item = $f['flags'][$i]; if ( isset( $item['flag'] ) ) { $flagSpace[] = $f['sp'][$spPtr--]; $flags[$item['flag']] = true; $flagList[] = $item['flag']; $flagSpace[] = $f['sp'][$spPtr--]; } if ( isset( $item['variant'] ) ) { $variantSpace[] = $f['sp'][$spPtr--]; $variants[$item['variant']] = true; $variantList[] = $item['variant']; $variantSpace[] = $f['sp'][$spPtr--]; } } if ( $spPtr >= 0 ) { // handle space after a trailing semicolon $flagSpace[] = $f['sp'][$spPtr]; $variantSpace[] = $f['sp'][$spPtr]; } } // Parse flags (this logic is from core/languages/ConverterRule.php // in the parseFlags() function) if ( count( $flags ) === 0 && count( $variants ) === 0 ) { $flags['$S'] = true; } elseif ( isset( $flags['R'] ) ) { $flags = [ 'R' => true ]; // remove other flags } elseif ( isset( $flags['N'] ) ) { $flags = [ 'N' => true ]; // remove other flags } elseif ( isset( $flags['-'] ) ) { $flags = [ '-' => true ]; // remove other flags } elseif ( isset( $flags['T'] ) && count( $flags ) === 1 ) { $flags['H'] = true; } elseif ( isset( $flags['H'] ) ) { // Replace A flag, and remove other flags except T and D $nf = [ '$+' => true, 'H' => true ]; if ( isset( $flags['T'] ) ) { $nf['T'] = true; } if ( isset( $flags['D'] ) ) { $nf['D'] = true; } $flags = $nf; } elseif ( count( $variants ) > 0 ) { $useVariants = true; } else { if ( isset( $flags['A'] ) ) { $flags['$+'] = true; $flags['$S'] = true; } if ( isset( $flags['D'] ) ) { unset( $flags['$S'] ); } } if ( $useVariants ) { return [ 'variants' => $variants, 'original' => $variantList, 'sp' => $variantSpace ]; } else { return [ 'flags' => $flags, 'original' => 
$flagList, 'sp' => $flagSpace ]; } } lang_variant_flags = sp1:$(space_or_newline*) f:lang_variant_flag sp2:$(space_or_newline*) more:( ";" lang_variant_flags? )? { $r = ( $more && $more[1] ) ? $more[1] : [ 'sp' => [], 'flags' => [] ]; // Note that sp and flags are in reverse order, since we're using // right recursion and want to push instead of unshift. $r['sp'][] = $sp2; $r['sp'][] = $sp1; $r['flags'][] = $f; return $r; } / sp:$(space_or_newline*) { return [ 'sp' => [ $sp ], 'flags' => [] ]; } lang_variant_flag = f:[-+A-Z] { return [ 'flag' => $f ]; } / v:lang_variant_name { return [ 'variant' => $v ]; } / b:$(!space_or_newline !nowiki [^{}|;])+ { return [ 'bogus' => $b ]; /* bad flag */} // language variant name, like zh, zh-cn, etc. lang_variant_name = $([a-z] [-a-zA-Z]+) // Escaped otherwise-unrepresentable language names // Primarily for supporting html2html round trips; PHP doesn't support // using nowikis here (yet!) / nowiki_text lang_variant_option_list = o:lang_variant_option rest:( ";" oo:lang_variant_option { return $oo; })* tr:( ";" $bogus_lang_variant_option )* // optional trailing crap { array_unshift( $rest, $o ); // if the last bogus option is just spaces, keep them; otherwise // drop all this bogus stuff on the ground if ( count($tr) > 0 ) { $last = $tr[count($tr)-1]; if (preg_match('/^\s*$/Du', $last[1])) { $rest[] = [ 'semi' => true, 'sp' => $last[1] ]; } } return $rest; } / lvtext:lang_variant_text { return [ [ 'text' => $lvtext ] ]; } bogus_lang_variant_option = lang_variant_text? 
// A single mapping inside a language-variant block: either a two-way
// "lang:text" rule or a one-way "from=>lang:text" rule.
lang_variant_option
	= sp1:$(space_or_newline*) lang:lang_variant_name
	sp2:$(space_or_newline*) ":"
	sp3:$(space_or_newline*)
	lvtext:(lang_variant_nowiki / lang_variant_text_no_semi) {
		// "lang : text" form: the text is used in both directions.
		return [ 'twoway' => true, 'lang' => $lang, 'text' => $lvtext,
			'sp' => [ $sp1, $sp2, $sp3 ] ];
	}
	/ sp1:$(space_or_newline*)
	from:(lang_variant_nowiki / lang_variant_text_no_semi_or_arrow)
	"=>" sp2:$(space_or_newline*)
	lang:lang_variant_name
	sp3:$(space_or_newline*) ":"
	sp4:$(space_or_newline*)
	to:(lang_variant_nowiki / lang_variant_text_no_semi) {
		// "from => lang : to" form: explicit one-way conversion.
		return [ 'oneway' => true, 'from' => $from, 'lang' => $lang,
			'to' => $to, 'sp' => [ $sp1, $sp2, $sp3, $sp4 ] ];
	}

// html2wt support: If a language name or conversion string can't be
// represented w/o breaking wikitext, just wrap it in a <nowiki>.
// PHP doesn't support this (yet), but Parsoid does.
lang_variant_nowiki = n:nowiki_text sp:$space_or_newline* {
	// Exclude the trailing whitespace from the recorded source range.
	$tsr = $this->tsrOffsets();
	$tsr->end -= strlen( $sp );
	return [
		'tokens' => [ $n ],
		'srcOffsets' => $tsr,
	];
}

// Flattened inline content (pipes allowed) plus its source range.
lang_variant_text = tokens:(inlineline / "|" )* {
	return [
		'tokens' => TokenizerUtils::flattenStringlist( $tokens ),
		'srcOffsets' => $this->tsrOffsets(),
	];
}

// NOTE(review): despite their names, these two aliases add no restriction
// as written -- rule parameters (e.g. <semi>) appear to have been stripped
// from this copy by markup mangling; confirm against upstream before
// relying on ';' / '=>' actually being excluded here.
lang_variant_text_no_semi = lang_variant_text

lang_variant_text_no_semi_or_arrow = lang_variant_text_no_semi

// Zero or more "|content" segments of a wikilink; each segment is
// returned as a 'mw:maybeContent' KV attribute carrying its raw source.
wikilink_content = (
	pipe startPos:("" { return $this->endOffset(); }) lt:link_text? {
		$tsr = new SourceRange( $startPos, $this->endOffset() );
		$maybeContent = new KV( 'mw:maybeContent', $lt ?? [], $tsr->expandTsrV() );
		$maybeContent->vsrc = substr( $this->input, $startPos, $this->endOffset() - $startPos );
		return $maybeContent;
	}
)*

wikilink = wikilink_preproc<&preproc="]]">
	/ broken_wikilink

// `broken-link` (see [[:mw:Preprocessor_ABNF]]), but careful because the
// second bracket could start an extlink. Set preproc to false as a reference
// parameter in the parent since we haven't seen a double-close bracket.
// (See full explanation above broken_template production.)
broken_wikilink = &"[[" preproc:<&preproc> &{ $preproc = null; return true; } a:("[" (extlink / "[")) { return $a; } wikilink_preproc = "[[" spos:("" { return $this->endOffset(); }) target:wikilink_preprocessor_text? tpos:("" { return $this->endOffset(); }) lcs:wikilink_content inline_breaks "]]" { $pipeTrick = count( $lcs ) === 1 && count( $lcs[0]->v ) === 0; $textTokens = []; if ( $target === null || $pipeTrick ) { $textTokens[] = '[['; if ( $target ) { $textTokens[] = $target; } foreach ( $lcs as $a ) { // a is a mw:maybeContent attribute $textTokens[] = '|'; if ( count( $a->v ) > 0 ) { $textTokens[] = $a->v; } } $textTokens[] = ']]'; return $textTokens; } $obj = new SelfclosingTagTk( 'wikilink' ); $tsr = new SourceRange( $spos, $tpos ); $hrefKV = new KV( 'href', $target, $tsr->expandTsrV() ); $hrefKV->vsrc = $tsr->substr( $this->input ); // XXX: Point to object with path, revision and input information // obj.source = input; $obj->attribs[] = $hrefKV; $obj->attribs = array_merge( $obj->attribs, $lcs ); $obj->dataAttribs = (object)[ 'tsr' => $this->tsrOffsets(), 'src' => $this->text() ]; return [ $obj ]; } // Tables are allowed inside image captions. // Suppress the equal flag temporarily in this rule to consume the '=' here. link_text = link_text_parameterized link_text_parameterized = c:( // This group is similar to "block_line" but "list_item" // is omitted since `doBlockLevels` happens after // `replaceInternalLinks2`, where newlines are stripped. (sol (heading / hr / full_table_in_link_caption)) / urltext / ( !inline_breaks r:( inline_element / '[' text_char+ ']' $(&(!']' / ']]')) / . ) { return $r; } ) )+ { return TokenizerUtils::flattenStringlist( $c ); } /* Generic quote rule for italic and bold, further processed in a token * stream transformation in doQuotes. Relies on NlTk tokens being emitted * for each line of text to balance quotes per line. 
* * We are not using a simple pair rule here as we need to support mis-nested * bolds/italics and MediaWiki's special heuristics for apostrophes, which are * all not context free. */ quote = quotes:$("''" "'"*) { // sequences of four or more than five quotes are assumed to start // with some number of plain-text apostrophes. $plainticks = 0; $result = []; if ( strlen( $quotes ) === 4 ) { $plainticks = 1; } elseif ( strlen( $quotes ) > 5 ) { $plainticks = strlen( $quotes ) - 5; } if ( $plainticks > 0 ) { $result[] = substr( $quotes, 0, $plainticks ); } // mw-quote token will be consumed in token transforms $tsr = $this->tsrOffsets(); $tsr->start += $plainticks; $mwq = new SelfclosingTagTk( 'mw-quote', [ new KV( 'value', substr( $quotes, $plainticks ) ) ], (object)[ 'tsr' => $tsr ] ); if ( strlen( $quotes ) > 2 ) { $mwq->addAttribute( 'isSpace_1', $tsr->start > 0 && substr( $this->input, $tsr->start - 1, 1 ) === ' '); $mwq->addAttribute( 'isSpace_2', $tsr->start > 1 && substr( $this->input, $tsr->start - 2, 1 ) === ' '); } $result[] = $mwq; return $result; } /*********************************************************** * Pre and xmlish tags ***********************************************************/ extension_tag = ! extToken:xmlish_tag // Account for `maybeExtensionTag` returning unmatched start / end tags &{ return $extToken[0]->getName() === 'extension'; } { return $extToken[0]; } nowiki = extToken:extension_tag &{ return $extToken->getAttribute( 'name' ) === 'nowiki'; } { return $extToken; } // Used by lang_variant productions to protect special language names or // conversion strings. nowiki_text = extToken:nowiki { $txt = Utils::extractExtBody( $extToken ); return Utils::decodeWtEntities( $txt ); } /* Generic XML-like tags * * These also cover extensions (including Cite), which will hook into the * token stream for further processing. 
The content of extension tags is
 * parsed as regular inline, but the source positions of the tag are added
 * to allow reconstructing the unparsed text from the input. */

// See http://www.w3.org/TR/html5/syntax.html#tag-open-state and
// following paragraphs.
tag_name_chars = [^\t\n\v />\0]
tag_name = $([A-Za-z] tag_name_chars*)

// This rule is used in carefully crafted places of xmlish tag tokenizing with
// the inclusion of solidus to match where the spec would ignore those
// characters. In particular, it does not belong in between attribute name
// and value.
space_or_newline_or_solidus
  = space_or_newline
  / (s:"/" !">" { return $s; })

// Tokenize an xmlish (HTML or extension) tag starting at "<".
// NOTE(review): both alternatives below are identical, so the second can
// never match; this looks like stripped wikipeg rule parameters (upstream
// passes different <isBlock>/<extTag> arguments to each alternative) --
// confirm against the original Grammar.pegphp.
xmlish_tag
  = "<" tag:(xmlish_tag_opened / xmlish_tag_opened) {
    return $tag;
  }

// The remainder of an xmlish tag after the initial "<": optional end-tag
// solidus, tag name, attributes, optional self-closing solidus, and ">".
// NOTE(review): the bare "extTag:"/"isBlock:" labels below have no matched
// expression; they look like stripped wikipeg parameter references
// (<extTag>, <isBlock>) -- confirm against the original grammar.
xmlish_tag_opened
  = end:"/"?
    name: tag_name
    extTag:
    isBlock:
    & {
        // Extension tags are validated by name alone; other xmlish tags are
        // checked against the known HTML tag set (and block-ness if required).
        if ( $extTag ) {
            return $this->isExtTag( $name );
        } else {
            return $this->isXMLTag( $name, $isBlock );
        }
    }
    // By the time we get to `doTableStuff` in the old parser, we've already
    // safely encoded element attributes. See 55313f4e in core.
    attribs:generic_newline_attributes
    space_or_newline_or_solidus* // No need to preserve this -- canonicalize on RT via dirty diff
    selfclose:"/"?
    space* // not preserved - canonicalized on RT via dirty diff
    ">" {
        $lcName = mb_strtolower( $name );

        // Extension tags don't necessarily have the same semantics as html tags,
        // so don't treat them as void elements.
        $isVoidElt = Utils::isVoidElement( $lcName ) && !$extTag;

        // Support end-tag syntax on br: a closing br is treated as an
        // opening one (the $end flag is dropped here).
        if ( $lcName === 'br' && $end ) {
            $end = null;
        }

        $tsr = $this->tsrOffsets();
        $tsr->start--; // For "<" matched at the start of xmlish_tag rule

        $res = TokenizerUtils::buildXMLTag(
            $name, $lcName, $attribs, $end, !!$selfclose || $isVoidElt, $tsr
        );

        // change up data-attribs in one scenario
        // void-elts that aren't self-closed ==> useful for accurate RT-ing
        if ( !$selfclose && $isVoidElt ) {
            unset( $res->dataAttribs->selfClose );
            $res->dataAttribs->noClose = true;
        }

        // maybeExtensionTag may expand the token into several tokens;
        // normalize the result to an array either way.
        $met = $this->maybeExtensionTag( $res );
        return ( is_array( $met ) ) ? $met : [ $met ];
    }

/*
 * A variant of xmlish_tag, but also checks if the tag name is a block-level
 * tag as defined in
 * http://www.w3.org/TR/html5/syntax.html#tag-open-state and
 * following paragraphs.
 */
// NOTE(review): as with xmlish_tag above, the two identical alternatives are
// suspicious -- upstream parameterizes them with <isBlock=true> -- confirm.
block_tag
  = "<" tag:(xmlish_tag_opened / xmlish_tag_opened) {
    return $tag;
  }

// A generic attribute that can span multiple lines.
// Returns a KV with key/value source ranges recorded for round-tripping.
generic_newline_attribute
  = space_or_newline_or_solidus*
    namePos0:("" { return $this->endOffset(); })
    name:generic_attribute_name
    namePos:("" { return $this->endOffset(); })
    vd:(space_or_newline* "=" v:generic_att_value? { return $v; })? {
        // NB: Keep in sync w/ table_attibute
        $res = null;
        // Encapsulate protected attributes.
        if ( is_string( $name ) ) {
            $name = TokenizerUtils::protectAttrs( $name );
        }
        $nameSO = new SourceRange( $namePos0, $namePos );
        if ( $vd !== null ) {
            $res = new KV( $name, $vd['value'], $nameSO->join( $vd['srcOffsets'] ) );
            $res->vsrc = $vd['srcOffsets']->substr( $this->input );
        } else {
            $res = new KV( $name, '', $nameSO->expandTsrK() );
        }
        if ( is_array( $name ) ) {
            // Name contained tokens (not a plain string); record raw source.
            $res->ksrc = $nameSO->substr( $this->input );
        }
        return $res;
    }

// A single-line attribute.
table_attribute
  = s:optionalSpaceToken
    namePos0:("" { return $this->endOffset(); })
    name:table_attribute_name
    namePos:("" { return $this->endOffset(); })
    vd:(optionalSpaceToken "=" v:table_att_value? { return $v; })? {
        // NB: Keep in sync w/ generic_newline_attribute
        $res = null;
        // Encapsulate protected attributes.
        if ( gettype( $name ) === 'string' ) {
            $name = TokenizerUtils::protectAttrs( $name );
        }
        $nameSO = new SourceRange( $namePos0, $namePos );
        if ( $vd !== null ) {
            $res = new KV( $name, $vd['value'], $nameSO->join( $vd['srcOffsets'] ) );
            $res->vsrc = $vd['srcOffsets']->substr( $this->input );
        } else {
            $res = new KV( $name, '', $nameSO->expandTsrK() );
        }
        if ( is_array( $name ) ) {
            // Name contained tokens (not a plain string); record raw source.
            $res->ksrc = $nameSO->substr( $this->input );
        }
        return $res;
    }

// The old parser's Sanitizer::removeHTMLtags explodes on < so that it can't
// be found anywhere in xmlish tags. This is a divergence from html5 tokenizing
// which happily permits it in attribute positions. Extension tags being the
// exception, since they're stripped beforehand.
// NOTE(review): as written, this is a zero-width lookahead stringified to ''.
// Upstream appears to guard with a (stripped) <extTag> parameter predicate
// before consuming the "<" -- confirm against the original grammar.
less_than = $( & "<" )

// The arrangement of chars is to emphasize the split between what's disallowed
// by html5 and what's necessary to give directive a chance.
// See: http://www.w3.org/TR/html5/syntax.html#attributes-0
generic_attribute_name
  = q:$(["'=]?) // From #before-attribute-name-state, < is omitted for directive
    r:( $[^ \t\r\n\0/=><&{}\-!|]+
      / !inline_breaks
        // \0/=> is the html5 attribute name set we do not want.
        t:( directive / less_than / $( !( space_or_newline / [\0/=><] ) . ) ) {
            return $t;
        }
    )*
    // Require at least one char of name (possibly just the quote/= prefix).
    & { return count( $r ) > 0 || $q !== ''; } {
        array_unshift( $r, $q );
        return TokenizerUtils::flattenString( $r );
    }

// Also accept these chars in a wikitext table or tr attribute name position.
// They are normally not matched by the table_attribute_name.
broken_table_attribute_name_char = c:[\0/=>] { return new KV( $c, '' ); }

// Same as generic_attribute_name, except for accepting tags and wikilinks.
// (That doesn't make sense (ie. match the old parser) in the generic case.)
// We also give a chance to break on \[ (see T2553).
table_attribute_name
  = q:$(["'=]?) // From #before-attribute-name-state, < is omitted for directive
    r:( $[^ \t\r\n\0/=><&{}\-!|\[]+
      / !inline_breaks
        // \0/=> is the html5 attribute name set we do not want.
        t:( $wikilink
          / directive
          // Accept insane tags-inside-attributes as attribute names.
          // The sanitizer will strip and shadow them for roundtripping.
          // Example: generated with..
          // NOTE(review): the example markup in the comment above appears to
          // have been stripped along with other tag-like text -- confirm.
          / &xmlish_tag ill:inlineline { return $ill; }
          / $( !( space_or_newline / [\0/=>] ) . )
        ) {
            return $t;
        }
    )*
    // Require at least one char of name (possibly just the quote/= prefix).
    & { return count( $r ) > 0 || $q !== ''; } {
        array_unshift( $r, $q );
        return TokenizerUtils::flattenString( $r );
    }

// Attribute value, quoted variants can span multiple lines.
// Missing end quote: accept /> look-ahead as heuristic.
// These need to be kept in sync with the attribute_preprocessor_text_*
generic_att_value
  = s:$(space_or_newline* "'") t:attribute_preprocessor_text_single? q:$("'" / &('/'? '>')) {
        return TokenizerUtils::getAttrVal(
            $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q )
        );
    }
  / s:$(space_or_newline* '"') t:attribute_preprocessor_text_double? q:$('"' / &('/'? '>')) {
        return TokenizerUtils::getAttrVal(
            $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q )
        );
    }
  / s:$space_or_newline* t:attribute_preprocessor_text &(space_or_newline / eof / '/'? '>') {
        return TokenizerUtils::getAttrVal(
            $t, $this->startOffset() + strlen( $s ), $this->endOffset()
        );
    }

// Attribute value, restricted to a single line.
// Missing end quote: accept |, !!, \r, and \n look-ahead as heuristic.
// These need to be kept in sync with the table_attribute_preprocessor_text_*
table_att_value
  = s:$(space* "'") t:table_attribute_preprocessor_text_single? q:$("'" / &('!!' / [|\r\n])) {
        return TokenizerUtils::getAttrVal(
            $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q )
        );
    }
  / s:$(space* '"') t:table_attribute_preprocessor_text_double? q:$('"' / &('!!' / [|\r\n])) {
        return TokenizerUtils::getAttrVal(
            $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q )
        );
    }
  / s:$space* t:table_attribute_preprocessor_text &(space_or_newline/ eof / '!!' / '|') {
        return TokenizerUtils::getAttrVal(
            $t, $this->startOffset() + strlen( $s ), $this->endOffset()
        );
    }

/*********************************************************
 * Lists
 *********************************************************/
list_item = dtdd / hacky_dl_uses / li

li
  = bullets:list_char+
    c:inlineline?
    // The inline_break is to check if we've hit a template end delimiter.
    &(eolf / inline_breaks) {
        // Leave bullets as an array -- list handler expects this
        $tsr = $this->tsrOffsets( 'start' );
        $tsr->end += count( $bullets );
        $li = new TagTk( 'listItem',
            [ new KV( 'bullets', $bullets, $tsr->expandTsrV() ) ],
            (object)[ 'tsr' => $tsr ] );
        return array_merge( [ $li ], $c ?: [] );
    }

/*
 * This rule is required to support wikitext of this form
 * ::{|border="1"|foo|bar|baz|}
 * where the leading colons are used to indent the entire table.
 * This hack was added back in 2006 in commit
 * a0746946312b0f1eda30a2c793f5f7052e8e5f3a based on a patch by Carl
 * Fürstenberg.
 */
hacky_dl_uses
  = bullets:":"+
    tbl:(table_line (sol table_line)*)
    line:inlineline?
    &comment_space_eolf {
        // Leave bullets as an array -- list handler expects this
        $tsr = $this->tsrOffsets( 'start' );
        $tsr->end += count( $bullets );
        $li = new TagTk( 'listItem',
            [ new KV( 'bullets', $bullets, $tsr->expandTsrV() ) ],
            (object)[ 'tsr' => $tsr ] );
        return TokenizerUtils::flattenIfArray( [ $li, $tbl, $line ?: [] ] );
    }

// A ";term : definition" line: emits two listItem tokens, one for the
// dt (';' bullet) and one for the dd (':' bullet, stx=row).
dtdd
  = bullets:(!(";" !list_char) lc:list_char { return $lc; })*
    ";"
    c:inlineline_break_on_colon?
    cpos:(":" { return $this->endOffset(); })
    d:inlineline?
    &eolf {
        // Leave bullets as an array -- list handler expects this
        // TSR: +1 for the leading ";"
        $numBullets = count( $bullets ) + 1;
        $tsr = $this->tsrOffsets( 'start' );
        $tsr->end += $numBullets;
        $li1Bullets = $bullets;
        $li1Bullets[] = ';';
        $li1 = new TagTk( 'listItem',
            [ new KV( 'bullets', $li1Bullets, $tsr->expandTsrV() ) ],
            (object)[ 'tsr' => $tsr ] );
        // TSR: -1 for the intermediate ":"
        $li2Bullets = $bullets;
        $li2Bullets[] = ':';
        $tsr2 = new SourceRange( $cpos - 1, $cpos );
        $li2 = new TagTk( 'listItem',
            [ new KV( 'bullets', $li2Bullets, $tsr2->expandTsrV() ) ],
            (object)[ 'tsr' => $tsr2, 'stx' => 'row' ] );
        return array_merge( [ $li1 ], $c ?: [], [ $li2 ], $d ?: [] );
    }

list_char = [*#:;]

// NOTE(review): upstream parameterizes this rule (inlineline<colon>) so that
// inline content breaks on ":"; the parameter appears stripped -- confirm.
inlineline_break_on_colon = inlineline

/******************************************************************************
 * Tables
 * ------
 * Table rules are geared to support independent parsing of fragments in
 * templates (the common table start / row / table end use case). The tokens
 * produced by these fragments then match up to a table while building the
 * DOM tree. For similar reasons, table rows do not emit explicit end tag
 * tokens.
 *
 * The separate table_line rule is faster than moving those rules
 * directly to block_lines.
 *
 * Notes about the full_table_in_link_caption rule
 * -----------------------------------------------------
 * However, for link-tables, we have introduced a stricter parse wherein
 * we require table-start and table-end tags to not come from a template.
 * In addition, this new rule doesn't accept fosterable-content in
 * the table unlike the more lax (sol table_line)+ rule.
 *
 * This is the best we can do at this time since we cannot distinguish
 * between table rows and image options entirely in the tokenizer.
 *
 * Consider the following examples:
 *
 * Example 1:
 *
 * [[Image:Foo.jpg|left|30px|Example 1
 * {{This-template-returns-a-table-start-tag}}
 * |foo
 * {{This-template-returns-a-table-end-tag}}
 * ]]
 *
 * Example 2:
 *
 * [[Image:Foo.jpg|left|30px|Example 1
 * {{1x|a}}
 * |foo
 * {{1x|b}}
 * ]]
 *
 * So, we cannot know a priori (without preprocessing or fully expanding
 * all templates) if "|foo" in the two examples is a table cell or an image
 * option. This is a limitation of our tokenizer-based approach compared to
 * the preprocessing-based approach of the old parser.
 *
 * Given this limitation, we are okay forcing a full-table context in
 * link captions (if necessary, we can relax the fosterable-content requirement
 * but that is broken wikitext anyway, so we can force that edge-case wikitext
 * to get fixed by rejecting it).
 ******************************************************************************/

full_table_in_link_caption
  = (! inline_breaks / & '{{!}}' )
    // Note that "linkdesc" is suppressed here to provide a nested parsing
    // context in which to parse the table. Otherwise, we may break on
    // on pipes in the `table_start_tag` and `table_row_tag` attributes.
    // However, as a result, this can be more permissive than the old
    // implementation, but likelier to match the users intent.
    r: full_table_in_link_caption_parameterized {
        return $r;
    }

full_table_in_link_caption_parameterized
  = table_start_tag optionalNewlines
    // Accept multiple end tags since a nested table may have been
    // opened in the table content line.
    (
        (sol (table_content_line / tplarg_or_template) optionalNewlines)*
        sol table_end_tag
    )+

// This rule assumes start-of-line position!
table_line
  = (! inline_breaks / & '{{!}}' )
    tl:(
        table_start_tag optionalNewlines
      / table_content_line optionalNewlines
      / table_end_tag
    ) {
        return $tl;
    }

table_content_line
  = (space / comment)* (
        table_heading_tags
      / table_row_tag
      / table_data_tags
      / table_caption_tag
    )

table_start_tag "table_start_tag"
  = sc:(space / comment)*
    startPos:("" { return $this->endOffset(); })
    b:"{" p:pipe
    // ok to normalize away stray |} on rt (see T59360)
    ta:(table_attributes / &{ $this->unreachable(); })
    tsEndPos:("" { return $this->endOffset(); })
    s2:space* {
        // Trailing comments are popped off the attribute list and re-emitted
        // after the table token; the tsr then ends where the comments began.
        $coms = TokenizerUtils::popComments( $ta );
        if ( $coms ) {
            $tsEndPos = $coms['commentStartPos'];
        }
        $da = (object)[ 'tsr' => new SourceRange( $startPos, $tsEndPos ) ];
        if ( $p !== '|' ) {
            // Variation from default
            $da->startTagSrc = $b . $p;
        }
        return array_merge(
            $sc,
            [ new TagTk( 'table', $ta, $da ) ],
            $coms ? $coms['buf'] : [],
            $s2 );
    }

// FIXME: Not sure if we want to support it, but this should allow columns.
// NOTE(review): the bare "!" below originally negated a (stripped) wikipeg
// parameter reference to avoid recursion via nested_block_in_table -- confirm.
table_caption_tag
  = // avoid recursion via nested_block_in_table
    ! p:pipe "+"
    args:row_syntax_table_args?
    tagEndPos:("" { return $this->endOffset(); })
    c:nested_block_in_table* {
        $tsr = new SourceRange( $this->startOffset(), $tagEndPos );
        return TokenizerUtils::buildTableTokens(
            'caption', '|+', $args, $tsr, $this->endOffset(), $c, true );
    }

// NOTE(review): same stripped-parameter concern on the bare "!" as above.
table_row_tag
  = // avoid recursion via nested_block_in_table
    ! p:pipe dashes:$"-"+
    a:(table_attributes / &{ $this->unreachable(); })
    tagEndPos:("" { return $this->endOffset(); }) {
        $coms = TokenizerUtils::popComments( $a );
        if ( $coms ) {
            $tagEndPos = $coms['commentStartPos'];
        }
        $da = (object)[
            'tsr' => new SourceRange( $this->startOffset(), $tagEndPos ),
            'startTagSrc' => $p . $dashes
        ];
        // We rely on our tree builder to close the row as needed. This is
        // needed to support building tables from fragment templates with
        // individual cells or rows.
        $trToken = new TagTk( 'tr', $a, $da );
        return array_merge( [ $trToken ], $coms ? $coms['buf'] : [] );
    }

// Zero or more additional cells on a data row ("||" or "|"-separated).
tds
  = (
        pp:( pipe_pipe / p:pipe & row_syntax_table_args { return $p; } )
        tdt:table_data_tag {
            // Avoid modifying cached dataAttribs object
            $tdt[0] = clone $tdt[0];
            $da = $tdt[0]->dataAttribs = clone $tdt[0]->dataAttribs;
            $da->tsr = clone $da->tsr;
            $da->stx = 'row';
            $da->tsr->start -= strlen( $pp ); // include "||"
            if ( $pp !== '||' || ( isset( $da->startTagSrc ) && $da->startTagSrc !== $pp ) ) {
                // Variation from default
                $da->startTagSrc = $pp .
                    ( isset( $da->startTagSrc ) ? substr( $da->startTagSrc, 1 ) : '' );
            }
            return $tdt;
        }
    )*

// NOTE(review): same stripped-parameter concern on the bare "!" as above.
table_data_tags
  = // avoid recursion via nested_block_in_table
    ! p:pipe
    ![+-]
    td:table_data_tag
    tagEndPos:("" { return $this->endOffset(); })
    tds:tds {
        // Avoid modifying a cached result
        $td[0] = clone $td[0];
        $da = $td[0]->dataAttribs = clone $td[0]->dataAttribs;
        $da->tsr = clone $da->tsr;
        $da->tsr->start -= strlen( $p ); // include "|"
        if ( $p !== '|' ) {
            // Variation from default
            $da->startTagSrc = $p;
        }
        return array_merge( $td, $tds );
    }

table_data_tag
  = ! "}"
    arg:row_syntax_table_args?
    // use inline_breaks to break on tr etc
    tagEndPos:("" { return $this->endOffset(); })
    td:nested_block_in_table* {
        $tsr = new SourceRange( $this->startOffset(), $tagEndPos );
        return TokenizerUtils::buildTableTokens(
            'td', '|', $arg, $tsr, $this->endOffset(), $td );
    }

table_heading_tags = table_heading_tags_parameterized<&th>

table_heading_tags_parameterized
  = "!"
    thTag:table_heading_tag
    thTags:( pp:("!!" / pipe_pipe) tht:table_heading_tag {
        // Avoid modifying a cached result
        $tht[0] = clone $tht[0];
        $da = $tht[0]->dataAttribs = clone $tht[0]->dataAttribs;
        $da->tsr = clone $da->tsr;
        $da->stx = 'row';
        $da->tsr->start -= strlen( $pp ); // include "!!" or "||"
        if ( $pp !== '!!' || ( isset( $da->startTagSrc ) && $da->startTagSrc !== $pp ) ) {
            // Variation from default
            $da->startTagSrc = $pp .
                ( isset( $da->startTagSrc ) ? substr( $da->startTagSrc, 1 ) : '' );
        }
        return $tht;
    } )* {
        $thTag[0] = clone $thTag[0];
        $da = $thTag[0]->dataAttribs = clone $thTag[0]->dataAttribs;
        $da->tsr = clone $da->tsr;
        $da->tsr->start--; // include "!"
        array_unshift( $thTags, $thTag );
        return $thTags;
    }

table_heading_tag
  = arg:row_syntax_table_args?
    tagEndPos:("" { return $this->endOffset(); })
    c:( th:<&th> d:nested_block_in_table {
        if ( $th !== false && strpos( $this->text(), "\n" ) !== false ) {
            // There's been a newline. Remove the break and continue
            // tokenizing nested_block_in_tables.
            $th = false;
        }
        return $d;
    } )* {
        $tsr = new SourceRange( $this->startOffset(), $tagEndPos );
        return TokenizerUtils::buildTableTokens(
            'th', '!', $arg, $tsr, $this->endOffset(), $c );
    }

table_end_tag
  = sc:(space / comment)*
    startPos:("" { return $this->endOffset(); })
    p:pipe b:"}" {
        $tblEnd = new EndTagTk( 'table', [], (object)[
            'tsr' => new SourceRange( $startPos, $this->endOffset() ),
        ] );
        if ( $p !== '|' ) {
            // p+"" is triggering some bug in pegJS
            // I cannot even use that expression in the comment!
            $tblEnd->dataAttribs->endTagSrc = $p . $b;
        }
        array_push( $sc, $tblEnd );
        return $sc;
    }

/**
 * Table parameters separated from the content by a single pipe. Does *not*
 * match if followed by double pipe (row-based syntax).
 */
row_syntax_table_args
  = as:table_attributes s:optional_spaces p:pipe !pipe {
        return [ $as, $s, $p ];
    }

/*******************************************************************
 * Text variants and other general rules
 *******************************************************************/

/* All chars that cannot start syntactic structures in the middle of a line
 * XXX: ] and other end delimiters should probably only be activated inside
 * structures to avoid unnecessarily leaving the text rule on plain
 * content.
 *
 * TODO: Much of this is should really be context-dependent (syntactic
 * flags). The wikilink_preprocessor_text rule is an example where
 * text_char is not quite right and had to be augmented. Try to minimize /
 * clarify this carefully!
 *
 * This character class is inlined into urltext. Changes here may also need to
 * be reflected there.
 */
text_char = [^-'<[{\n\r:;\]}|!=]

/* Legend
 * ' quotes (italic/bold)
 * < start of xmlish_tag
 * [ start of links
 * { start of parser functions, transclusion and template args
 * \n all sort of block-level markup at start of line
 * \r ditto
 * A-Za-z autolinks (http(s), nttp(s), mailto, ISBN, PMID, RFC)
 *
 * _ behavior switches (e.g., '__NOTOC__') (XXX: not URL related)
 * ! and | table cell delimiters, might be better to specialize those
 * = headings - also specialize those!
 *
 * The following chars are also included for now, but only apply in some
 * contexts and should probably be enabled only in those:
 * : separate definition in ; term : definition
 * ] end of link
 * } end of parser func/transclusion/template arg
 * - start of lang_variant -{ ... }-
 * ; separator in lang_variant
 */
urltext
  = ( & [A-Za-z] al:autolink { return $al; }
    / & "&" he:htmlentity { return $he; }
    / & ('__') bs:behavior_switch { return $bs; }
    // About 96% of text_char calls originated here, so inline it for efficiency
    / [^-'<[{\n\r:;\]}|!=]
  )+

raw_htmlentity
  = m:$("&" [#0-9a-zA-Z]+ ";") {
        return Utils::decodeWtEntities( $m );
    }

htmlentity
  = cc:raw_htmlentity {
        // if this is an invalid entity, don't tag it with 'mw:Entity'
        if ( mb_strlen( $cc ) > 1 /* decoded entity would be 1 character */ ) {
            return $cc;
        }
        return [
            // If this changes, the nowiki extension's toDOM will need to follow suit
            new TagTk( 'span',
                [ new KV( 'typeof', 'mw:Entity' ) ],
                (object)[ 'src' => $this->text(), 'srcContent' => $cc, 'tsr' => $this->tsrOffsets( 'start' ) ] ),
            $cc,
            new EndTagTk( 'span', [], (object)[ 'tsr' => $this->tsrOffsets( 'end' ) ] )
        ];
    }

spaces = $[ \t]+
optional_spaces = $[ \t]*
space = [ \t]

optionalSpaceToken
  = s:optional_spaces {
        if ( $s !== '' ) {
            return [ $s ];
        } else {
            return [];
        }
    }

/* This rule corresponds to \s in the PHP preg_* functions,
 * which is used frequently in the old parser. The inclusion of
 * form feed (but not other whitespace, like vertical tab) is a quirk
 * of Perl, which PHP inherited via the PCRE (Perl-Compatible Regular
 * Expressions) library.
 */
space_or_newline = [ \t\n\r\x0c]

/* This rule corresponds to \b in the PHP preg_* functions,
 * after a word character. That is, it's a zero-width lookahead that
 * the next character is not a word character.
 */
end_of_word = eof / ![A-Za-z0-9_]

// Unicode "separator, space" category. It covers the \u0020 space as well
// as \u3000 IDEOGRAPHIC SPACE (see bug 19052). In PHP this is \p{Zs}.
// Keep this up-to-date with the characters tagged ;Zs; in
// http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
unispace = [ \u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]

// Non-newline whitespace, including non-breaking spaces. Used for magic links.
space_or_nbsp
  = space // includes \t
  / unispace
  / & "&" he:htmlentity &{ return is_array( $he ) && $he[ 1 ] === "\u{A0}"; } {
        return $he;
    }

// Used within ISBN magic links
space_or_nbsp_or_dash = space_or_nbsp / "-"

// Extra newlines followed by at least another newline. Usually used to
// compress surplus newlines into a meta tag, so that they don't trigger
// paragraphs.
optionalNewlines
  = spc:$([\n\r\t ] &[\n\r])* {
        if ( strlen( $spc ) ) {
            return [ $spc ];
        } else {
            return [];
        }
    }

comment_or_includes = (comment / include_limits)*

// Start-of-line context: either an empty line (with optional comments) or a
// plain sol prefix, followed by any comments / include-limit tags.
sol = (empty_line_with_comments / sol_prefix) comment_or_includes

sol_prefix
  = newlineToken
  / & {
        // Use the sol flag only at the start of the input
        // Flag should always be an actual boolean (not falsy or undefined)
        $this->assert( is_bool( $this->options['sol'] ), 'sol should be boolean' );
        return $this->endOffset() === 0 && $this->options['sol'];
    } {
        return [];
    }

// A line that contains only comments (and whitespace); emitted as a
// mw:EmptyLine meta token so it doesn't trigger a paragraph.
empty_line_with_comments
  = sp:sol_prefix
    p:("" { return $this->endOffset(); })
    c:(space* comment (space / comment)* newline)+ {
        return [
            $sp,
            new SelfclosingTagTk( 'meta',
                [ new KV( 'typeof', 'mw:EmptyLine' ) ],
                (object)[
                    'tokens' => TokenizerUtils::flattenIfArray( $c ),
                    'tsr' => new SourceRange( $p, $this->endOffset() ),
                ] )
        ];
    }

comment_space = comment / space
nl_comment_space = newlineToken / comment_space

/**
 * noinclude / includeonly / onlyinclude rules. These are normally
 * handled by the xmlish_tag rule, except where generic tags are not
 * allowed- for example in directives, which are allowed in various attribute
 * names and -values.
 *
 * Example test case:
 * {|
 * |-
 * foo
 *
 * |Hello
 * |}
 *
 * NOTE(review): the include tags that originally wrapped parts of this
 * example appear to have been stripped from the comment -- confirm against
 * the original grammar.
 */
// NOTE(review): the dangling "sol_il:" label below looks like a stripped
// wikipeg parameter reference (<sol_il>) -- confirm.
include_limits
  = & ("<" "/"? n:("includeonly"i / "noinclude"i / "onlyinclude"i ) )
    il:xmlish_tag
    sol_il:
    & {
        $il = $il[0];
        $lname = mb_strtolower( $il->getName() );
        if ( !TokenizerUtils::isIncludeTag( $lname ) ) {
            return false;
        }
        // Preserve SOL where necessary (for onlyinclude and noinclude)
        // Note that this only works because we encounter <*include*> tags in
        // the toplevel content and we rely on the php preprocessor to expand
        // templates, so we shouldn't ever be tokenizing inInclude.
        // Last line should be empty (except for comments)
        if ( $lname !== 'includeonly' && $sol_il && $il instanceof TagTk ) {
            $dp = $il->dataAttribs;
            $inclContent = $dp->extTagOffsets->stripTags( $dp->src );
            $nlpos = strrpos( $inclContent, "\n" );
            $last = $nlpos === false ? $inclContent : substr( $inclContent, $nlpos + 1 );
            // Accept a (possibly empty) run of HTML comments on the last
            // line.  The previous pattern here, '/^()*$/D', had the
            // comment-matching group stripped out of it and so matched only
            // a completely empty last line, rejecting SOL preservation
            // whenever trailing comments were present.
            if ( !preg_match( '/^(<!--([^-]|-(?!->))*-->)*$/D', $last ) ) {
                return false;
            }
        }
        return true;
    } {
        return $il;
    }

// Start of file
sof = & { return $this->endOffset() === 0 && !$this->pipelineOffset; }

// End of file
eof = & { return $this->endOffset() === $this->inputLength; }

newline = '\n' / '\r\n'
newlineToken = newline { return [ new NlTk( $this->tsrOffsets() ) ]; }
eolf = newline / eof
comment_space_eolf = (space+ / comment)* eolf

// 'Preprocessor' directive- higher-level things that can occur in otherwise
// plain-text content.
directive
  = comment
  / extension_tag
  / tplarg_or_template
  / & "-{" v:lang_variant_or_tpl { return $v; }
  / & "&" e:htmlentity { return $e; }
  / include_limits

wikilink_preprocessor_text
  = r:( t:$[^<[{\n\r\t|!\]}{ &\-]+
      // XXX gwicke: any more chars we need to allow here?
      / !inline_breaks wr:( directive / $( !"]]" ( text_char / [!<\-\}\]\n\r] ) ) ) {
            return $wr;
        }
    )+ {
        return TokenizerUtils::flattenStringlist( $r );
    }

// added special separator character class inline: separates url from
// description / text
extlink_nonipv6url
  = // Prevent breaking on pipes when we're in a link description.
    // See the test, 'Images with the "|" character in the comment'.
    extlink_nonipv6url_parameterized

extlink_nonipv6url_parameterized
  = r:(
        $[^<[{\n\r|!\]}\-\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+
      / !inline_breaks s:( directive / [&|{\-!}=] ) { return $s; }
      / $(['] ![']) // single quotes are ok, double quotes are bad
    )+ {
        return TokenizerUtils::flattenString( $r );
    }

// Attribute values with preprocessor support
// n.b. / is a permissible char in the three rules below.
// We only break on />, enforced by the negated expression.
// Hence, it isn't included in the stop set.
// The stop set is space_or_newline and > which matches generic_att_value.
// Unquoted attribute value text (multi-line variant); stops before whitespace
// and ">", but allows templates/comments/entities via `directive`.
attribute_preprocessor_text
  = r:( $[^{}&<\-|/ \t\n\r\x0c>]+
      / !inline_breaks !'/>' s:( directive / less_than / [{}&\-|/] ) { return $s; }
    )+ {
        return TokenizerUtils::flattenString( $r );
    }

// The stop set is '> which matches generic_att_value.
attribute_preprocessor_text_single
  = r:( $[^{}&<\-|/'>]+
      / !inline_breaks !'/>' s:( directive / less_than / [{}&\-|/] ) { return $s; }
    )* {
        return TokenizerUtils::flattenString( $r );
    }

// The stop set is "> which matches generic_att_value.
attribute_preprocessor_text_double
  = r:( $[^{}&<\-|/">]+
      / !inline_breaks !'/>' s:( directive / less_than / [{}&\-|/] ) { return $s; }
    )* {
        return TokenizerUtils::flattenString( $r );
    }

// Variants with the entire attribute on a single line
// n.b. ! is a permissible char in the three rules below.
// We only break on !! in th, enforced by the inline break.
// Hence, it isn't included in the stop set.
// [ is also permissible but we give a chance to break
// for the [[ special case in the old parser's doTableStuff (See T2553).
// The stop set is space_or_newline and | which matches table_att_value.
table_attribute_preprocessor_text
  = r:( $[^{}&<\-!\[ \t\n\r\x0c|]+
      / !inline_breaks s:( directive / [{}&<\-!\[] ) { return $s; }
    )+ {
        return TokenizerUtils::flattenString( $r );
    }

// The stop set is '\r\n| which matches table_att_value.
table_attribute_preprocessor_text_single
  = r:( $[^{}&<\-!\['\r\n|]+
      / !inline_breaks s:( directive / [{}&<\-!\[] ) { return $s; }
    )* {
        return TokenizerUtils::flattenString( $r );
    }

// The stop set is "\r\n| which matches table_att_value.
table_attribute_preprocessor_text_double
  = r:( $[^{}&<\-!\["\r\n|]+
      / !inline_breaks s:( directive / [{}&<\-!\[] ) { return $s; }
    )* {
        return TokenizerUtils::flattenString( $r );
    }

// Special-case support for those pipe templates
pipe = "|" / "{{!}}"

// SSS FIXME: what about |{{!}} and {{!}}|
pipe_pipe = "||" / "{{!}}{{!}}"