>> require('vendor/autoload.php'); * >>> RemexHtml\GenerateDataFiles::run() */ class GenerateDataFiles { private const NS_HTML = 'http://www.w3.org/1999/xhtml'; private const NS_MATHML = 'http://www.w3.org/1998/Math/MathML'; private const NS_SVG = 'http://www.w3.org/2000/svg'; private const NS_XLINK = 'http://www.w3.org/1999/xlink'; private const NS_XML = 'http://www.w3.org/XML/1998/namespace'; private const NS_XMLNS = 'http://www.w3.org/2000/xmlns/'; /** * The only public entry point */ public static function run() { $instance = new self; $instance->execute(); } /** * This is the character entity mapping table copied from * https://www.w3.org/TR/2014/REC-html5-20141028/syntax.html#tokenizing-character-references */ private static $legacyNumericEntityData = << 'address, applet, area, article, aside, base, basefont, bgsound, blockquote, body, br, button, caption, center, col, colgroup, dd, details, dir, div, dl, dt, embed, fieldset, figcaption, figure, footer, form, frame, frameset, h1, h2, h3, h4, h5, h6, head, header, hr, html, iframe, img, input, li, link, listing, main, marquee, menu, menuitem, meta, nav, noembed, noframes, noscript, object, ol, p, param, plaintext, pre, script, section, select, source, style, summary, table, tbody, td, template, textarea, tfoot, th, thead, title, tr, track, ul, wbr, xmp', self::NS_MATHML => 'mi, mo, mn, ms, mtext, annotation-xml', self::NS_SVG => 'foreignObject, desc, title', ]; // @codingStandardsIgnoreStart /** * The NameStartChar production from XML 1.0, but with colon excluded since * there's a lot of ways to break namespace validation, and we actually need * this for local names */ private static $nameStartChar = '[A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]'; /** The NameChar production from XML 1.0 */ private static $nameChar = 'NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]'; // @codingStandardsIgnoreEnd /** * Build a regex alternation from an array of ampersand-prefixed entity * names. * @param string[] $array * @return string */ private function makeRegexAlternation( $array ) { $regex = ''; foreach ( $array as $value ) { if ( $regex !== '' ) { $regex .= '|'; } $regex .= "\n\t\t" . preg_quote( substr( $value, 1 ), '~' ); } return $regex; } private function getCharRanges( $input, $nonterminals = [] ) { $ranges = []; foreach ( preg_split( '/\s*\|\s*/', $input ) as $case ) { if ( preg_match( '/^"(.)"$/', $case, $m ) ) { // Single ASCII character $ranges[] = [ ord( $m[1] ), ord( $m[1] ) ]; } elseif ( preg_match( '/^\[(.)-(.)\]$/', $case, $m ) ) { // ASCII range $ranges[] = [ ord( $m[1] ), ord( $m[2] ) ]; } elseif ( preg_match( '/^#x([0-9A-F]+)$/', $case, $m ) ) { // Single encoded character $codepoint = intval( $m[1], 16 ); $ranges[] = [ $codepoint, $codepoint ]; } elseif ( preg_match( '/^\[#x([0-9A-F]+)-#x([0-9A-F]+)\]$/', $case, $m ) ) { // Encoded range $ranges[] = [ intval( $m[1], 16 ), intval( $m[2], 16 ) ]; } elseif ( isset( $nonterminals[$case] ) ) { $ranges = array_merge( $ranges, $this->getCharRanges( $nonterminals[$case] ) ); } else { throw new \Exception( "Invalid XML char case \"$case\"" ); } } usort( $ranges, function ( $a, $b ) { return $a[0] - $b[0]; } ); return $ranges; } private function makeConvTable( $input, $nonterminals = [] ) { $ranges = $this->getCharRanges( $input, $nonterminals ); // Invert the ranges, produce a set complement $lastEndPlusOne = 0; $table = []; for ( $i = 0; $i < count( $ranges ); $i++ ) { $start = $ranges[$i][0]; $end = $ranges[$i][1]; // Merge consecutive ranges for ( $j = $i + 1; $j < count( $ranges ); $j++ ) { if ( $ranges[$j][0] === $end + 1 ) { $end = $ranges[$j][1]; $i = $j; } else { break; } } $table[] = $lastEndPlusOne; $table[] = $start - 1; $table[] = 0; $table[] = 0xffffff; $lastEndPlusOne = $end + 1; } // Last range $table[] = $lastEndPlusOne; $table[] = 0x10ffff; $table[] = 0; $table[] = 0xffffff; return $table; } private function encodeConvTable( $table ) { return "[\n\t\t" . implode( ",\n\t\t", array_map( function ( $a ) { return implode( ', ', $a ); }, array_chunk( $table, 4 ) ) ) . ' ]'; } private function execute() { $filename = __DIR__ . '/entities.json'; $entitiesJson = file_exists( $filename ) ? file_get_contents( $filename ) : false; if ( $entitiesJson === false ) { throw new \Exception( "Please download entities.json from " . "https://www.w3.org/TR/2016/REC-html51-20161101/entities.json" ); } $entities = (array)json_decode( $entitiesJson ); $entityTranslations = []; foreach ( $entities as $entity => $info ) { $entityTranslations[substr( $entity, 1 )] = $info->characters; } // Sort descending by length uksort( $entities, function ( $a, $b ) { if ( strlen( $a ) > strlen( $b ) ) { return -1; } elseif ( strlen( $a ) < strlen( $b ) ) { return 1; } else { return strcmp( $a, $b ); } } ); $entityRegex = $this->makeRegexAlternation( array_keys( $entities ) ); $charRefRegex = str_replace( '{{NAMED_ENTITY_REGEX}}', $entityRegex, Tokenizer::CHARREF_REGEX ); $matches = []; preg_match_all( '/^0x([0-9A-F]+)\s+U\+([0-9A-F]+)/m', self::$legacyNumericEntityData, $matches, PREG_SET_ORDER ); $legacyNumericEntities = []; foreach ( $matches as $match ) { $legacyNumericEntities[ intval( $match[1], 16 ) ] = \UtfNormal\Utils::codepointToUtf8( intval( $match[2], 16 ) ); } $quirkyRegex = '~' . $this->makeRegexAlternation( self::$quirkyPublicPrefixes ) . '~xAi'; $nameStartCharConvTable = $this->makeConvTable( self::$nameStartChar ); $nameCharConvTable = $this->makeConvTable( self::$nameChar, [ 'NameStartChar' => self::$nameStartChar ] ); $encEntityRegex = var_export( $entityRegex, true ); $encCharRefRegex = var_export( $charRefRegex, true ); $encTranslations = var_export( $entityTranslations, true ); $encLegacy = var_export( $legacyNumericEntities, true ); $encQuirkyRegex = var_export( $quirkyRegex, true ); $encNameStartCharConvTable = $this->encodeConvTable( $nameStartCharConvTable ); $encNameCharConvTable = $this->encodeConvTable( $nameCharConvTable ); $special = []; foreach ( self::$special as $ns => $str ) { foreach ( explode( ',', $str ) as $name ) { $special[$ns][trim( $name )] = true; } } $encSpecial = var_export( $special, true ); $nsHtml = var_export( self::NS_HTML, true ); $nsMathML = var_export( self::NS_MATHML, true ); $nsSvg = var_export( self::NS_SVG, true ); $nsXlink = var_export( self::NS_XLINK, true ); $nsXml = var_export( self::NS_XML, true ); $nsXmlNs = var_export( self::NS_XMLNS, true ); $fileContents = '<' . <<