/**
 * Usage, from an interactive PHP shell:
 *   >>> require('vendor/autoload.php');
 *   >>> RemexHtml\GenerateDataFiles::run()
 */
class GenerateDataFiles {
// XML namespace URIs. execute() passes each of these through var_export()
// when preparing its output.

/** Namespace URI for HTML (XHTML) elements */
private const NS_HTML = 'http://www.w3.org/1999/xhtml';
/** Namespace URI for MathML elements */
private const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
/** Namespace URI for SVG elements */
private const NS_SVG = 'http://www.w3.org/2000/svg';
/** Namespace URI for XLink attributes */
private const NS_XLINK = 'http://www.w3.org/1999/xlink';
/** Namespace URI bound to the "xml" prefix */
private const NS_XML = 'http://www.w3.org/XML/1998/namespace';
/** Namespace URI used for xmlns declarations */
private const NS_XMLNS = 'http://www.w3.org/2000/xmlns/';
/**
 * The only public entry point.
 *
 * Creates a generator instance and runs it. All of the actual work is done
 * by the private execute() method, which throws an \Exception when the
 * entities.json input file is missing.
 */
public static function run() {
	$instance = new self;
	// Without this call the instance is discarded and nothing is generated.
	$instance->execute();
}
/**
 * This is the character entity mapping table copied from
 * https://www.w3.org/TR/2014/REC-html5-20141028/syntax.html#tokenizing-character-references
 */
private static $legacyNumericEntityData = << 'address, applet, area, article, aside, base,
basefont, bgsound, blockquote, body, br, button, caption, center,
col, colgroup, dd, details, dir, div, dl, dt, embed, fieldset,
figcaption, figure, footer, form, frame, frameset, h1, h2, h3, h4,
h5, h6, head, header, hr, html, iframe, img, input, li, link,
listing, main, marquee, menu, menuitem, meta, nav, noembed,
noframes, noscript, object, ol, p, param, plaintext, pre, script,
section, select, source, style, summary, table, tbody, td, template,
textarea, tfoot, th, thead, title, tr, track, ul, wbr, xmp',
self::NS_MATHML => 'mi, mo, mn, ms, mtext, annotation-xml',
self::NS_SVG => 'foreignObject, desc, title',
// @codingStandardsIgnoreStart
/**
 * The NameStartChar production from XML 1.0, but with the colon excluded,
 * since there are many ways to break namespace validation, and we actually
 * need this for local names.
 */
private static $nameStartChar = '[A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]';
/**
 * The NameChar production from XML 1.0.
 *
 * The leading "NameStartChar" alternative is a nonterminal, not a literal:
 * execute() resolves it by passing $nameStartChar in the nonterminals map
 * given to makeConvTable().
 */
private static $nameChar = 'NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]';
// @codingStandardsIgnoreEnd
/**
 * Build a regex alternation from an array of ampersand-prefixed entity
 * names.
 *
 * The first character of every value is dropped (the "&" of an entity name),
 * the remainder is escaped for use inside a "~"-delimited pattern, and the
 * pieces are joined with "|". Each piece is preceded by a newline and two
 * tabs so the resulting pattern is laid out one alternative per line.
 *
 * NOTE(review): execute() also feeds self::$quirkyPublicPrefixes through
 * this method, so the first character of each prefix is dropped as well --
 * confirm that the prefix data accounts for this.
 *
 * @param string[] $array
 * @return string Empty string when $array is empty
 */
private function makeRegexAlternation( $array ) {
	$alternatives = [];
	foreach ( $array as $value ) {
		$withoutFirstChar = substr( $value, 1 );
		$alternatives[] = "\n\t\t" . preg_quote( $withoutFirstChar, '~' );
	}
	return implode( '|', $alternatives );
}
/**
 * Parse a character class written in the notation of the XML specification
 * into a list of code point ranges.
 *
 * The input is a list of alternatives separated by "|". Each alternative is
 * one of:
 *   - "c"            a single quoted character
 *   - [a-z]          a range between two literal characters
 *   - #xHHHH         a single code point in upper-case hexadecimal
 *   - [#xHH-#xHH]    a range between two hexadecimal code points
 *   - Name           a key of $nonterminals, expanded recursively
 *
 * Literal characters are converted with ord(), i.e. by their byte value.
 * The recursive expansion is done without the $nonterminals map, so a
 * nonterminal cannot itself refer to another nonterminal.
 *
 * @param string $input The character class expression
 * @param string[] $nonterminals Map of nonterminal name to its expression
 * @return int[][] List of [ first, last ] pairs ordered by first code point
 * @throws \Exception If an alternative matches none of the forms above
 */
private function getCharRanges( $input, $nonterminals = [] ) {
	$ranges = [];
	$alternatives = preg_split( '/\s*\|\s*/', $input );

	foreach ( $alternatives as $case ) {
		if ( preg_match( '/^"(.)"$/', $case, $m ) ) {
			// Single ASCII character
			$first = ord( $m[1] );
			$last = $first;
		} elseif ( preg_match( '/^\[(.)-(.)\]$/', $case, $m ) ) {
			// ASCII range
			$first = ord( $m[1] );
			$last = ord( $m[2] );
		} elseif ( preg_match( '/^#x([0-9A-F]+)$/', $case, $m ) ) {
			// Single encoded character
			$first = intval( $m[1], 16 );
			$last = $first;
		} elseif ( preg_match( '/^\[#x([0-9A-F]+)-#x([0-9A-F]+)\]$/', $case, $m ) ) {
			// Encoded range
			$first = intval( $m[1], 16 );
			$last = intval( $m[2], 16 );
		} elseif ( isset( $nonterminals[$case] ) ) {
			// Nonterminal: splice in the ranges of the referenced expression
			foreach ( $this->getCharRanges( $nonterminals[$case] ) as $nested ) {
				$ranges[] = $nested;
			}
			continue;
		} else {
			throw new \Exception( "Invalid XML char case \"$case\"" );
		}
		$ranges[] = [ $first, $last ];
	}

	// Order by the first code point of each range
	usort( $ranges, function ( $left, $right ) {
		return $left[0] <=> $right[0];
	} );

	return $ranges;
}
/**
 * Build a flat table describing every code point in 0..0x10FFFF that is NOT
 * matched by the given character class.
 *
 * The ranges returned by getCharRanges() are merged where they touch or
 * overlap, and the gaps between them are emitted. Each gap contributes four
 * consecutive integers to the result: first code point, last code point, 0
 * and 0xffffff.
 *
 * NOTE(review): the four-integer layout looks like the convmap format of
 * mb_encode_numericentity() (start, end, offset, mask) -- confirm against
 * the code that consumes the generated table.
 *
 * @param string $input Character class expression, see getCharRanges()
 * @param string[] $nonterminals Map of nonterminal name to its expression
 * @return int[] Flat list whose length is a multiple of four
 * @throws \Exception If $input cannot be parsed
 */
private function makeConvTable( $input, $nonterminals = [] ) {
	// Sorted by first code point, which the merge below relies on
	$ranges = $this->getCharRanges( $input, $nonterminals );
	$numRanges = count( $ranges );

	// Invert the ranges, produce a set complement
	$lastEndPlusOne = 0;
	$table = [];
	for ( $i = 0; $i < $numRanges; $i++ ) {
		$start = $ranges[$i][0];
		$end = $ranges[$i][1];

		// Merge consecutive (adjacent or overlapping) ranges. Since the list
		// is sorted, the first range that leaves a gap ends the run.
		for ( $j = $i + 1; $j < $numRanges; $j++ ) {
			if ( $ranges[$j][0] > $end + 1 ) {
				break;
			}
			$end = max( $end, $ranges[$j][1] );
			$i = $j;
		}

		// Emit the gap before this run; there is none when the run starts
		// at code point 0.
		if ( $start > $lastEndPlusOne ) {
			$table[] = $lastEndPlusOne;
			$table[] = $start - 1;
			$table[] = 0;
			$table[] = 0xffffff;
		}

		$lastEndPlusOne = $end + 1;
	}

	// Last range: everything after the final run, unless that run already
	// reaches the end of the Unicode code space.
	if ( $lastEndPlusOne <= 0x10ffff ) {
		$table[] = $lastEndPlusOne;
		$table[] = 0x10ffff;
		$table[] = 0;
		$table[] = 0xffffff;
	}

	return $table;
}
/**
 * Format a flat conversion table as bracketed, comma-separated text.
 *
 * The table is split into groups of four values. Each group is written on
 * its own line (newline plus two tabs) with its values separated by ", ",
 * and the groups themselves are separated by ",". The result starts with
 * "[" and ends with " ]".
 *
 * @param int[] $table Flat table as produced by makeConvTable()
 * @return string
 */
private function encodeConvTable( $table ) {
	$rows = [];
	foreach ( array_chunk( $table, 4 ) as $entry ) {
		$rows[] = implode( ', ', $entry );
	}
	$body = implode( ",\n\t\t", $rows );
	return "[\n\t\t" . $body . ' ]';
}
private function execute() {
$filename = __DIR__ . '/entities.json';
$entitiesJson = file_exists( $filename ) ?
file_get_contents( $filename ) : false;
if ( $entitiesJson === false ) {
throw new \Exception( "Please download entities.json from " .
"https://www.w3.org/TR/2016/REC-html51-20161101/entities.json" );
$entities = (array)json_decode( $entitiesJson );
$entityTranslations = [];
foreach ( $entities as $entity => $info ) {
$entityTranslations[substr( $entity, 1 )] = $info->characters;
// Sort descending by length
uksort( $entities, function ( $a, $b ) {
if ( strlen( $a ) > strlen( $b ) ) {
return -1;
} elseif ( strlen( $a ) < strlen( $b ) ) {
return 1;
} else {
return strcmp( $a, $b );
} );
$entityRegex = $this->makeRegexAlternation( array_keys( $entities ) );
$charRefRegex = str_replace(
'{{NAMED_ENTITY_REGEX}}', $entityRegex, Tokenizer::CHARREF_REGEX
$matches = [];
preg_match_all( '/^0x([0-9A-F]+)\s+U\+([0-9A-F]+)/m',
self::$legacyNumericEntityData, $matches, PREG_SET_ORDER );
$legacyNumericEntities = [];
foreach ( $matches as $match ) {
$legacyNumericEntities[ intval( $match[1], 16 ) ] =
\UtfNormal\Utils::codepointToUtf8( intval( $match[2], 16 ) );
$quirkyRegex =
'~' .
$this->makeRegexAlternation( self::$quirkyPublicPrefixes ) .
$nameStartCharConvTable = $this->makeConvTable( self::$nameStartChar );
$nameCharConvTable = $this->makeConvTable( self::$nameChar,
[ 'NameStartChar' => self::$nameStartChar ] );
$encEntityRegex = var_export( $entityRegex, true );
$encCharRefRegex = var_export( $charRefRegex, true );
$encTranslations = var_export( $entityTranslations, true );
$encLegacy = var_export( $legacyNumericEntities, true );
$encQuirkyRegex = var_export( $quirkyRegex, true );
$encNameStartCharConvTable = $this->encodeConvTable( $nameStartCharConvTable );
$encNameCharConvTable = $this->encodeConvTable( $nameCharConvTable );
$special = [];
foreach ( self::$special as $ns => $str ) {
foreach ( explode( ',', $str ) as $name ) {
$special[$ns][trim( $name )] = true;
$encSpecial = var_export( $special, true );
$nsHtml = var_export( self::NS_HTML, true );
$nsMathML = var_export( self::NS_MATHML, true );
$nsSvg = var_export( self::NS_SVG, true );
$nsXlink = var_export( self::NS_XLINK, true );
$nsXml = var_export( self::NS_XML, true );
$nsXmlNs = var_export( self::NS_XMLNS, true );
$fileContents = '<' . <<