#!/usr/bin/php
<?php
/**
 * Generates normalization-data.lua (next to this script) from the UtfNormal
 * data tables.
 *
 * Usage: make-normalization-table.php [path-to-UtfNormal]
 *
 * When no path is given, a few conventional locations relative to this file
 * (and MW_INSTALL_PATH, if set) are probed for UtfNormalData.inc.
 *
 * NOTE(review): the text between the shebang and the first "> 1 )" was lost
 * in the copy under review. The open tag, the `use` statement, the
 * `$utfnormalDir = null` initialiser and the `count( $argv )` condition below
 * are reconstructed from how the rest of the script uses them; confirm them
 * against version control.
 */

// NOTE(review): presumed import; Validator is referenced unqualified below and
// the library's src/Validator.php is expected to declare UtfNormal\Validator.
use UtfNormal\Validator;

/**
 * Print an error message to STDERR and terminate with a non-zero exit status.
 *
 * die( "string" ) prints to stdout and exits with status 0, which hides
 * failures from calling scripts.
 *
 * @param string $message Message to print, including the trailing newline
 */
// @codingStandardsIgnoreLine MediaWiki.NamingConventions.PrefixedGlobalFunctions
function fatal( $message ) {
	fwrite( STDERR, $message );
	exit( 1 );
}

/**
 * Convert a UTF-8 string to Unicode code points.
 *
 * @param string $c UTF-8 encoded string
 * @param bool $firstOnly Whether to return only the first code point
 * @return int|int[] The first code point, or the 1-based list of all code
 *  points as returned by unpack()
 */
// @codingStandardsIgnoreLine MediaWiki.NamingConventions.PrefixedGlobalFunctions
function uord( $c, $firstOnly ) {
	$ret = unpack( 'N*', mb_convert_encoding( $c, 'UTF-32BE', 'UTF-8' ) );
	return $firstOnly ? $ret[1] : $ret;
}

/**
 * Write one "[0xKEY] = { 0xA, 0xB, ... }," line of a decomposition table.
 *
 * @param resource $fh Output file handle
 * @param string $char UTF-8 character being decomposed
 * @param string $decomposition UTF-8 string it decomposes to
 */
// @codingStandardsIgnoreLine MediaWiki.NamingConventions.PrefixedGlobalFunctions
function writeDecompEntry( $fh, $char, $decomposition ) {
	$codepoints = array_map(
		static function ( $cp ) {
			return sprintf( '0x%06x', $cp );
		},
		uord( $decomposition, false )
	);
	fprintf( $fh, "\t\t[0x%06x] = { %s },\n", uord( $char, true ), implode( ', ', $codepoints ) );
}

$utfnormalDir = null;
if ( count( $argv ) > 1 ) {
	$utfnormalDir = rtrim( $argv[1], '/' );
	if ( !is_dir( $utfnormalDir ) ) {
		fatal( "The specified UtfNormal directory '$utfnormalDir' does not exist\n" );
	}
	if ( file_exists( "$utfnormalDir/Validator.php" ) ) {
		// Probably ok
	} elseif ( file_exists( "$utfnormalDir/src/Validator.php" ) ) {
		// Add the 'src' dir
		$utfnormalDir = "$utfnormalDir/src";
	} else {
		fprintf(
			STDERR,
			"Warning: Supplied path \"%s\" does not seem to contain UtfNormal. Trying it anyway.\n",
			$utfnormalDir
		);
	}
} else {
	$trydirs = [
		// Checkouts of mediawiki/core and mediawiki/extensions in the same directory
		__DIR__ . '/../../../../../../../core/vendor/wikimedia/utfnormal/src',
		// Scribunto checked out inside the 'extensions' directory of mediawiki/core
		__DIR__ . '/../../../../../../../vendor/wikimedia/utfnormal/src',
	];
	$installPath = getenv( 'MW_INSTALL_PATH' );
	if ( $installPath ) {
		// An explicitly configured install path takes priority over the guesses
		array_unshift( $trydirs, $installPath . '/vendor/wikimedia/utfnormal/src' );
	}
	foreach ( $trydirs as $trydir ) {
		$trydir = realpath( $trydir );
		if ( $trydir !== false && is_dir( $trydir ) && file_exists( "$trydir/UtfNormalData.inc" ) ) {
			$utfnormalDir = $trydir;
			break;
		}
	}
	if ( !$utfnormalDir ) {
		fatal( "Cannot find UtfNormal. Please specify the path explicitly.\n" );
	}
}

// @phan-suppress-next-line SecurityCheck-XSS
echo "Loading UtfNormal from $utfnormalDir...\n";
// @phan-suppress-next-line SecurityCheck-OTHER
require_once "$utfnormalDir/Validator.php";
// @phan-suppress-next-line SecurityCheck-OTHER
require_once "$utfnormalDir/UtfNormalData.inc";
// @phan-suppress-next-line SecurityCheck-OTHER
require_once "$utfnormalDir/UtfNormalDataK.inc";

if ( !Validator::$utfCheckNFC ||
	!Validator::$utfCombiningClass ||
	!Validator::$utfCanonicalDecomp ||
	!Validator::$utfCanonicalComp ||
	!Validator::$utfCompatibilityDecomp
) {
	fatal( "UtfNormal data files did not contain needed data.\n" );
}

echo "Creating normalization table...\n";
$X = fopen( __DIR__ . '/normalization-data.lua', 'w' );
if ( !$X ) {
	fatal( "Failed to open normalization-data.lua\n" );
}
fprintf( $X, "-- This file is automatically generated by make-normalization-table.php\n" );
fprintf( $X, "local normal = {\n" );

fprintf( $X, "\t-- Characters that might change depending on the following combiner\n" );
fprintf( $X, "\t-- (minus any that are themselves combiners, those are added later)\n" );
fprintf( $X, "\tcheck = {\n" );
foreach ( Validator::$utfCheckNFC as $k => $v ) {
	if ( isset( Validator::$utfCombiningClass[$k] ) ) {
		// Skip, because it's in the other table already
		continue;
	}
	fprintf( $X, "\t\t[0x%06x] = 1,\n", uord( $k, true ) );
}
fprintf( $X, "\t},\n\n" );

fprintf( $X, "\t-- Combining characters, mapped to combining class\n" );
fprintf( $X, "\tcombclass = {\n" );
// Code points of all combining characters, used below to filter 'comp'
$comb = [];
foreach ( Validator::$utfCombiningClass as $k => $v ) {
	$cp = uord( $k, true );
	$comb[$cp] = 1;
	fprintf( $X, "\t\t[0x%06x] = %d,\n", $cp, $v );
}
fprintf( $X, "\t},\n\n" );

fprintf( $X, "\t-- Characters mapped to what they decompose to\n" );
fprintf( $X, "\t-- Note Hangul to Jamo is done separately below\n" );
fprintf( $X, "\tdecomp = {\n" );
foreach ( Validator::$utfCanonicalDecomp as $k => $v ) {
	writeDecompEntry( $X, $k, $v );
}
fprintf( $X, "\t},\n\n" );

fprintf( $X, "\tdecompK = {\n" );
foreach ( Validator::$utfCompatibilityDecomp as $k => $v ) {
	if ( isset( Validator::$utfCanonicalDecomp[$k] ) && Validator::$utfCanonicalDecomp[$k] === $v ) {
		// Skip duplicates; the generated Lua makes decompK fall back to decomp
		continue;
	}
	writeDecompEntry( $X, $k, $v );
}
fprintf( $X, "\t},\n\n" );

fprintf( $X, "\t-- Character-pairs mapped to what they compose to\n" );
fprintf( $X, "\t-- Note Jamo to Hangul is done separately below\n" );
// First code point => second code point => composed code point
$t = [];
foreach ( Validator::$utfCanonicalComp as $k => $v ) {
	$k = uord( $k, false );
	if ( count( $k ) == 1 ) {
		// No idea why these are in the file
		continue;
	}
	if ( isset( $comb[$k[1]] ) ) {
		// Non-starter, no idea why these are in the file either
		continue;
	}
	$t[$k[1]][$k[2]] = uord( $v, true );
}
fprintf( $X, "\tcomp = {\n" );
ksort( $t );
foreach ( $t as $k1 => $v1 ) {
	fprintf( $X, "\t\t[0x%06x] = {\n", $k1 );
	ksort( $v1 );
	foreach ( $v1 as $k2 => $v2 ) {
		if ( $k2 < 0 ) {
			// NOTE(review): nothing visible in this script stores a negative
			// key in $t, so this branch looks unreachable; kept as-is.
			fprintf( $X, "\t\t\t[-1] = 0x%06x,\n", $v2 );
		} else {
			fprintf( $X, "\t\t\t[0x%06x] = 0x%06x,\n", $k2, $v2 );
		}
	}
	fprintf( $X, "\t\t},\n" );
}
fprintf( $X, "\t},\n" );

fprintf( $X, "}\n" );

// Static Lua appended verbatim. A nowdoc is used so PHP never interpolates it.
// NOTE(review): the heredoc opener and the start of the first Lua statement
// (everything up to "if k >") were lost in the copy under review and are
// reconstructed here, as is the Lua line layout; confirm against version
// control.
fprintf( $X, "\n%s\n", <<<'LUA'
-- Handle Hangul to Jamo decomposition
setmetatable( normal.decomp, { __index = function ( _, k )
	if k >= 0xac00 and k <= 0xd7a3 then
		-- Decompose a Hangul syllable into Jamo
		k = k - 0xac00
		local ret = {
			0x1100 + math.floor( k / 588 ),
			0x1161 + math.floor( ( k % 588 ) / 28 )
		}
		if k % 28 ~= 0 then
			ret[3] = 0x11a7 + ( k % 28 )
		end
		return ret
	end
	return nil
end } )

-- Handle Jamo to Hangul composition
local jamo_l_v_mt = { __index = function ( t, k )
	if k >= 0x1161 and k <= 0x1175 then
		-- Jamo leading + Jamo vowel
		return t.base + 28 * ( k - 0x1161 )
	end
	return nil
end }
local hangul_jamo_mt = { __index = function ( t, k )
	if k >= 0x11a7 and k <= 0x11c2 then
		-- Hangul + jamo final
		return t.base + k - 0x11a7
	end
	return nil
end }
setmetatable( normal.comp, { __index = function ( t, k )
	if k >= 0x1100 and k <= 0x1112 then
		-- Jamo leading, return a second table that combines with a Jamo vowel
		local t2 = { base = 0xac00 + 588 * ( k - 0x1100 ) }
		setmetatable( t2, jamo_l_v_mt )
		t[k] = t2 -- cache it
		return t2
	elseif k >= 0xac00 and k <= 0xd7a3 and k % 28 == 16 then
		-- Hangul. "k % 28 == 16" picks out just the ones that are
		-- Jamo leading + vowel, no final. Return a second table that combines
		-- with a Jamo final.
		local t2 = { base = k }
		setmetatable( t2, hangul_jamo_mt )
		t[k] = t2 -- cache it
		return t2
	end
	return nil
end } )

-- Compatibility decomposition falls back to the normal decomposition
setmetatable( normal.decompK, { __index = normal.decomp } )

return normal
LUA
);

// Buffered write errors can surface only at close time, so check it.
if ( !fclose( $X ) ) {
	fatal( "Failed to write normalization-data.lua\n" );
}