]> scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blob - maintenance/language/generateNormalizerData.php
MediaWiki 1.17.0
[autoinstallsdev/mediawiki.git] / maintenance / language / generateNormalizerData.php
1 <?php
2
3 require_once( dirname( __FILE__ ) . '/../Maintenance.php' );
4
5 require_once( dirname( __FILE__ ) . '/../../includes/normal/UtfNormalUtil.php' );
6
/**
 * Maintenance script that generates the serialized normalizer data files
 * for Arabic and Malayalam.
 * For NFC see includes/normal.
 *
 * The Arabic table is derived from a local copy of UnicodeData.txt; the
 * Malayalam table is a fixed list embedded in this class. Both are written
 * to $IP/serialized/normalize-<lang>.ser.
 */
class GenerateNormalizerData extends Maintenance {
	/** Path of the UnicodeData.txt copy to read; set by execute(). */
	public $dataFile;

	/**
	 * Registers the optional --unicode-data-file option.
	 */
	public function __construct() {
		parent::__construct();
		$this->addOption( 'unicode-data-file', 'The local location of the data file ' .
			'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
	}

	/**
	 * Locates the Unicode data file and regenerates both normalizer tables.
	 *
	 * When --unicode-data-file is not given, UnicodeData.txt in the current
	 * working directory is used. The script exits with status 1 if the
	 * chosen file does not exist.
	 */
	public function execute() {
		if ( $this->hasOption( 'unicode-data-file' ) ) {
			$this->dataFile = $this->getOption( 'unicode-data-file' );
			$notFoundMessage = 'Unable to find the specified data file.';
		} else {
			$this->dataFile = 'UnicodeData.txt';
			$notFoundMessage = "Unable to find UnicodeData.txt. Please specify its location with --unicode-data-file=<FILE>";
		}

		if ( !file_exists( $this->dataFile ) ) {
			$this->error( $notFoundMessage );
			exit( 1 );
		}

		$this->generateArabic();
		$this->generateMalayalam();
	}

	/**
	 * Builds the Arabic table from the decompositions of the Arabic
	 * presentation forms (U+FB50..U+FDFF and U+FE70..U+FEFF) found in the
	 * data file, and writes it to normalize-ar.ser.
	 *
	 * Lines that cannot be parsed are reported through error() and skipped.
	 * The script exits with status 1 if the data file cannot be opened.
	 */
	function generateArabic() {
		$file = fopen( $this->dataFile, 'r' );
		if ( !$file ) {
			$this->error( 'Unable to open the data file.' );
			exit( 1 );
		}

		// For the file format, see http://www.unicode.org/reports/tr44/
		// UnicodeData.txt has 15 fields; fields 6, 7 and 8 all belong to
		// Numeric_Type/Numeric_Value, so each needs its own name here to keep
		// the names that follow aligned with their columns.
		$fieldNames = array(
			'Code',
			'Name',
			'General_Category',
			'Canonical_Combining_Class',
			'Bidi_Class',
			'Decomposition_Type_Mapping',
			'Numeric_Type_Value_6',
			'Numeric_Type_Value_7',
			'Numeric_Type_Value_8',
			'Bidi_Mirrored',
			'Unicode_1_Name',
			'ISO_Comment',
			'Simple_Uppercase_Mapping',
			'Simple_Lowercase_Mapping',
			'Simple_Titlecase_Mapping'
		);
		$fieldCount = count( $fieldNames );

		$pairs = array();

		$lineNum = 0;
		while ( false !== ( $line = fgets( $file ) ) ) {
			++$lineNum;

			# Strip comments
			$line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
			if ( $line === '' ) {
				continue;
			}

			# Split fields
			$numberedData = explode( ';', $line );
			if ( count( $numberedData ) < $fieldCount ) {
				// A truncated line would otherwise raise undefined-offset
				// notices below; report it explicitly instead.
				$this->error( "Unexpected number of fields on line $lineNum" );
				$this->error( $line );
				continue;
			}
			$data = array_combine( $fieldNames, array_slice( $numberedData, 0, $fieldCount ) );

			$code = hexdec( $data['Code'] );
			$isPresentationFormA = $code >= 0xFB50 && $code <= 0xFDFF;
			$isPresentationFormB = $code >= 0xFE70 && $code <= 0xFEFF;
			if ( !$isPresentationFormA && !$isPresentationFormB ) {
				continue;
			}

			if ( $data['Decomposition_Type_Mapping'] === '' ) {
				// No decomposition
				continue;
			}

			// Expected shape: "<tag> XXXX XXXX ..."; only the code point
			// sequence after the tag is used.
			if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/',
				$data['Decomposition_Type_Mapping'], $m ) )
			{
				$this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
				$this->error( $line );
				continue;
			}

			$source = hexSequenceToUtf8( $data['Code'] );
			$dest = hexSequenceToUtf8( $m[2] );
			$pairs[$source] = $dest;
		}

		fclose( $file );

		$this->writePairs( 'ar', $pairs );
	}

	/**
	 * Builds the Malayalam table from a fixed list of code point sequences
	 * and their single-code-point replacements, and writes it to
	 * normalize-ml.ser.
	 */
	function generateMalayalam() {
		$hexPairs = array(
			# From http://unicode.org/versions/Unicode5.1.0/#Malayalam_Chillu_Characters
			'0D23 0D4D 200D' => '0D7A',
			'0D28 0D4D 200D' => '0D7B',
			'0D30 0D4D 200D' => '0D7C',
			'0D32 0D4D 200D' => '0D7D',
			'0D33 0D4D 200D' => '0D7E',

			# From http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46413
			'0D15 0D4D 200D' => '0D7F',
		);

		$pairs = array();
		foreach ( $hexPairs as $hexSource => $hexDest ) {
			$source = hexSequenceToUtf8( $hexSource );
			$dest = hexSequenceToUtf8( $hexDest );
			$pairs[$source] = $dest;
		}

		$this->writePairs( 'ml', $pairs );
	}

	/**
	 * Serializes a replacement table to $IP/serialized/normalize-<code>.ser
	 * and prints how many pairs were written.
	 *
	 * Exits with status 1 if the file cannot be written, so that a failed
	 * write is never reported as success.
	 *
	 * @param $langCode String: language code used in the file name and message
	 * @param $pairs Array: map of source string => replacement string
	 */
	private function writePairs( $langCode, $pairs ) {
		global $IP;

		$path = "$IP/serialized/normalize-$langCode.ser";
		if ( file_put_contents( $path, serialize( $pairs ) ) === false ) {
			$this->error( "Unable to write $path" );
			exit( 1 );
		}
		echo "$langCode: " . count( $pairs ) . " pairs written.\n";
	}
}
134
// Name the maintenance class declared above, then include the file that
// RUN_MAINTENANCE_IF_MAIN points to.
// NOTE(review): RUN_MAINTENANCE_IF_MAIN is not defined in this file; presumably
// Maintenance.php defines it and the included file reads $maintClass to decide
// which class to run — confirm.
135 $maintClass = 'GenerateNormalizerData';
136 require_once( RUN_MAINTENANCE_IF_MAIN );
137