]> scripts.mit.edu Git - autoinstalls/mediawiki.git/blob - includes/normal/UtfNormalTest2.php
MediaWiki 1.17.0
[autoinstalls/mediawiki.git] / includes / normal / UtfNormalTest2.php
1 #!/usr/bin/php
2 <?php
3 /**
4  * Other tests for the unicode normalization module
5  *
6  * @file
7  * @ingroup UtfNormal
8  */
9
// This is a command-line tool; refuse to run under any other SAPI.
if ( PHP_SAPI !== 'cli' ) {
        exit( "Run me from the command line please.\n" );
}

// Conformance data file, looked up relative to the current working directory.
// Source: http://unicode.org/Public/UNIDATA/NormalizationTest.txt
$file = 'NormalizationTest.txt';

// Marker that starts a comment on a data line.
define( 'COMMENT', '#' );

// Delimiter between the columns of a data line.
define( 'SEPARATOR', ';' );

// Read handle for the data file; fopen() yields false when it cannot be opened.
$f = fopen( $file, 'r' );
24
25 /**
26  * The following section will be used for testing different normalization methods.
27  * - Pure PHP
28      ~ no assertion errors
29      ~ 6.25 minutes
30
31  * - php_utfnormal.so or intl extension: both are wrappers around
32      libicu so we list the version of libicu when making the
33      comparison
34
35  * - libicu Ubuntu 3.8.1-3ubuntu1.1 php 5.2.6-3ubuntu4.5
36      ~ 2200 assertion errors
37      ~ 5 seconds
38          ~ output: http://paste2.org/p/921566
39
40  * - libicu Ubuntu 4.2.1-3 php 5.3.2-1ubuntu4.2
41      ~ 1384 assertion errors
42          ~ 15 seconds
43          ~ output: http://paste2.org/p/921435
44
45  * - libicu Debian 4.4.1-5 php 5.3.2-1ubuntu4.2
46      ~ no assertion errors
47          ~ 13 seconds
48
49  * - Tests comparing pure PHP output with libicu output were added
50      later and slow down the runtime.
51  */
52
53 require_once("./UtfNormal.php");
/**
 * Normalize a string to form C by delegating to UtfNormal::toNFC().
 *
 * @param string $c Text to normalize
 * @return mixed Whatever UtfNormal::toNFC() returns for $c
 */
function normalize_form_c( $c ) {
        $normalized = UtfNormal::toNFC( $c );
        return $normalized;
}

/**
 * Normalize a string to form D by delegating to UtfNormal::toNFD().
 *
 * @param string $c Text to normalize
 * @return mixed Whatever UtfNormal::toNFD() returns for $c
 */
function normalize_form_d( $c ) {
        $normalized = UtfNormal::toNFD( $c );
        return $normalized;
}

/**
 * Normalize a string to form KC by delegating to UtfNormal::toNFKC().
 *
 * @param string $c Text to normalize
 * @return mixed Whatever UtfNormal::toNFKC() returns for $c
 */
function normalize_form_kc( $c ) {
        $normalized = UtfNormal::toNFKC( $c );
        return $normalized;
}

/**
 * Normalize a string to form KD by delegating to UtfNormal::toNFKD().
 *
 * @param string $c Text to normalize
 * @return mixed Whatever UtfNormal::toNFKD() returns for $c
 */
function normalize_form_kd( $c ) {
        $normalized = UtfNormal::toNFKD( $c );
        return $normalized;
}
58
/**
 * Variants of the wrappers above that pass an extra "php" argument to the
 * UtfNormal methods.
 *
 * These are only useful if you've added a parameter to those methods that
 * forces the pure PHP implementation.  That code was not committed, since it
 * might slow down the UTF normalization code just for the sake of these
 * tests. -- hexmode
 *
 * NOTE(review): without such a local patch the extra argument presumably has
 * no effect, so each *_php wrapper returns the same value as its plain
 * counterpart -- confirm against UtfNormal.php.
 */
function normalize_form_c_php( $c ) {
        $normalized = UtfNormal::toNFC( $c, "php" );
        return $normalized;
}

function normalize_form_d_php( $c ) {
        $normalized = UtfNormal::toNFD( $c, "php" );
        return $normalized;
}

function normalize_form_kc_php( $c ) {
        $normalized = UtfNormal::toNFKC( $c, "php" );
        return $normalized;
}

function normalize_form_kd_php( $c ) {
        $normalized = UtfNormal::toNFKD( $c, "php" );
        return $normalized;
}
69
// Configure assert() so that a failed string assertion is reported through
// my_assert() instead of raising a PHP warning.
//
// ASSERT_QUIET_EVAL only exists before PHP 8.0; referencing it unguarded on a
// newer interpreter throws an Error and aborts the whole script.  PHP 8.0 is
// also the release that stopped evaluating string assertions, so this
// configuration is only meaningful (and only applied) where the constant is
// still defined.
if ( defined( 'ASSERT_QUIET_EVAL' ) ) {
        assert_options( ASSERT_ACTIVE, 1 );
        assert_options( ASSERT_WARNING, 0 );
        assert_options( ASSERT_QUIET_EVAL, 1 );
        assert_options( ASSERT_CALLBACK, 'my_assert' );
}

/**
 * Print a failure message for one failed check.
 *
 * Reads two globals maintained by the main loop: $lineNo (number of rows read
 * so far) and $col (the current row; index 5 is used as the label printed in
 * parentheses).
 *
 * @param string $file Script file of the failed assertion (unused)
 * @param int $line Script line of the failed assertion (unused)
 * @param string $code Text of the comparison that failed
 */
function my_assert( $file, $line, $code ) {
        global $col, $lineNo;

        // Do not raise a notice of our own if the row has no sixth entry.
        $label = ( is_array( $col ) && isset( $col[5] ) ) ? $col[5] : '';
        echo "Assertion that '$code' failed on line $lineNo ($label)\n";
}
79
/**
 * Compare two values and report a mismatch through my_assert().
 *
 * This takes the place of assert() with a string argument.  Such strings are
 * only evaluated as code before PHP 8.0; from 8.0 on the string is treated as
 * an ordinary (truthy) value, so every check would silently pass.
 *
 * @param string $code Text of the comparison, shown in the failure message
 * @param mixed $left Left-hand operand
 * @param mixed $right Right-hand operand
 * @return bool True when both operands are identical (===)
 */
function check_identical( $code, $left, $right ) {
        if ( $left === $right ) {
                return true;
        }
        my_assert( __FILE__, __LINE__, $code );
        return false;
}

$count = 0;
$lineNo = 0;

if ( $f === false ) {
        // Stop with a failing exit status instead of printing "done." without
        // having run a single test.
        fwrite( STDERR, "Could not open $file for reading.\n" );
        exit( 1 );
}

// Normalization form => array( default wrapper, wrapper passing "php" ).
// The order of the entries is the order in which mismatches are reported.
$wrappers = array(
        'NFC'  => array( 'normalize_form_c',  'normalize_form_c_php' ),
        'NFD'  => array( 'normalize_form_d',  'normalize_form_d_php' ),
        'NFKD' => array( 'normalize_form_kd', 'normalize_form_kd_php' ),
        'NFKC' => array( 'normalize_form_kc', 'normalize_form_kc_php' ),
);

// Each rule: array( form, index of the expected column, indexes of the
// source columns whose normalization must equal the expected column ).
$invariants = array(
        // c2 == NFC(c1) == NFC(c2) == NFC(c3)
        array( 'NFC', 1, array( 0, 1, 2 ) ),
        // c4 == NFC(c4) == NFC(c5)
        array( 'NFC', 3, array( 3, 4 ) ),
        // c3 == NFD(c1) == NFD(c2) == NFD(c3)
        array( 'NFD', 2, array( 0, 1, 2 ) ),
        // c5 == NFD(c4) == NFD(c5)
        array( 'NFD', 4, array( 3, 4 ) ),
        // c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
        array( 'NFKC', 3, array( 0, 1, 2, 3, 4 ) ),
        // c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
        array( 'NFKD', 4, array( 0, 1, 2, 3, 4 ) ),
);

while ( ( $col = getRow( $f ) ) !== false ) {
        $lineNo++;

        // Only rows with exactly six entries are treated as test cases;
        // everything else is skipped.
        if ( count( $col ) != 6 ) {
                continue;
        }

        $count++;
        if ( $count % 100 === 0 ) {
                echo "Count: $count\n";
        }

        // Verify that the pure PHP version is correct: both wrappers of a
        // form must agree on each of the five columns.
        $result = array();
        foreach ( $wrappers as $form => $pair ) {
                for ( $i = 0; $i < 5; $i++ ) {
                        // Label such as "$NFCc1", kept so that failure messages
                        // read the same as the assertion text used previously.
                        $name = '$' . $form . 'c' . ( $i + 1 );
                        $result[$form][$i] = call_user_func( $pair[0], $col[$i] );
                        $purePhp = call_user_func( $pair[1], $col[$i] );
                        check_identical( "$name === {$name}p", $result[$form][$i], $purePhp );
                }
        }

        // Verify the expected relations between the columns.
        foreach ( $invariants as $rule ) {
                list( $form, $expected, $sources ) = $rule;
                foreach ( $sources as $i ) {
                        check_identical(
                                '$col[' . $expected . '] === $' . $form . 'c' . ( $i + 1 ),
                                $col[$expected],
                                $result[$form][$i]
                        );
                }
        }
}

fclose( $f );
echo "done.\n";
192
/**
 * Encode a single Unicode code point as a UTF-8 byte string.
 *
 * Byte layout as described at http://en.wikipedia.org/wiki/UTF-8#Description
 *
 * @param int $c Code point
 * @return string|false The UTF-8 bytes, or false when $c cannot be encoded:
 *   negative, above 0x10FFFF, or a surrogate (0xD800-0xDFFF), which UTF-8
 *   does not allow.
 */
function unichr($c) {
        if ( $c < 0 || $c > 0x10FFFF || ( $c >= 0xD800 && $c <= 0xDFFF ) ) {
                return false;
        }

        if ( $c <= 0x7F ) {
                // One byte: plain ASCII.
                return chr( $c );
        }

        if ( $c <= 0x7FF ) {
                // Two bytes: 110xxxxx 10xxxxxx
                return chr( 0xC0 | ( $c >> 6 ) )
                        . chr( 0x80 | ( $c & 0x3F ) );
        }

        if ( $c <= 0xFFFF ) {
                // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
                return chr( 0xE0 | ( $c >> 12 ) )
                        . chr( 0x80 | ( ( $c >> 6 ) & 0x3F ) )
                        . chr( 0x80 | ( $c & 0x3F ) );
        }

        // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        return chr( 0xF0 | ( $c >> 18 ) )
                . chr( 0x80 | ( ( $c >> 12 ) & 0x3F ) )
                . chr( 0x80 | ( ( $c >> 6 ) & 0x3F ) )
                . chr( 0x80 | ( $c & 0x3F ) );
}
210
/**
 * Convert a whitespace-separated list of hexadecimal code points, such as
 * "0044 0307", into the corresponding UTF-8 string.
 *
 * Tokens are split on runs of whitespace and empty tokens are dropped, so
 * leading, trailing or doubled spaces (and an empty input) no longer turn
 * into U+0000 characters.
 *
 * @param string $c Hexadecimal code points separated by whitespace
 * @return string Concatenation of unichr() for every code point; a code
 *   point for which unichr() returns false contributes nothing
 */
function unistr($c) {
        $tokens = preg_split( '/\s+/', $c, -1, PREG_SPLIT_NO_EMPTY );
        if ( $tokens === false ) {
                // preg_split() failed; treat the input as containing no code points.
                $tokens = array();
        }
        return implode( "", array_map( "unichr", array_map( "hexdec", $tokens ) ) );
}
214
/**
 * Read the next line from the data file and split it into columns.
 *
 * @param resource $f Open file handle to read from
 * @return array|false
 *   - false at end of file (or when fgets() fails);
 *   - array( $line ) with the raw, right-trimmed line when the line starts
 *     with the COMMENT character;
 *   - otherwise one decoded string (see unistr()) per non-blank
 *     SEPARATOR-delimited column, followed by one final entry holding the
 *     comment text ("" when the line has no comment).
 */
function getRow( $f ) {
        $row = fgets( $f );
        if ( $row === false ) {
                return false;
        }
        $row = rtrim( $row );

        $pos = strpos( $row, COMMENT );
        if ( $pos === 0 ) {
                // The whole line is a comment.
                return array( $row );
        }

        $c = "";
        if ( $pos !== false ) {
                // Comments on data lines are expected to look like
                // "# (c1; c2; c3; c4; c5; ) NAME".  The sample text inside the
                // parentheses may itself contain ")", so the label starts after
                // the LAST ")" that follows the comment marker, not the first.
                $close = strrpos( $row, ")" );
                if ( $close !== false && $close > $pos ) {
                        // Skip the ")" and the character after it; the cast turns
                        // the false that older PHP versions return for an
                        // out-of-range start into "".
                        $c = (string)substr( $row, $close + 2 );
                } else {
                        $c = substr( $row, $pos );
                }
                $row = substr( $row, 0, $pos );
        }

        $ret = array();
        foreach ( explode( SEPARATOR, $row ) as $ent ) {
                // Skip blank columns, e.g. the one after the final separator.
                if ( trim( $ent ) !== "" ) {
                        $ret[] = unistr( $ent );
                }
        }
        $ret[] = $c;

        return $ret;
}