]> scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blob - includes/normal/UtfNormalUtil.php
MediaWiki 1.17.4
[autoinstallsdev/mediawiki.git] / includes / normal / UtfNormalUtil.php
1 <?php
2 /**
3  * Some of these functions are adapted from places in MediaWiki.
4  * Should probably merge them for consistency.
5  *
6  * Copyright © 2004 Brion Vibber <brion@pobox.com>
7  * http://www.mediawiki.org/
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License along
20  * with this program; if not, write to the Free Software Foundation, Inc.,
21  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22  * http://www.gnu.org/copyleft/gpl.html
23  *
24  * @file
25  * @ingroup UtfNormal
26  */
27
28 require_once dirname(__FILE__).'/UtfNormalDefines.php';
29
/**
 * Encode a single Unicode code point as its UTF-8 byte sequence.
 *
 * Values below 0x80 produce one byte, below 0x800 two bytes, below
 * 0x10000 three bytes and below 0x110000 four bytes. For anything at or
 * above 0x110000 a message is printed and the script is terminated.
 *
 * @param $codepoint Integer: code point to encode.
 * @return String: UTF-8 encoded character.
 * @public
 */
function codepointToUtf8( $codepoint ) {
	if ( $codepoint < 0x80 ) {
		# One byte: the value is used directly.
		return chr( $codepoint );
	}

	if ( $codepoint < 0x800 ) {
		# Two bytes: 110xxxxx 10xxxxxx
		$lead = chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0xc0 );
		$last = chr( ( $codepoint & 0x3f ) | 0x80 );
		return $lead . $last;
	}

	if ( $codepoint < 0x10000 ) {
		# Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
		$lead = chr( ( ( $codepoint >> 12 ) & 0x0f ) | 0xe0 );
		$mid = chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0x80 );
		$last = chr( ( $codepoint & 0x3f ) | 0x80 );
		return $lead . $mid . $last;
	}

	if ( $codepoint < 0x110000 ) {
		# Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		$lead = chr( ( ( $codepoint >> 18 ) & 0x07 ) | 0xf0 );
		$high = chr( ( ( $codepoint >> 12 ) & 0x3f ) | 0x80 );
		$mid = chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0x80 );
		$last = chr( ( $codepoint & 0x3f ) | 0x80 );
		return $lead . $high . $mid . $last;
	}

	# Nothing at or beyond 0x110000 can be encoded: report and abort.
	echo "Asked for code outside of range ($codepoint)\n";
	die( -1 );
}
53
/**
 * Take a series of space-separated hexadecimal numbers representing
 * Unicode code points and return a UTF-8 string composed of those
 * characters. Used by UTF-8 data generation and testing routines.
 *
 * Empty tokens (from an empty string, or from leading, trailing or
 * repeated spaces) are ignored, so an empty $sequence yields an empty
 * string.
 *
 * @param $sequence String: hexadecimal code points separated by spaces.
 * @return String: the concatenated UTF-8 encoding of each code point.
 * @private
 */
function hexSequenceToUtf8( $sequence ) {
	$utf = '';
	foreach ( explode( ' ', $sequence ) as $hex ) {
		# hexdec( '' ) is 0, so an empty token would otherwise be turned
		# into a stray U+0000 character in the output.
		if ( $hex === '' ) {
			continue;
		}
		$utf .= codepointToUtf8( hexdec( $hex ) );
	}
	return $utf;
}
71
/**
 * Take a UTF-8 string and return a space-separated series of hex
 * numbers representing Unicode code points. For debugging.
 *
 * Each code point is formatted with at least four lowercase hex digits.
 * An empty string, or a string that cannot be split as UTF-8, yields an
 * empty string.
 *
 * @param $str String: UTF-8 string.
 * @return String: hex code points separated by single spaces.
 * @private
 */
function utf8ToHexSequence( $str ) {
	# Split between code points instead of using preg_replace() with the
	# /e modifier: /e no longer exists in PHP 7, its implicit addslashes()
	# made a single quote come out as 0000, and '.' skipped newlines.
	$chars = preg_split( '//u', $str, -1, PREG_SPLIT_NO_EMPTY );
	if ( $chars === false ) {
		# preg_split() failed, e.g. on malformed UTF-8.
		return '';
	}

	$hex = array();
	foreach ( $chars as $char ) {
		$hex[] = sprintf( '%04x', utf8ToCodepoint( $char ) );
	}
	return implode( ' ', $hex );
}
85
/**
 * Determine the Unicode codepoint of a single-character UTF-8 sequence.
 * Does not check for invalid input data, beyond comparing the length
 * announced by the first byte with the actual byte length of $char.
 *
 * @param $char String: exactly one UTF-8 encoded character.
 * @return Integer: the code point, or false if $char is empty or its
 *  byte length does not match the length encoded in its first byte.
 * @public
 */
function utf8ToCodepoint( $char ) {
	# Nothing to decode; also avoids reading offset 0 of an empty string.
	if ( (string)$char === '' ) {
		return false;
	}

	# Find the length: count the leading 1 bits of the first byte
	$z = ord( $char[0] );
	if ( $z & 0x80 ) {
		$length = 0;
		while ( $z & 0x80 ) {
			$length++;
			$z <<= 1;
		}
	} else {
		$length = 1;
	}

	if ( $length != strlen( $char ) ) {
		return false;
	}
	if ( $length == 1 ) {
		return ord( $char );
	}

	# Mask off the length-determining bits and shift back to the original location
	$z &= 0xff;
	$z >>= $length;

	# Add in the free bits (low six) from subsequent bytes
	for ( $i = 1; $i < $length; $i++ ) {
		$z <<= 6;
		$z |= ord( $char[$i] ) & 0x3f;
	}

	return $z;
}
126
/**
 * Prepare a string so it can be placed between single quotes in PHP
 * source: every backslash and every single quote gets a backslash put
 * in front of it.
 *
 * @param $string String: text to escape.
 * @return String: the escaped text.
 * @public
 */
function escapeSingleString( $string ) {
	# strtr() with a map never rescans its own output, so the backslashes
	# inserted here are not escaped a second time.
	$replacements = array(
		'\\' => '\\\\',
		'\'' => '\\\'',
	);

	return strtr( $string, $replacements );
}