Wordpress 4.6
[autoinstalls/wordpress.git] / wp-includes / Requests / IDNAEncoder.php
1 <?php
2
3 /**
4  * IDNA URL encoder
5  *
6  * Note: Not fully compliant, as nameprep does nothing yet.
7  *
8  * @package Requests
9  * @subpackage Utilities
10  * @see https://tools.ietf.org/html/rfc3490 IDNA specification
11  * @see https://tools.ietf.org/html/rfc3492 Punycode/Bootstrap specification
12  */
class Requests_IDNAEncoder {
	/**
	 * ACE prefix used for IDNA
	 *
	 * @see https://tools.ietf.org/html/rfc3490#section-5
	 * @var string
	 */
	const ACE_PREFIX = 'xn--';

	/**#@+
	 * Bootstrap constant for Punycode
	 *
	 * @see https://tools.ietf.org/html/rfc3492#section-5
	 * @var int
	 */
	const BOOTSTRAP_BASE         = 36;
	const BOOTSTRAP_TMIN         = 1;
	const BOOTSTRAP_TMAX         = 26;
	const BOOTSTRAP_SKEW         = 38;
	const BOOTSTRAP_DAMP         = 700;
	const BOOTSTRAP_INITIAL_BIAS = 72;
	const BOOTSTRAP_INITIAL_N    = 128;
	/**#@-*/

	/**
	 * Encode a hostname using Punycode
	 *
	 * The hostname is split on "." and every label is converted on its own
	 * with {@see Requests_IDNAEncoder::to_ascii()}; the converted labels are
	 * then joined with "." again.
	 *
	 * @throws Requests_Exception If any label cannot be converted (see `to_ascii()`)
	 *
	 * @param string $string Hostname
	 * @return string Punycode-encoded hostname
	 */
	public static function encode($string) {
		$parts = explode('.', $string);

		// Write back by index rather than by reference, so no reference to
		// the last element is left dangling after the loop.
		foreach ($parts as $index => $label) {
			$parts[$index] = self::to_ascii($label);
		}

		return implode('.', $parts);
	}

	/**
	 * Convert a UTF-8 string to an ASCII string using Punycode
	 *
	 * Every returned string is shorter than 64 bytes (i.e. at most 63 bytes);
	 * anything longer is rejected with an exception.
	 *
	 * @throws Requests_Exception Provided string is 64 or more ASCII characters long (`idna.provided_too_long`)
	 * @throws Requests_Exception Prepared string is 64 or more ASCII characters long (`idna.prepared_too_long`)
	 * @throws Requests_Exception Provided string already begins with xn-- (`idna.provided_is_prefixed`)
	 * @throws Requests_Exception Encoded string is 64 or more ASCII characters long (`idna.encoded_too_long`)
	 * @throws Requests_Exception Input is not valid UTF-8 or contains a disallowed code point (`idna.invalidcodepoint`)
	 *
	 * @param string $string ASCII or UTF-8 string (a single label, at most 63 bytes once converted)
	 * @return string ASCII string
	 */
	public static function to_ascii($string) {
		// Step 1: Check if the string is already ASCII
		if (self::is_ascii($string)) {
			// Nothing to encode: go straight to the length check
			if (strlen($string) < 64) {
				return $string;
			}

			throw new Requests_Exception('Provided string is too long', 'idna.provided_too_long', $string);
		}

		// Step 2: nameprep
		$string = self::nameprep($string);

		// Step 3: UseSTD3ASCIIRules is false, continue
		// Step 4: Check if it's ASCII now
		if (self::is_ascii($string)) {
			// Nothing to encode: go straight to the length check
			if (strlen($string) < 64) {
				return $string;
			}

			throw new Requests_Exception('Prepared string is too long', 'idna.prepared_too_long', $string);
		}

		// Step 5: Check ACE prefix
		if (strpos($string, self::ACE_PREFIX) === 0) {
			throw new Requests_Exception('Provided string begins with ACE prefix', 'idna.provided_is_prefixed', $string);
		}

		// Step 6: Encode with Punycode
		$string = self::punycode_encode($string);

		// Step 7: Prepend ACE prefix
		$string = self::ACE_PREFIX . $string;

		// Step 8: Check size
		if (strlen($string) < 64) {
			return $string;
		}

		throw new Requests_Exception('Encoded string is too long', 'idna.encoded_too_long', $string);
	}

	/**
	 * Check whether a given string contains only ASCII characters
	 *
	 * @internal (Testing found regex was the fastest implementation)
	 *
	 * @param string $string
	 * @return bool Is the string ASCII-only?
	 */
	protected static function is_ascii($string) {
		// True unless the pattern finds a byte outside 0x00-0x7F
		return (preg_match('/(?:[^\x00-\x7F])/', $string) !== 1);
	}

	/**
	 * Prepare a string for use as an IDNA name
	 *
	 * Currently a no-op: the string is returned unchanged.
	 *
	 * @todo Implement this based on RFC 3491 and the newer 5891
	 * @param string $string
	 * @return string Prepared string
	 */
	protected static function nameprep($string) {
		return $string;
	}

	/**
	 * Convert a UTF-8 string to a UCS-4 codepoint array
	 *
	 * Based on Requests_IRI::replace_invalid_with_pct_encoding()
	 *
	 * Decoding is strict: a bad lead byte, a truncated or malformed
	 * multi-byte sequence, a non-shortest-form encoding, or a code point
	 * outside the accepted ranges all raise an exception.
	 *
	 * @throws Requests_Exception Invalid UTF-8 codepoint (`idna.invalidcodepoint`)
	 * @param string $input
	 * @return array Unicode code points
	 */
	protected static function utf8_to_codepoints($input) {
		$codepoints = array();

		// Get number of bytes
		$strlen = strlen($input);

		for ($position = 0; $position < $strlen; $position++) {
			$value = ord($input[$position]);

			// One byte sequence (high bit clear):
			if (($value & 0x80) === 0) {
				$character = $value;
				$length    = 1;
				$remaining = 0;
			}
			// Two byte sequence:
			elseif (($value & 0xE0) === 0xC0) {
				$character = ($value & 0x1F) << 6;
				$length    = 2;
				$remaining = 1;
			}
			// Three byte sequence:
			elseif (($value & 0xF0) === 0xE0) {
				$character = ($value & 0x0F) << 12;
				$length    = 3;
				$remaining = 2;
			}
			// Four byte sequence:
			elseif (($value & 0xF8) === 0xF0) {
				$character = ($value & 0x07) << 18;
				$length    = 4;
				$remaining = 3;
			}
			// Invalid byte:
			else {
				throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $value);
			}

			if ($remaining > 0) {
				// The sequence must not run past the end of the input
				if ($position + $length > $strlen) {
					throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
				}

				for ($position++; $remaining > 0; $position++) {
					$value = ord($input[$position]);

					// Every continuation byte must look like 10xxxxxx
					if (($value & 0xC0) !== 0x80) {
						throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
					}

					$character |= ($value & 0x3F) << (--$remaining * 6);
				}

				// Step back onto the last byte of the sequence; the outer
				// loop's increment moves on to the next character.
				$position--;
			}

			if (
				// Non-shortest form sequences are invalid
				   ($length > 1 && $character <= 0x7F)
				|| ($length > 2 && $character <= 0x7FF)
				|| ($length > 3 && $character <= 0xFFFF)
				// Outside of range of ucschar codepoints
				// Noncharacters
				|| ($character & 0xFFFE) === 0xFFFE
				|| ($character >= 0xFDD0 && $character <= 0xFDEF)
				|| (
					// Everything else not in ucschar
					   ($character > 0xD7FF && $character < 0xF900)
					|| $character < 0x20
					|| ($character > 0x7E && $character < 0xA0)
					|| $character > 0xEFFFD
				)
			) {
				throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
			}

			$codepoints[] = $character;
		}

		return $codepoints;
	}

	/**
	 * RFC3492-compliant encoder
	 *
	 * The returned string does not include the ACE prefix.
	 *
	 * @internal Pseudo-code from Section 6.3 is quoted in the comments next to relevant code
	 * @throws Requests_Exception On character outside of the domain (never happens with Punycode) (`idna.character_outside_domain`)
	 * @throws Requests_Exception Invalid UTF-8 codepoint (`idna.invalidcodepoint`)
	 *
	 * @param string $input UTF-8 encoded string to encode
	 * @return string Punycode-encoded string
	 */
	public static function punycode_encode($input) {
		$output = '';
		// let n = initial_n
		$n = self::BOOTSTRAP_INITIAL_N;
		// let delta = 0
		$delta = 0;
		// let bias = initial_bias
		$bias = self::BOOTSTRAP_INITIAL_BIAS;
		// let h = b = the number of basic code points in the input
		$h = $b = 0; // see loop
		// copy them to the output in order
		$codepoints = self::utf8_to_codepoints($input);
		$extended   = array();

		foreach ($codepoints as $char) {
			if ($char < 128) {
				// Character is valid ASCII
				// TODO: this should also check if it's valid for a URL
				$output .= chr($char);
				$h++;
			}
			// Check if the character is non-ASCII, but below initial n
			// This never occurs for Punycode, so ignore in coverage
			// @codeCoverageIgnoreStart
			elseif ($char < $n) {
				throw new Requests_Exception('Invalid character', 'idna.character_outside_domain', $char);
			}
			// @codeCoverageIgnoreEnd
			else {
				// Collected as array keys so each code point is kept once
				$extended[$char] = true;
			}
		}

		// Distinct non-basic code points, in ascending order
		$extended = array_keys($extended);
		sort($extended);
		$b = $h;

		// [copy them] followed by a delimiter if b > 0
		if (strlen($output) > 0) {
			$output .= '-';
		}

		// The list is not modified below, so count it only once
		$codepointcount = count($codepoints);

		// {if the input contains a non-basic code point < n then fail}
		// while h < length(input) do begin
		while ($h < $codepointcount) {
			// let m = the minimum code point >= n in the input
			$m = array_shift($extended);
			// let delta = delta + (m - n) * (h + 1), fail on overflow
			$delta += ($m - $n) * ($h + 1);
			// let n = m
			$n = $m;
			// for each code point c in the input (in order) do begin
			for ($num = 0; $num < $codepointcount; $num++) {
				$c = $codepoints[$num];
				// if c < n then increment delta, fail on overflow
				if ($c < $n) {
					$delta++;
				}
				// if c == n then begin
				elseif ($c === $n) {
					// let q = delta
					$q = $delta;
					// for k = base to infinity in steps of base do begin
					for ($k = self::BOOTSTRAP_BASE; ; $k += self::BOOTSTRAP_BASE) {
						// let t = tmin if k <= bias {+ tmin}, or
						//     tmax if k >= bias + tmax, or k - bias otherwise
						if ($k <= ($bias + self::BOOTSTRAP_TMIN)) {
							$t = self::BOOTSTRAP_TMIN;
						}
						elseif ($k >= ($bias + self::BOOTSTRAP_TMAX)) {
							$t = self::BOOTSTRAP_TMAX;
						}
						else {
							$t = $k - $bias;
						}
						// if q < t then break
						if ($q < $t) {
							break;
						}
						// output the code point for digit t + ((q - t) mod (base - t))
						$digit   = (int) ($t + (($q - $t) % (self::BOOTSTRAP_BASE - $t)));
						$output .= self::digit_to_char($digit);
						// let q = (q - t) div (base - t)
						// floor() returns a float; cast back so digits stay integers
						$q = (int) floor(($q - $t) / (self::BOOTSTRAP_BASE - $t));
					// end
					}
					// output the code point for digit q
					$output .= self::digit_to_char((int) $q);
					// let bias = adapt(delta, h + 1, test h equals b?)
					$bias = self::adapt($delta, $h + 1, $h === $b);
					// let delta = 0
					$delta = 0;
					// increment h
					$h++;
				// end
				}
			// end
			}
			// increment delta and n
			$delta++;
			$n++;
		// end
		}

		return $output;
	}

	/**
	 * Convert a digit to its respective character
	 *
	 * Digits 0-25 map to "a"-"z" and digits 26-35 map to "0"-"9".
	 *
	 * @see https://tools.ietf.org/html/rfc3492#section-5
	 * @throws Requests_Exception On invalid digit (`idna.invalid_digit`)
	 *
	 * @param int $digit Digit in the range 0-35
	 * @return string Single character corresponding to digit
	 */
	protected static function digit_to_char($digit) {
		// @codeCoverageIgnoreStart
		// As far as I know, this never happens, but still good to be sure.
		if ($digit < 0 || $digit > 35) {
			throw new Requests_Exception(sprintf('Invalid digit %d', $digit), 'idna.invalid_digit', $digit);
		}
		// @codeCoverageIgnoreEnd
		$digits = 'abcdefghijklmnopqrstuvwxyz0123456789';
		return substr($digits, $digit, 1);
	}

	/**
	 * Adapt the bias
	 *
	 * @see https://tools.ietf.org/html/rfc3492#section-6.1
	 * @param int $delta
	 * @param int $numpoints
	 * @param bool $firsttime
	 * @return int New bias
	 */
	protected static function adapt($delta, $numpoints, $firsttime) {
		// function adapt(delta,numpoints,firsttime):
		// if firsttime then let delta = delta div damp
		if ($firsttime) {
			$delta = floor($delta / self::BOOTSTRAP_DAMP);
		}
		// else let delta = delta div 2
		else {
			$delta = floor($delta / 2);
		}
		// let delta = delta + (delta div numpoints)
		$delta += floor($delta / $numpoints);
		// let k = 0
		$k = 0;
		// while delta > ((base - tmin) * tmax) div 2 do begin
		$max = floor(((self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN) * self::BOOTSTRAP_TMAX) / 2);
		while ($delta > $max) {
			// let delta = delta div (base - tmin)
			$delta = floor($delta / (self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN));
			// let k = k + base
			$k += self::BOOTSTRAP_BASE;
		// end
		}
		// return k + (((base - tmin + 1) * delta) div (delta + skew))
		// floor() yields a float; cast so the documented int is what callers get
		return (int) ($k + floor(((self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN + 1) * $delta) / ($delta + self::BOOTSTRAP_SKEW)));
	}
}