+ // Normalize as many pct-encoded sections as possible
+ $string = preg_replace_callback('/(?:%[A-Fa-f0-9]{2})+/', array(&$this, 'remove_iunreserved_percent_encoded'), $string);
+
+ // Replace invalid percent characters
+ $string = preg_replace('/%(?![A-Fa-f0-9]{2})/', '%25', $string);
+
+ // Add unreserved and % to $valid_chars (the latter is safe because all
+ // pct-encoded sections are now valid).
+ $valid_chars .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~%';
+
+ // Now replace any bytes that aren't allowed with their pct-encoded versions
+ $position = 0;
+ $strlen = strlen($string);
+ while (($position += strspn($string, $valid_chars, $position)) < $strlen)
+ {
+ $value = ord($string[$position]);
+
+ // Start position
+ $start = $position;
+
+ // By default we are valid
+ $valid = true;
+
+ // No one byte sequences are valid due to the while.
+ // Two byte sequence:
+ if (($value & 0xE0) === 0xC0)
+ {
+ $character = ($value & 0x1F) << 6;
+ $length = 2;
+ $remaining = 1;
+ }
+ // Three byte sequence:
+ elseif (($value & 0xF0) === 0xE0)
+ {
+ $character = ($value & 0x0F) << 12;
+ $length = 3;
+ $remaining = 2;
+ }
+ // Four byte sequence:
+ elseif (($value & 0xF8) === 0xF0)
+ {
+ $character = ($value & 0x07) << 18;
+ $length = 4;
+ $remaining = 3;
+ }
+ // Invalid byte:
+ else
+ {
+ $valid = false;
+ $length = 1;
+ $remaining = 0;
+ }
+
+ if ($remaining)
+ {
+ if ($position + $length <= $strlen)
+ {
+ for ($position++; $remaining; $position++)
+ {
+ $value = ord($string[$position]);
+
+ // Check that the byte is valid, then add it to the character:
+ if (($value & 0xC0) === 0x80)
+ {
+ $character |= ($value & 0x3F) << (--$remaining * 6);
+ }
+ // If it is invalid, count the sequence as invalid and reprocess the current byte:
+ else
+ {
+ $valid = false;
+ $position--;
+ break;
+ }
+ }
+ }
+ else
+ {
+ $position = $strlen - 1;
+ $valid = false;
+ }
+ }
+
+ // Percent encode anything invalid or not in ucschar
+ if (
+ // Invalid sequences
+ !$valid
+ // Non-shortest form sequences are invalid
+ || $length > 1 && $character <= 0x7F
+ || $length > 2 && $character <= 0x7FF
+ || $length > 3 && $character <= 0xFFFF
+ // Outside of range of ucschar codepoints
+ // Noncharacters
+ || ($character & 0xFFFE) === 0xFFFE
+ || $character >= 0xFDD0 && $character <= 0xFDEF
+ || (
+ // Everything else not in ucschar
+ $character > 0xD7FF && $character < 0xF900
+ || $character < 0xA0
+ || $character > 0xEFFFD
+ )
+ && (
+ // Everything not in iprivate, if it applies
+ !$iprivate
+ || $character < 0xE000
+ || $character > 0x10FFFD
+ )
+ )
+ {
+ // If we were a character, pretend we weren't, but rather an error.
+ if ($valid)
+ $position--;
+
+ for ($j = $start; $j <= $position; $j++)
+ {
+ $string = substr_replace($string, sprintf('%%%02X', ord($string[$j])), $j, 1);
+ $j += 2;
+ $position += 2;
+ $strlen += 2;
+ }
+ }
+ }
+