X-Git-Url: https://scripts.mit.edu/gitweb/autoinstalls/wordpress.git/blobdiff_plain/fcaa67f093b5c83deea7a361a8cf8c6ac4e832d3..899389d1e4043331309c0433543419258b230b60:/wp-includes/compat.php?ds=sidebyside diff --git a/wp-includes/compat.php b/wp-includes/compat.php index 43667053..4317eb68 100644 --- a/wp-includes/compat.php +++ b/wp-includes/compat.php @@ -13,23 +13,85 @@ if ( !function_exists('_') ) { } } +/** + * Returns whether PCRE/u (PCRE_UTF8 modifier) is available for use. + * + * @ignore + * @since 4.2.2 + * @access private + * + * @param bool $set - Used for testing only + * null : default - get PCRE/u capability + * false : Used for testing - return false for future calls to this function + * 'reset': Used for testing - restore default behavior of this function + */ +function _wp_can_use_pcre_u( $set = null ) { + static $utf8_pcre = 'reset'; + + if ( null !== $set ) { + $utf8_pcre = $set; + } + + if ( 'reset' === $utf8_pcre ) { + $utf8_pcre = @preg_match( '/^./u', 'a' ); + } + + return $utf8_pcre; +} + if ( ! function_exists( 'mb_substr' ) ) : function mb_substr( $str, $start, $length = null, $encoding = null ) { return _mb_substr( $str, $start, $length, $encoding ); } endif; +/* + * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit. + * For $encoding === UTF-8, the $str input is expected to be a valid UTF-8 byte sequence. + * The behavior of this function for invalid inputs is undefined. + */ function _mb_substr( $str, $start, $length = null, $encoding = null ) { + if ( null === $encoding ) { + $encoding = get_option( 'blog_charset' ); + } + // The solution below works only for UTF-8, // so in case of a different charset just use built-in substr() - $charset = get_option( 'blog_charset' ); - if ( ! in_array( $charset, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) { + if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) { return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length ); } - // Use the regex unicode support to separate the UTF-8 characters into an array - preg_match_all( '/./us', $str, $match ); - $chars = is_null( $length ) ? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length ); - return implode( '', $chars ); + + if ( _wp_can_use_pcre_u() ) { + // Use the regex unicode support to separate the UTF-8 characters into an array + preg_match_all( '/./us', $str, $match ); + $chars = is_null( $length ) ? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length ); + return implode( '', $chars ); + } + + $regex = '/( + [\x00-\x7F] # single-byte sequences 0xxxxxxx + | [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx + | \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2 + | [\xE1-\xEC][\x80-\xBF]{2} + | \xED[\x80-\x9F][\x80-\xBF] + | [\xEE-\xEF][\x80-\xBF]{2} + | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 + | [\xF1-\xF3][\x80-\xBF]{3} + | \xF4[\x80-\x8F][\x80-\xBF]{2} + )/x'; + + $chars = array( '' ); // Start with 1 element instead of 0 since the first thing we do is pop + do { + // We had some string left over from the last round, but we counted it in that last round. + array_pop( $chars ); + + // Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string) + $pieces = preg_split( $regex, $str, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); + + $chars = array_merge( $chars, $pieces ); + } while ( count( $pieces ) > 1 && $str = array_pop( $pieces ) ); // If there's anything left over, repeat the loop. + + return join( '', array_slice( $chars, $start, $length ) ); } if ( ! function_exists( 'mb_strlen' ) ) : @@ -38,16 +100,54 @@ if ( ! function_exists( 'mb_strlen' ) ) : } endif; +/* + * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit. + * For $encoding === UTF-8, the $str input is expected to be a valid UTF-8 byte sequence. + * The behavior of this function for invalid inputs is undefined. + */ function _mb_strlen( $str, $encoding = null ) { + if ( null === $encoding ) { + $encoding = get_option( 'blog_charset' ); + } + // The solution below works only for UTF-8, // so in case of a different charset just use built-in strlen() - $charset = get_option( 'blog_charset' ); - if ( ! in_array( $charset, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) { + if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) { return strlen( $str ); } - // Use the regex unicode support to separate the UTF-8 characters into an array - preg_match_all( '/./us', $str, $match ); - return count( $match[0] ); + + if ( _wp_can_use_pcre_u() ) { + // Use the regex unicode support to separate the UTF-8 characters into an array + preg_match_all( '/./us', $str, $match ); + return count( $match[0] ); + } + + $regex = '/(?: + [\x00-\x7F] # single-byte sequences 0xxxxxxx + | [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx + | \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2 + | [\xE1-\xEC][\x80-\xBF]{2} + | \xED[\x80-\x9F][\x80-\xBF] + | [\xEE-\xEF][\x80-\xBF]{2} + | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 + | [\xF1-\xF3][\x80-\xBF]{3} + | \xF4[\x80-\x8F][\x80-\xBF]{2} + )/x'; + + $count = 1; // Start at 1 instead of 0 since the first thing we do is decrement + do { + // We had some string left over from the last round, but we counted it in that last round. + $count--; + + // Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string) + $pieces = preg_split( $regex, $str, 1000 ); + + // Increment + $count += count( $pieces ); + } while ( $str = array_pop( $pieces ) ); // If there's anything left over, repeat the loop. + + // Fencepost: preg_split() always returns one extra item in the array + return --$count; } if ( !function_exists('hash_hmac') ):