diff --git a/activesupport/lib/active_support/multibyte.rb b/activesupport/lib/active_support/multibyte.rb
index 65a96af..b6354ee 100644
--- a/activesupport/lib/active_support/multibyte.rb
+++ b/activesupport/lib/active_support/multibyte.rb
@@ -1,9 +1,5 @@
 # encoding: utf-8
 
-require 'active_support/multibyte/chars'
-require 'active_support/multibyte/exceptions'
-require 'active_support/multibyte/unicode_database'
-
 module ActiveSupport #:nodoc:
   module Multibyte
     # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more
@@ -27,7 +23,35 @@ module ActiveSupport #:nodoc:
     #
     # Example:
     #   ActiveSupport::Multibyte.proxy_class = CharsForUTF32
-    mattr_accessor :proxy_class
-    self.proxy_class = ActiveSupport::Multibyte::Chars
+    def self.proxy_class=(klass)
+      @proxy_class = klass
+    end
+
+    # Returns the current proxy class
+    def self.proxy_class
+      @proxy_class ||= ActiveSupport::Multibyte::Chars
+    end
+
+    # Regular expressions that describe valid byte sequences for a character
+    VALID_CHARACTER = {
+      # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
+      'UTF-8' => /\A(?:
+                  [\x00-\x7f]                                         |
+                  [\xc2-\xdf] [\x80-\xbf]                             |
+                  \xe0        [\xa0-\xbf] [\x80-\xbf]                 |
+                  [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]                 |
+                  \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf]     |
+                  [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf]     |
+                  \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf])\z /xn,
+      # Quick check for valid Shift-JIS characters, disregards the odd-even pairing
+      'Shift_JIS' => /\A(?:
+                  [\x00-\x7e \xa1-\xdf]                                     |
+                  [\x81-\x9f \xe0-\xef] [\x40-\x7e \x80-\x9e \x9f-\xfc])\z /xn
+    }
   end
 end
+
+require 'active_support/multibyte/chars'
+require 'active_support/multibyte/exceptions'
+require 'active_support/multibyte/unicode_database'
+require 'active_support/multibyte/utils'
diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb
index 3d392d2..16bc130 100644
--- a/activesupport/lib/active_support/multibyte/chars.rb
+++ b/activesupport/lib/active_support/multibyte/chars.rb
@@ -73,16 +73,7 @@ module ActiveSupport #:nodoc:
       UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/
       UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/
 
-      # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
-      UTF8_PAT = /\A(?:
-                 [\x00-\x7f]                                     |
-                 [\xc2-\xdf] [\x80-\xbf]                         |
-                 \xe0        [\xa0-\xbf] [\x80-\xbf]             |
-                 [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]             |
-                 \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
-                 [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
-                 \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
-                )*\z/xn
+      UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8']
 
       attr_reader :wrapped_string
       alias to_s wrapped_string
@@ -307,23 +298,23 @@ module ActiveSupport #:nodoc:
       def rstrip
         chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, ''))
       end
-      
+
       # Strips entire range of Unicode whitespace from the left of the string.
       def lstrip
         chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, ''))
       end
-      
+
       # Strips entire range of Unicode whitespace from the right and left of the string.
       def strip
         rstrip.lstrip
       end
-      
+
       # Returns the number of codepoints in the string
       def size
         self.class.u_unpack(@wrapped_string).size
       end
       alias_method :length, :size
-      
+
       # Reverses all characters in the string.
       #
       # Example:
@@ -331,7 +322,7 @@ module ActiveSupport #:nodoc:
       def reverse
         chars(self.class.u_unpack(@wrapped_string).reverse.pack('U*'))
       end
-      
+
       # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
       # character.
       #
@@ -646,7 +637,7 @@ module ActiveSupport #:nodoc:
           string.split(//u).map do |c|
             c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding)
 
-            if !UTF8_PAT.match(c)
+            if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c)
               n = c.unpack('C')[0]
               n < 128 ? n.chr :
               n < 160 ? [UCD.cp1252[n] || n].pack('U') :
diff --git a/activesupport/lib/active_support/multibyte/utils.rb b/activesupport/lib/active_support/multibyte/utils.rb
new file mode 100644
index 0000000..acef84d
--- /dev/null
+++ b/activesupport/lib/active_support/multibyte/utils.rb
@@ -0,0 +1,61 @@
+# encoding: utf-8
+
+module ActiveSupport #:nodoc:
+  module Multibyte #:nodoc:
+    if Kernel.const_defined?(:Encoding)
+      # Returns a regular expression that matches valid characters in the current encoding,
+      # or nil when VALID_CHARACTER has no entry for Encoding.default_internal
+      def self.valid_character
+        VALID_CHARACTER[Encoding.default_internal.to_s]
+      end
+    else
+      # Without Encoding support the current encoding is taken from $KCODE; returns nil
+      # for anything other than 'UTF8' and 'SJIS'
+      def self.valid_character
+        case $KCODE
+        when 'UTF8'
+          VALID_CHARACTER['UTF-8']
+        when 'SJIS'
+          VALID_CHARACTER['Shift_JIS']
+        end
+      end
+    end
+
+    if 'string'.respond_to?(:valid_encoding?)
+      # Verifies the encoding of a string
+      def self.verify(string)
+        string.valid_encoding?
+      end
+    else
+      # Verifies the encoding of a string by matching every character against
+      # valid_character; strings are considered valid when no expression is known
+      def self.verify(string)
+        expression = valid_character
+        return true unless expression
+        string.split(//).all? { |c| expression.match(c) }
+      end
+    end
+
+    # Verifies the encoding of the string and raises an exception when it's not valid
+    def self.verify!(string)
+      raise EncodingError, "Found characters with invalid encoding" unless verify(string)
+    end
+
+    if 'string'.respond_to?(:force_encoding)
+      # Removes all invalid characters from the string.
+      #
+      # Note: this method is a no-op in Ruby 1.9
+      def self.clean(string)
+        string
+      end
+    else
+      # Removes all characters that don't match valid_character; the string is returned
+      # untouched when no expression is known for the current encoding
+      def self.clean(string)
+        expression = valid_character
+        return string unless expression
+        string.split(//).select { |c| expression.match(c) }.join
+      end
+    end
+  end
+end
diff --git a/activesupport/test/multibyte_utils_test.rb b/activesupport/test/multibyte_utils_test.rb
new file mode 100644
index 0000000..d8ac5ff
--- /dev/null
+++ b/activesupport/test/multibyte_utils_test.rb
@@ -0,0 +1,143 @@
+# encoding: utf-8
+
+require 'abstract_unit'
+require 'multibyte_test_helpers'
+
+class MultibyteUtilsTest < ActiveSupport::TestCase
+  include MultibyteTestHelpers
+
+  test "valid_character returns an expression for the current encoding" do
+    with_encoding('None') do
+      assert_nil ActiveSupport::Multibyte.valid_character
+    end
+    with_encoding('UTF8') do
+      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'], ActiveSupport::Multibyte.valid_character
+    end
+    with_encoding('SJIS') do
+      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['Shift_JIS'], ActiveSupport::Multibyte.valid_character
+    end
+  end
+
+  test "verify verifies ASCII strings are properly encoded" do
+    with_encoding('None') do
+      examples.each do |example|
+        assert ActiveSupport::Multibyte.verify(example)
+      end
+    end
+  end
+
+  test "verify verifies UTF-8 strings are properly encoded" do
+    with_encoding('UTF8') do
+      assert ActiveSupport::Multibyte.verify(example('valid UTF-8'))
+      assert !ActiveSupport::Multibyte.verify(example('invalid UTF-8'))
+    end
+  end
+
+  test "verify verifies Shift-JIS strings are properly encoded" do
+    with_encoding('SJIS') do
+      assert ActiveSupport::Multibyte.verify(example('valid Shift-JIS'))
+      assert !ActiveSupport::Multibyte.verify(example('invalid Shift-JIS'))
+    end
+  end
+
+  test "verify! raises an exception when it finds an invalid character" do
+    with_encoding('UTF8') do
+      assert_raises(ActiveSupport::Multibyte::EncodingError) do
+        ActiveSupport::Multibyte.verify!(example('invalid UTF-8'))
+      end
+    end
+  end
+
+  test "verify! doesn't raise an exception when the encoding is valid" do
+    with_encoding('UTF8') do
+      assert_nothing_raised do
+        ActiveSupport::Multibyte.verify!(example('valid UTF-8'))
+      end
+    end
+  end
+
+  if RUBY_VERSION < '1.9'
+    test "clean leaves ASCII strings intact" do
+      with_encoding('None') do
+        [
+          'word', "\270\236\010\210\245"
+        ].each do |string|
+          assert_equal string, ActiveSupport::Multibyte.clean(string)
+        end
+      end
+    end
+
+    test "clean cleans invalid characters from UTF-8 encoded strings" do
+      with_encoding('UTF8') do
+        cleaned_utf8 = [8].pack('C*')
+        assert_equal example('valid UTF-8'), ActiveSupport::Multibyte.clean(example('valid UTF-8'))
+        assert_equal cleaned_utf8, ActiveSupport::Multibyte.clean(example('invalid UTF-8'))
+      end
+    end
+
+    test "clean cleans invalid characters from Shift-JIS encoded strings" do
+      with_encoding('SJIS') do
+        cleaned_sjis = [184, 0, 136, 165].pack('C*')
+        assert_equal example('valid Shift-JIS'), ActiveSupport::Multibyte.clean(example('valid Shift-JIS'))
+        assert_equal cleaned_sjis, ActiveSupport::Multibyte.clean(example('invalid Shift-JIS'))
+      end
+    end
+  else
+    test "clean is a no-op" do
+      with_encoding('UTF8') do
+        assert_equal example('invalid Shift-JIS'), ActiveSupport::Multibyte.clean(example('invalid Shift-JIS'))
+      end
+    end
+  end
+
+  private
+
+  STRINGS = {
+    'valid ASCII'       => [65, 83, 67, 73, 73].pack('C*'),
+    'invalid ASCII'     => [128].pack('C*'),
+    'valid UTF-8'       => [227, 129, 147, 227, 129, 171, 227, 129, 161, 227, 130, 143].pack('C*'),
+    'invalid UTF-8'     => [184, 158, 8, 136, 165].pack('C*'),
+    'valid Shift-JIS'   => [131, 122, 129, 91, 131, 128].pack('C*'),
+    'invalid Shift-JIS' => [184, 158, 8, 0, 255, 136, 165].pack('C*')
+  }
+
+  if Kernel.const_defined?(:Encoding)
+    # Examples are duplicated before being tagged so the shared STRINGS values are never mutated
+    def example(key)
+      STRINGS[key].dup.force_encoding(Encoding.default_internal)
+    end
+
+    def examples
+      STRINGS.values.map { |s| s.dup.force_encoding(Encoding.default_internal) }
+    end
+  else
+    def example(key)
+      STRINGS[key]
+    end
+
+    def examples
+      STRINGS.values
+    end
+  end
+
+  if 'string'.respond_to?(:encoding)
+    # Switches Encoding.default_internal for the duration of the block; the previous value is
+    # restored even when the block raises
+    def with_encoding(enc)
+      before = Encoding.default_internal
+
+      case enc
+      when 'UTF8'
+        Encoding.default_internal = Encoding::UTF_8
+      when 'SJIS'
+        Encoding.default_internal = Encoding::Shift_JIS
+      else
+        Encoding.default_internal = Encoding::BINARY
+      end
+      yield
+    ensure
+      Encoding.default_internal = before
+    end
+  else
+    alias with_encoding with_kcode
+  end
+end