From b48a2bd84fbe7091260c4413e9afd388a4b85ff6 Mon Sep 17 00:00:00 2001 From: Kyle Banker Date: Thu, 24 Mar 2011 12:11:12 -0400 Subject: [PATCH] RUBY-253 fix UTF8 check for Ruby 1.9 --- ext/cbson/cbson.c | 18 +++--------------- lib/bson/bson_ruby.rb | 5 +++++ test/bson/bson_test.rb | 20 ++++++++++++++------ 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/ext/cbson/cbson.c b/ext/cbson/cbson.c index ef8c10c..5fe6dfe 100644 --- a/ext/cbson/cbson.c +++ b/ext/cbson/cbson.c @@ -107,21 +107,12 @@ static int max_bson_size; } \ _str; \ }) -/* MUST call TO_UTF8 before calling write_utf8. */ #define TO_UTF8(string) rb_str_export_to_enc((string), rb_utf8_encoding()) -static void write_utf8(buffer_t buffer, VALUE string, char check_null) { - result_t status = check_string(RSTRING_PTR(string), RSTRING_LENINT(string), - 0, check_null); - if (status == HAS_NULL) { - buffer_free(buffer); - rb_raise(InvalidDocument, "Key names / regex patterns must not contain the NULL byte"); - } - SAFE_WRITE(buffer, RSTRING_PTR(string), RSTRING_LENINT(string)); -} #else #define STR_NEW(p,n) rb_str_new((p), (n)) -/* MUST call TO_UTF8 before calling write_utf8. */ #define TO_UTF8(string) (string) +#endif + static void write_utf8(buffer_t buffer, VALUE string, char check_null) { result_t status = check_string(RSTRING_PTR(string), RSTRING_LEN(string), 1, check_null); @@ -132,9 +123,9 @@ static void write_utf8(buffer_t buffer, VALUE string, char check_null) { buffer_free(buffer); rb_raise(InvalidStringEncoding, "String not valid UTF-8"); } + string = TO_UTF8(string); SAFE_WRITE(buffer, RSTRING_PTR(string), RSTRING_LEN(string)); } -#endif // this sucks. but for some reason these moved around between 1.8 and 1.9 #ifdef ONIGURUMA_H @@ -211,7 +202,6 @@ static VALUE pack_extra(buffer_t buffer, VALUE check_keys) { static void write_name_and_type(buffer_t buffer, VALUE name, char type) { SAFE_WRITE(buffer, &type, 1); - name = TO_UTF8(name); write_utf8(buffer, name, 1); SAFE_WRITE(buffer, &zero, 1); } @@ -340,7 +330,6 @@ static int write_element(VALUE key, VALUE value, VALUE extra, int allow_id) { { int length; write_name_and_type(buffer, key, 0x02); - value = TO_UTF8(value); length = RSTRING_LENINT(value) + 1; SAFE_WRITE(buffer, (char*)&length, 4); write_utf8(buffer, value, 0); @@ -485,7 +474,6 @@ static int write_element(VALUE key, VALUE value, VALUE extra, int allow_id) { write_name_and_type(buffer, key, 0x0B); - pattern = TO_UTF8(pattern); write_utf8(buffer, pattern, 1); SAFE_WRITE(buffer, &zero, 1); diff --git a/lib/bson/bson_ruby.rb b/lib/bson/bson_ruby.rb index 3cfaae6..17d03a7 100644 --- a/lib/bson/bson_ruby.rb +++ b/lib/bson/bson_ruby.rb @@ -57,6 +57,11 @@ module BSON BINARY_ENCODING = Encoding.find('binary') def self.to_utf8_binary(str) + begin + str.unpack("U*") + rescue => ex + raise InvalidStringEncoding, "String not valid utf-8: #{str.inspect}" + end str.encode(UTF8_ENCODING).force_encoding(BINARY_ENCODING) end else diff --git a/test/bson/bson_test.rb b/test/bson/bson_test.rb index bad4304..f178442 100644 --- a/test/bson/bson_test.rb +++ b/test/bson/bson_test.rb @@ -119,15 +119,23 @@ class BSONTest < Test::Unit::TestCase end else def test_non_utf8_string - bson = BSON::BSON_CODER.serialize({'str' => 'aé'.encode('iso-8859-1')}) - result = BSON::BSON_CODER.deserialize(bson)['str'] - assert_equal 'aé', result - assert_equal 'UTF-8', result.encoding.name + assert_raise BSON::InvalidStringEncoding do + BSON::BSON_CODER.serialize({'str' => 'aé'.encode('iso-8859-1')}) + end + end + + def test_invalid_utf8_string + str = "123\xD9" + assert !str.valid_encoding? + assert_raise BSON::InvalidStringEncoding do + BSON::BSON_CODER.serialize({'str' => str}) + end end def test_non_utf8_key - bson = BSON::BSON_CODER.serialize({'aé'.encode('iso-8859-1') => 'hello'}) - assert_equal 'hello', BSON::BSON_CODER.deserialize(bson)['aé'] + assert_raise BSON::InvalidStringEncoding do + BSON::BSON_CODER.serialize({'aé'.encode('iso-8859-1') => 'hello'}) + end end # Based on a test from sqlite3-ruby