From 902c3120c8ab3f60c37921b04b33a84db7826002 Mon Sep 17 00:00:00 2001 From: Mike Dirolf Date: Fri, 4 Sep 2009 11:02:04 -0400 Subject: [PATCH] decode strings as UTF-8 when using C extension on 1.9 --- ext/cbson/cbson.c | 19 +++++++++++++------ ext/cbson/extconf.rb | 1 + test/test_db_api.rb | 20 ++++++++++++++++++++ 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/ext/cbson/cbson.c b/ext/cbson/cbson.c index 20e8bc2..0719bae 100644 --- a/ext/cbson/cbson.c +++ b/ext/cbson/cbson.c @@ -50,6 +50,13 @@ static VALUE RegexpOfHolding; static VALUE OrderedHash; static VALUE InvalidName; +#if HAVE_RUBY_ENCODING_H +#include "ruby/encoding.h" +#define STR_NEW(p,n) rb_enc_str_new((p), (n), rb_utf8_encoding()) +#else +#define STR_NEW(p,n) rb_str_new((p), (n)) +#endif + // this sucks. but for some reason these moved around between 1.8 and 1.9 #ifdef ONIGURUMA_H #define IGNORECASE ONIG_OPTION_IGNORECASE @@ -495,7 +502,7 @@ static VALUE get_value(const char* buffer, int* position, int type) { int value_length; *position += 4; value_length = strlen(buffer + *position); - value = rb_str_new(buffer+ *position, value_length); + value = STR_NEW(buffer + *position, value_length); *position += value_length + 1; break; } @@ -509,7 +516,7 @@ static VALUE get_value(const char* buffer, int* position, int type) { int collection_length = strlen(buffer + offset); char id_type; - argv[0] = rb_str_new(buffer + offset, collection_length); + argv[0] = STR_NEW(buffer + offset, collection_length); offset += collection_length + 1; id_type = buffer[offset]; offset += 5; @@ -599,7 +606,7 @@ static VALUE get_value(const char* buffer, int* position, int type) { case 11: { int pattern_length = strlen(buffer + *position); - VALUE pattern = rb_str_new(buffer + *position, pattern_length); + VALUE pattern = STR_NEW(buffer + *position, pattern_length); int flags_length, flags = 0, i = 0; char extra[10]; VALUE argv[3]; @@ -635,7 +642,7 @@ static VALUE get_value(const char* buffer, int* position, int type) { VALUE collection, str, oid, id, argv[2]; *position += 4; collection_length = strlen(buffer + *position); - collection = rb_str_new(buffer + *position, collection_length); + collection = STR_NEW(buffer + *position, collection_length); *position += collection_length + 1; str = rb_str_new(buffer + *position, 12); @@ -662,7 +669,7 @@ static VALUE get_value(const char* buffer, int* position, int type) { VALUE code, scope, argv[2]; *position += 8; code_length = strlen(buffer + *position); - code = rb_str_new(buffer + *position, code_length); + code = STR_NEW(buffer + *position, code_length); *position += code_length + 1; memcpy(&scope_size, buffer + *position, 4); @@ -715,7 +722,7 @@ static VALUE elements_to_hash(const char* buffer, int max) { while (position < max) { int type = (int)buffer[position++]; int name_length = strlen(buffer + position); - VALUE name = rb_str_new(buffer + position, name_length); + VALUE name = STR_NEW(buffer + position, name_length); VALUE value; position += name_length + 1; value = get_value(buffer, &position, type); diff --git a/ext/cbson/extconf.rb b/ext/cbson/extconf.rb index 68fbea4..e494ce7 100644 --- a/ext/cbson/extconf.rb +++ b/ext/cbson/extconf.rb @@ -2,6 +2,7 @@ require 'mkmf' have_header("ruby/st.h") || have_header("st.h") have_header("ruby/regex.h") || have_header("regex.h") +have_header("ruby/encoding.h") dir_config('cbson') create_makefile('mongo_ext/cbson') diff --git a/test/test_db_api.rb b/test/test_db_api.rb index 37e5932..635608f 100644 --- a/test/test_db_api.rb +++ b/test/test_db_api.rb @@ -809,4 +809,24 @@ class DBAPITest < Test::Unit::TestCase @@db.collection("test").find({}, :snapshot => true, :sort => 'a').to_a end end + + def test_encodings + if RUBY_VERSION >= '1.9' + ascii = "hello world" + utf8 = "hello world".encode("UTF-8") + iso8859 = "hello world".encode("ISO-8859-1") + + assert_equal "US-ASCII", ascii.encoding.name + assert_equal "UTF-8", utf8.encoding.name + assert_equal "ISO-8859-1", iso8859.encoding.name + + @@coll.clear + @@coll.save("ascii" => ascii, "utf8" => utf8, "iso8859" => iso8859) + doc = @@coll.find_one() + + assert_equal "UTF-8", doc["ascii"].encoding.name + assert_equal "UTF-8", doc["utf8"].encoding.name + assert_equal "UTF-8", doc["iso8859"].encoding.name + end + end end