Decode strings as UTF-8 when using the C extension on Ruby 1.9

2009-09-04 11:02:04 -04:00 · 2009-09-04 11:02:04 -04:00 · 902c3120c8
parent 3d304b1a02
commit 902c3120c8
3 changed files with 34 additions and 6 deletions
--- a/ext/cbson/cbson.c
+++ b/ext/cbson/cbson.c
@ -50,6 +50,13 @@ static VALUE RegexpOfHolding;
 static VALUE OrderedHash;
 static VALUE InvalidName;

+#if HAVE_RUBY_ENCODING_H
+#include "ruby/encoding.h"
+#define STR_NEW(p,n) rb_enc_str_new((p), (n), rb_utf8_encoding())
+#else
+#define STR_NEW(p,n) rb_str_new((p), (n))
+#endif
+
 // this sucks. but for some reason these moved around between 1.8 and 1.9
 #ifdef ONIGURUMA_H
 #define IGNORECASE ONIG_OPTION_IGNORECASE
@ -495,7 +502,7 @@ static VALUE get_value(const char* buffer, int* position, int type) {
            int value_length;
            *position += 4;
            value_length = strlen(buffer + *position);
-            value = rb_str_new(buffer+ *position, value_length);
+            value = STR_NEW(buffer + *position, value_length);
            *position += value_length + 1;
            break;
        }
@ -509,7 +516,7 @@ static VALUE get_value(const char* buffer, int* position, int type) {
                int collection_length = strlen(buffer + offset);
                char id_type;

-                argv[0] = rb_str_new(buffer + offset, collection_length);
+                argv[0] = STR_NEW(buffer + offset, collection_length);
                offset += collection_length + 1;
                id_type = buffer[offset];
                offset += 5;
@ -599,7 +606,7 @@ static VALUE get_value(const char* buffer, int* position, int type) {
    case 11:
        {
            int pattern_length = strlen(buffer + *position);
-            VALUE pattern = rb_str_new(buffer + *position, pattern_length);
+            VALUE pattern = STR_NEW(buffer + *position, pattern_length);
            int flags_length, flags = 0, i = 0;
            char extra[10];
            VALUE argv[3];
@ -635,7 +642,7 @@ static VALUE get_value(const char* buffer, int* position, int type) {
            VALUE collection, str, oid, id, argv[2];
            *position += 4;
            collection_length = strlen(buffer + *position);
-            collection = rb_str_new(buffer + *position, collection_length);
+            collection = STR_NEW(buffer + *position, collection_length);
            *position += collection_length + 1;

            str = rb_str_new(buffer + *position, 12);
@ -662,7 +669,7 @@ static VALUE get_value(const char* buffer, int* position, int type) {
            VALUE code, scope, argv[2];
            *position += 8;
            code_length = strlen(buffer + *position);
-            code = rb_str_new(buffer + *position, code_length);
+            code = STR_NEW(buffer + *position, code_length);
            *position += code_length + 1;

            memcpy(&scope_size, buffer + *position, 4);
@ -715,7 +722,7 @@ static VALUE elements_to_hash(const char* buffer, int max) {
    while (position < max) {
        int type = (int)buffer[position++];
        int name_length = strlen(buffer + position);
-        VALUE name = rb_str_new(buffer + position, name_length);
+        VALUE name = STR_NEW(buffer + position, name_length);
        VALUE value;
        position += name_length + 1;
        value = get_value(buffer, &position, type);
--- a/ext/cbson/extconf.rb
+++ b/ext/cbson/extconf.rb
@ -2,6 +2,7 @@ require 'mkmf'

 have_header("ruby/st.h") || have_header("st.h")
 have_header("ruby/regex.h") || have_header("regex.h")
+have_header("ruby/encoding.h")

 dir_config('cbson')
 create_makefile('mongo_ext/cbson')
--- a/test/test_db_api.rb
+++ b/test/test_db_api.rb
@ -809,4 +809,24 @@ class DBAPITest < Test::Unit::TestCase
      @@db.collection("test").find({}, :snapshot => true, :sort => 'a').to_a
    end
  end
+
+  def test_encodings
+    if RUBY_VERSION >= '1.9'
+      ascii = "hello world"
+      utf8 = "hello world".encode("UTF-8")
+      iso8859 = "hello world".encode("ISO-8859-1")
+
+      assert_equal "US-ASCII", ascii.encoding.name
+      assert_equal "UTF-8", utf8.encoding.name
+      assert_equal "ISO-8859-1", iso8859.encoding.name
+
+      @@coll.clear
+      @@coll.save("ascii" => ascii, "utf8" => utf8, "iso8859" => iso8859)
+      doc = @@coll.find_one()
+
+      assert_equal "UTF-8", doc["ascii"].encoding.name
+      assert_equal "UTF-8", doc["utf8"].encoding.name
+      assert_equal "UTF-8", doc["iso8859"].encoding.name
+    end
+  end
 end