From 902c3120c8ab3f60c37921b04b33a84db7826002 Mon Sep 17 00:00:00 2001
From: Mike Dirolf <mike@10gen.com>
Date: Fri, 4 Sep 2009 11:02:04 -0400
Subject: [PATCH] decode strings as UTF-8 when using C extension on 1.9

---
 ext/cbson/cbson.c    | 19 +++++++++++++------
 ext/cbson/extconf.rb |  1 +
 test/test_db_api.rb  | 20 ++++++++++++++++++++
 3 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/ext/cbson/cbson.c b/ext/cbson/cbson.c
index 20e8bc2..0719bae 100644
--- a/ext/cbson/cbson.c
+++ b/ext/cbson/cbson.c
@@ -50,6 +50,13 @@ static VALUE RegexpOfHolding;
 static VALUE OrderedHash;
 static VALUE InvalidName;
 
+#if HAVE_RUBY_ENCODING_H /* presumably defined by the have_header("ruby/encoding.h") probe in extconf.rb */
+#include "ruby/encoding.h"
+#define STR_NEW(p,n) rb_enc_str_new((p), (n), rb_utf8_encoding()) /* new Ruby string from n bytes at p, tagged UTF-8 */
+#else /* encoding header not available: build a plain string */
+#define STR_NEW(p,n) rb_str_new((p), (n)) /* new Ruby string from n bytes at p */
+#endif /* HAVE_RUBY_ENCODING_H */
+
 // this sucks. but for some reason these moved around between 1.8 and 1.9
 #ifdef ONIGURUMA_H
 #define IGNORECASE ONIG_OPTION_IGNORECASE
@@ -495,7 +502,7 @@ static VALUE get_value(const char* buffer, int* position, int type) {
             int value_length;
             *position += 4;
             value_length = strlen(buffer + *position);
-            value = rb_str_new(buffer+ *position, value_length);
+            value = STR_NEW(buffer + *position, value_length);
             *position += value_length + 1;
             break;
         }
@@ -509,7 +516,7 @@ static VALUE get_value(const char* buffer, int* position, int type) {
                 int collection_length = strlen(buffer + offset);
                 char id_type;
 
-                argv[0] = rb_str_new(buffer + offset, collection_length);
+                argv[0] = STR_NEW(buffer + offset, collection_length);
                 offset += collection_length + 1;
                 id_type = buffer[offset];
                 offset += 5;
@@ -599,7 +606,7 @@ static VALUE get_value(const char* buffer, int* position, int type) {
     case 11:
         {
             int pattern_length = strlen(buffer + *position);
-            VALUE pattern = rb_str_new(buffer + *position, pattern_length);
+            VALUE pattern = STR_NEW(buffer + *position, pattern_length);
             int flags_length, flags = 0, i = 0;
             char extra[10];
             VALUE argv[3];
@@ -635,7 +642,7 @@ static VALUE get_value(const char* buffer, int* position, int type) {
             VALUE collection, str, oid, id, argv[2];
             *position += 4;
             collection_length = strlen(buffer + *position);
-            collection = rb_str_new(buffer + *position, collection_length);
+            collection = STR_NEW(buffer + *position, collection_length);
             *position += collection_length + 1;
 
             str = rb_str_new(buffer + *position, 12);
@@ -662,7 +669,7 @@ static VALUE get_value(const char* buffer, int* position, int type) {
             VALUE code, scope, argv[2];
             *position += 8;
             code_length = strlen(buffer + *position);
-            code = rb_str_new(buffer + *position, code_length);
+            code = STR_NEW(buffer + *position, code_length);
             *position += code_length + 1;
 
             memcpy(&scope_size, buffer + *position, 4);
@@ -715,7 +722,7 @@ static VALUE elements_to_hash(const char* buffer, int max) {
     while (position < max) {
         int type = (int)buffer[position++];
         int name_length = strlen(buffer + position);
-        VALUE name = rb_str_new(buffer + position, name_length);
+        VALUE name = STR_NEW(buffer + position, name_length);
         VALUE value;
         position += name_length + 1;
         value = get_value(buffer, &position, type);
diff --git a/ext/cbson/extconf.rb b/ext/cbson/extconf.rb
index 68fbea4..e494ce7 100644
--- a/ext/cbson/extconf.rb
+++ b/ext/cbson/extconf.rb
@@ -2,6 +2,7 @@ require 'mkmf'
 
 have_header("ruby/st.h") || have_header("st.h")
 have_header("ruby/regex.h") || have_header("regex.h")
+have_header("ruby/encoding.h") # probe for the encoding API header; cbson.c tests HAVE_RUBY_ENCODING_H
 
 dir_config('cbson')
 create_makefile('mongo_ext/cbson')
diff --git a/test/test_db_api.rb b/test/test_db_api.rb
index 37e5932..635608f 100644
--- a/test/test_db_api.rb
+++ b/test/test_db_api.rb
@@ -809,4 +809,24 @@ class DBAPITest < Test::Unit::TestCase
       @@db.collection("test").find({}, :snapshot => true, :sort => 'a').to_a
     end
   end
+
+  # Strings read back from the database must be tagged UTF-8 on Ruby 1.9+.
+  def test_encodings
+    return if RUBY_VERSION < '1.9' # strings carry no encoding before 1.9
+
+    # Encode explicitly: a bare literal's encoding follows the source encoding.
+    ascii = "hello world".encode("US-ASCII")
+    utf8 = "hello world".encode("UTF-8")
+    iso8859 = "hello world".encode("ISO-8859-1")
+    assert_equal "US-ASCII", ascii.encoding.name
+    assert_equal "UTF-8", utf8.encoding.name
+    assert_equal "ISO-8859-1", iso8859.encoding.name
+
+    @@coll.clear
+    @@coll.save("ascii" => ascii, "utf8" => utf8, "iso8859" => iso8859)
+    doc = @@coll.find_one()
+    assert_equal "UTF-8", doc["ascii"].encoding.name
+    assert_equal "UTF-8", doc["utf8"].encoding.name
+    assert_equal "UTF-8", doc["iso8859"].encoding.name
+  end
 end