From 5793ce986bb20724552a8f33c620cd03da654c2d Mon Sep 17 00:00:00 2001
From: Jim Menard <jim@10gen.com>
Date: Tue, 13 Jan 2009 12:53:55 -0500
Subject: [PATCH] UTF-8 encoding/decoding for Ruby 1.9.

---
 README.rdoc            | 15 +++++++++++++++
 lib/mongo/util/bson.rb | 22 ++++++++++++++++++++--
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/README.rdoc b/README.rdoc
index b6933f0..db20766 100644
--- a/README.rdoc
+++ b/README.rdoc
@@ -49,6 +49,21 @@ be running, of course.
 See also the test code, especially tests/test_db_api.rb.
 
 
+= Notes
+
+== String Encoding
+
+The BSON ("Binary JSON") format used to communicate with Mongo requires that
+strings be UTF-8 (http://en.wikipedia.org/wiki/UTF-8).
+
+Ruby 1.9 has built-in character encoding support. When running under Ruby 1.9,
+strings sent to Mongo are converted to UTF-8 when necessary, and strings read
+from Mongo will have their character encodings set to UTF-8.
+
+When used with Ruby 1.8, the bytes in each string are written to and read from
+Mongo as-is. If the string is ASCII all is well, because ASCII is a subset of
+UTF-8. If the string is not ASCII then it may not be a well-formed UTF-8 string.
+
 = Testing
 
 If you have the source code, you can run the tests.
diff --git a/lib/mongo/util/bson.rb b/lib/mongo/util/bson.rb
index a31b242..d1e363d 100644
--- a/lib/mongo/util/bson.rb
+++ b/lib/mongo/util/bson.rb
@@ -46,8 +46,18 @@ class BSON
   NUMBER_INT = 16
   MAXKEY = 127
 
+  if RUBY_VERSION >= '1.9'
+    def self.to_utf8(str)
+      str.encode("utf-8")       # transcode to UTF-8 using Ruby 1.9 encoding support
+    end
+  else
+    def self.to_utf8(str)
+      str                       # TODO: no transcoding on Ruby 1.8; returned as-is
+    end
+  end
+
   def self.serialize_cstr(buf, val)
-    buf.put_array(val.to_s.unpack("C*") + [0])
+    buf.put_array(to_utf8(val.to_s).unpack("C*") + [0]) # string bytes, then a 0 terminator
   end
 
   def initialize(db=nil)
@@ -173,6 +183,7 @@ class BSON
     doc
   end
 
+  # For debugging.
   def hex_dump
     str = ''
     @buf.to_a.each_with_index { |b,i|
@@ -231,7 +242,11 @@ class BSON
   def deserialize_string_data(buf)
     len = buf.get_int
     bytes = buf.get(len)
-    bytes[0..-2].pack("C*")
+    str = bytes[0..-2].pack("C*")  # all but the last byte, packed into a String
+    if RUBY_VERSION >= '1.9'
+      str.force_encoding("utf-8")  # relabel as UTF-8; bytes are not transcoded
+    end
+    str
   end
 
   def deserialize_oid_data(buf)
@@ -385,6 +400,9 @@ class BSON
       break if b == 0
       chars << b.chr
     end
+    if RUBY_VERSION >= '1.9'
+      chars.force_encoding("utf-8") # Mongo stores UTF-8
+    end
     chars
   end