From 5793ce986bb20724552a8f33c620cd03da654c2d Mon Sep 17 00:00:00 2001 From: Jim Menard Date: Tue, 13 Jan 2009 12:53:55 -0500 Subject: [PATCH] UTF-8 encoding/decoding for Ruby 1.9. --- README.rdoc | 15 +++++++++++++++ lib/mongo/util/bson.rb | 22 ++++++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/README.rdoc b/README.rdoc index b6933f0..db20766 100644 --- a/README.rdoc +++ b/README.rdoc @@ -49,6 +49,21 @@ be running, of course. See also the test code, especially tests/test_db_api.rb. += Notes + +== String Encoding + +The BSON ("Binary JSON") format used to communicate with Mongo requires that +strings be UTF-8 (http://en.wikipedia.org/wiki/UTF-8). + +Ruby 1.9 has built-in character encoding support. All strings sent to Mongo +and received from Mongo are converted to UTF-8 when necessary, and strings +read from Mongo will have their character encodings set to UTF-8. + +When used with Ruby 1.8, the bytes in each string are written to and read from +Mongo as-is. If the string is ASCII all is well, because ASCII is a subset of +UTF-8. If the string is not ASCII then it may not be a well-formed UTF-8 string. + = Testing If you have the source code, you can run the tests. diff --git a/lib/mongo/util/bson.rb b/lib/mongo/util/bson.rb index a31b242..d1e363d 100644 --- a/lib/mongo/util/bson.rb +++ b/lib/mongo/util/bson.rb @@ -46,8 +46,18 @@ class BSON NUMBER_INT = 16 MAXKEY = 127 + if RUBY_VERSION >= '1.9' + def self.to_utf8(str) + str.encode("utf-8") + end + else + def self.to_utf8(str) + str # TODO punt for now + end + end + def self.serialize_cstr(buf, val) - buf.put_array(val.to_s.unpack("C*") + [0]) + buf.put_array(to_utf8(val.to_s).unpack("C*") + [0]) end def initialize(db=nil) @@ -173,6 +183,7 @@ class BSON doc end + # For debugging. def hex_dump str = '' @buf.to_a.each_with_index { |b,i| @@ -231,7 +242,11 @@ class BSON def deserialize_string_data(buf) len = buf.get_int bytes = buf.get(len) - bytes[0..-2].pack("C*") + str = bytes[0..-2].pack("C*") + if RUBY_VERSION >= '1.9' + str.force_encoding("utf-8") + end + str end def deserialize_oid_data(buf) @@ -385,6 +400,9 @@ class BSON break if b == 0 chars << b.chr end + if RUBY_VERSION >= '1.9' + chars.force_encoding("utf-8") # Mongo stores UTF-8 + end chars end