From 16ea148ab1a6d2269dfa887b279cdc50d21db350 Mon Sep 17 00:00:00 2001 From: Kyle Banker Date: Thu, 18 Feb 2010 16:31:25 -0500 Subject: [PATCH] More experimental GridFS improvements --- bin/gridstore_benchmark | 12 +- lib/mongo.rb | 1 + lib/mongo/gridfs/grid.rb | 45 +++---- lib/mongo/gridfs/grid_file_system.rb | 51 ++++++++ lib/mongo/gridfs/grid_io.rb | 96 +++++++------- test/test_grid.rb | 182 +++----------------------- test/test_grid_file_system.rb | 185 +++++++++++++++++++++++++++ test/test_grid_io.rb | 5 +- 8 files changed, 328 insertions(+), 249 deletions(-) create mode 100644 lib/mongo/gridfs/grid_file_system.rb create mode 100644 test/test_grid_file_system.rb diff --git a/bin/gridstore_benchmark b/bin/gridstore_benchmark index 6892c5c..395e821 100755 --- a/bin/gridstore_benchmark +++ b/bin/gridstore_benchmark @@ -1,7 +1,6 @@ #!/usr/bin/env ruby require 'rubygems' require 'mongo' -#require 'ruby-prof' include Mongo include GridFS @@ -16,9 +15,7 @@ mb = length / 1048576.0 t1 = Time.now @grid = Grid.new(db) -@grid.open('mongodb-new.pdf', 'w') do |f| - f.write(sample_data) -end +@id = @grid.put(sample_data, 'mongodb-new.pdf') puts "Write: #{mb / (Time.now - t1)} mb/s" t1 = Time.now @@ -29,14 +26,9 @@ puts "Write: #{mb / (Time.now - t1)} mb/s" t1 = Time.now @grid = Grid.new(db) -data = @grid.open('mongodb-new.pdf', 'r') do |f| - f.read -end +data = @grid.get(@id).read puts "Read new: #{mb / (Time.now - t1)} mb/s" file = db['fs.files'].find_one({:filename => 'mongodb-new.pdf'}) -p file -puts -p db['fs.chunks'].find({:files_id => file['_id']}, {:fields => ['files_id']}).to_a t1 = Time.now old_data = GridStore.open(db, 'mongodb.pdf', 'r') do |f| diff --git a/lib/mongo.rb b/lib/mongo.rb index e71ed29..b7cf4c0 100644 --- a/lib/mongo.rb +++ b/lib/mongo.rb @@ -61,3 +61,4 @@ require 'mongo/exceptions' require 'mongo/gridfs' require 'mongo/gridfs/grid' require 'mongo/gridfs/grid_io' +require 'mongo/gridfs/grid_file_system' diff --git a/lib/mongo/gridfs/grid.rb b/lib/mongo/gridfs/grid.rb index 1436258..850b7b4 100644 --- a/lib/mongo/gridfs/grid.rb +++ b/lib/mongo/gridfs/grid.rb @@ -14,34 +14,35 @@ # limitations under the License. # ++ -# GridFS is a specification for storing large objects in MongoDB. -# See the documentation for GridFS::GridStore -# -# @see GridFS::GridStore -# -# @core gridfs module Mongo - class Grid - DEFAULT_ROOT_COLLECTION = 'fs' - def initialize(db, root_collection=DEFAULT_ROOT_COLLECTION, opts={}) + # WARNING: This class is part of a new, experimental GridFS API. Subject to change. + class Grid + DEFAULT_BUCKET_NAME = 'fs' + + def initialize(db, bucket_name=DEFAULT_BUCKET_NAME) check_params(db) @db = db - @files = @db["#{root_collection}.files"] - @chunks = @db["#{root_collection}.chunks"] + @files = @db["#{bucket_name}.files"] + @chunks = @db["#{bucket_name}.chunks"] + + @chunks.create_index([['files_id', Mongo::ASCENDING], ['n', Mongo::ASCENDING]]) end - def open(filename, mode, opts={}) - file = GridIO.new(@files, @chunks, filename, mode, opts) - result = nil - begin - if block_given? - result = yield file - end - ensure - file.close - end - result + def put(data, filename, opts={}) + file = GridIO.new(@files, @chunks, filename, 'w', false, opts=opts) + file.write(data) + file.close + file.files_id + end + + def get(id) + GridIO.new(@files, @chunks, nil, 'r', false, :_id => id) + end + + def delete(id) + @files.remove({"_id" => id}) + @chunks.remove({"_id" => id}) end private diff --git a/lib/mongo/gridfs/grid_file_system.rb b/lib/mongo/gridfs/grid_file_system.rb new file mode 100644 index 0000000..c38c91f --- /dev/null +++ b/lib/mongo/gridfs/grid_file_system.rb @@ -0,0 +1,51 @@ +# -- +# Copyright (C) 2008-2009 10gen Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ++ + +module Mongo + + # WARNING: This class is part of a new, experimental GridFS API. Subject to change. + class GridFileSystem < Grid + + def initialize(db, bucket_name=DEFAULT_BUCKET_NAME) + super + + @files.create_index([['filename', 1], ['uploadDate', -1]]) + end + + def open(filename, mode, opts={}) + file = GridIO.new(@files, @chunks, filename, mode, true, opts) + return file unless block_given? + result = nil + begin + result = yield file + ensure + file.close + end + result + end + + def put(data, filename) + end + + def get(id) + end + + # Deletes all files matching the given criteria. + def delete(criteria) + end + + end +end diff --git a/lib/mongo/gridfs/grid_io.rb b/lib/mongo/gridfs/grid_io.rb index 9802b35..729b842 100644 --- a/lib/mongo/gridfs/grid_io.rb +++ b/lib/mongo/gridfs/grid_io.rb @@ -15,25 +15,29 @@ # ++ module Mongo + + # WARNING: This is part of a new, experimental GridFS API. Subject to change. class GridIO DEFAULT_CHUNK_SIZE = 256 * 1024 - DEFAULT_CONTENT_TYPE = 'text/plain' + DEFAULT_CONTENT_TYPE = 'binary/octet-stream' - attr_reader :content_type - attr_reader :chunk_size + attr_reader :content_type, :chunk_size, :upload_date, :files_id, :filename, :metadata - # @options opts [Hash] :cond - def initialize(files, chunks, filename, mode, opts={}) + def initialize(files, chunks, filename, mode, filesystem, opts={}) @files = files @chunks = chunks @filename = filename @mode = mode @content_type = opts[:content_type] || DEFAULT_CONTENT_TYPE @chunk_size = opts[:chunk_size] || DEFAULT_CHUNK_SIZE - @files_id = opts[:files_id] || Mongo::ObjectID.new + @files_id = opts[:_id] - init_file(opts) - init_mode(opts) + case @mode + when 'r' then init_read(filesystem, opts) + when 'w' then init_write(opts) + else + raise GridError, "Invalid file mode #{@mode}. Valid options include 'r' and 'w'." + end end # Read the data from the file. If a length if specified, will read from the @@ -57,6 +61,7 @@ module Mongo end buf end + alias :data :read # Write the given string (binary) data to the file. # @@ -79,7 +84,7 @@ module Mongo end chunk_available = @chunk_size - @chunk_position step_size = (to_write > chunk_available) ? chunk_available : to_write - @current_chunk['data'] = Binary.new(@current_chunk['data'].to_s << string[-to_write, step_size]) + @current_chunk['data'] = Binary.new((@current_chunk['data'].to_s << string[-to_write, step_size]).unpack("c*")) @chunk_position += step_size to_write -= step_size save_chunk(@current_chunk) @@ -134,16 +139,16 @@ module Mongo # @return [True] def close if @mode[0] == ?w - if @upload_date - @files.remove('_id' => @files_id) - else - @upload_date = Time.now - end + @upload_date = Time.now.utc @files.insert(to_mongo_object) end true end + def inspect + "_id: #{@files_id}" + end + private def create_chunk(n) @@ -184,49 +189,38 @@ module Mongo end # Initialize based on whether the supplied file exists. - def init_file(opts) - selector = {'filename' => @filename} - selector.merge(opts[:criteria]) if opts[:criteria] - doc = @files.find(selector).next_document - if doc - @files_id = doc['_id'] - @content_type = doc['contentType'] - @chunk_size = doc['chunkSize'] - @upload_date = doc['uploadDate'] - @aliases = doc['aliases'] - @file_length = doc['length'] - @metadata = doc['metadata'] - @md5 = doc['md5'] + def init_read(filesystem, opts) + if filesystem + doc = @files.find({'filename' => @filename}, :sort => [["uploadDate", -1]], :limit => 1).next_document + raise GridError, "Could not open file with filename #{@filename}" unless doc else - @files_id = Mongo::ObjectID.new - @content_type = opts[:content_type] || DEFAULT_CONTENT_TYPE - @chunk_size = opts[:chunk_size] || DEFAULT_CHUNK_SIZE - @length = 0 + doc = @files.find({'_id' => @files_id}).next_document + raise GridError, "Could not open file with id #{@files_id}" unless doc end + + @files_id = doc['_id'] + @content_type = doc['contentType'] + @chunk_size = doc['chunkSize'] + @upload_date = doc['uploadDate'] + @aliases = doc['aliases'] + @file_length = doc['length'] + @metadata = doc['metadata'] + @md5 = doc['md5'] + @filename = doc['filename'] + @current_chunk = get_chunk(0) + @file_position = 0 end # Validates and sets up the class for the given file mode. - def init_mode(opts) - case @mode - when 'r' - @current_chunk = get_chunk(0) - @file_position = 0 - when 'w' - @chunks.remove({'_files_id' => @files_id}) + def init_write(opts) + @files_id = opts[:_id] || Mongo::ObjectID.new + @content_type = opts[:content_type] || @content_type || DEFAULT_CONTENT_TYPE + @chunk_size = opts[:chunk_size] || @chunk_size || DEFAULT_CHUNK_SIZE + @file_length = 0 + @metadata = opts[:metadata] if opts[:metadata] - @metadata = opts[:metadata] if opts[:metadata] - @chunks.create_index([['files_id', Mongo::ASCENDING], ['n', Mongo::ASCENDING]]) - @current_chunk = create_chunk(0) - @file_position = 0 - when 'w+' - @metadata = opts[:metadata] if opts[:metadata] - @chunks.create_index([['files_id', Mongo::ASCENDING], ['n', Mongo::ASCENDING]]) - @current_chunk = get_chunk(last_chunk_number) || create_chunk(0) - @chunk_position = @current_chunk['data'].length - @file_position = @length - else - raise GridError, "Illegal file mode #{mode}. Valid options are 'r', 'w', and 'w+'." - end + @current_chunk = create_chunk(0) + @file_position = 0 end def to_mongo_object diff --git a/test/test_grid.rb b/test/test_grid.rb index 2e424a2..e27d6f2 100644 --- a/test/test_grid.rb +++ b/test/test_grid.rb @@ -5,8 +5,8 @@ class GridTest < Test::Unit::TestCase def setup @db ||= Connection.new(ENV['MONGO_RUBY_DRIVER_HOST'] || 'localhost', ENV['MONGO_RUBY_DRIVER_PORT'] || Connection::DEFAULT_PORT).db('ruby-mongo-test') - @files = @db.collection('fs.files') - @chunks = @db.collection('fs.chunks') + @files = @db.collection('test-bucket.files') + @chunks = @db.collection('test-bucket.chunks') end def teardown @@ -14,178 +14,34 @@ class GridTest < Test::Unit::TestCase @chunks.remove end - context "When reading:" do + context "A basic grid-stored file" do setup do - @data = "CHUNKS" * 50000 - @grid = Grid.new(@db) - @grid.open('sample', 'w') do |f| - f.write @data - end - - @grid = Grid.new(@db) + @data = "GRIDDATA" * 50000 + @grid = Grid.new(@db, 'test-bucket') + @id = @grid.put(@data, 'sample', :metadata => {'app' => 'photos'}) end - should "read sample data" do - data = @grid.open('sample', 'r') { |f| f.read } - assert_equal data.length, @data.length + should "retrieve the stored data" do + data = @grid.get(@id).data + assert_equal @data, data end - should "return an empty string if length is zero" do - data = @grid.open('sample', 'r') { |f| f.read(0) } - assert_equal '', data + should "store the filename" do + file = @grid.get(@id) + assert_equal 'sample', file.filename end - should "return the first n bytes" do - data = @grid.open('sample', 'r') {|f| f.read(288888) } - assert_equal 288888, data.length - assert_equal @data[0...288888], data + should "store any relevant metadata" do + file = @grid.get(@id) + assert_equal 'photos', file.metadata['app'] end - should "return the first n bytes even with an offset" do - data = @grid.open('sample', 'r') do |f| - f.seek(1000) - f.read(288888) - end - assert_equal 288888, data.length - assert_equal @data[1000...289888], data - end - end - - context "When writing:" do - setup do - @data = "BYTES" * 50000 - @grid = Grid.new(@db) - @grid.open('sample', 'w') do |f| - f.write @data - end - end - - should "read sample data" do - data = @grid.open('sample', 'r') { |f| f.read } - assert_equal data.length, @data.length - end - - should "return the total number of bytes written" do - data = 'a' * 300000 - assert_equal 300000, @grid.open('write', 'w') {|f| f.write(data) } - end - - should "more read sample data" do - data = @grid.open('sample', 'r') { |f| f.read } - assert_equal data.length, @data.length - end - - should "raise exception if not opened for write" do - assert_raise GridError do - @grid.open('io', 'r') { |f| f.write('hello') } + should "delete the file and any chunks" do + @grid.delete(@id) + assert_raise GridError do + @grid.get(@id) end end end - context "When appending:" do - setup do - @data = "1" - @grid = Grid.new(@db) - @grid.open('sample', 'w', :chunk_size => 1000) do |f| - f.write @data - end - end - - should "add data to the file" do - new_data = "2" - @grid.open('sample', 'w+') do |f| - f.write(new_data) - end - - all_data = @grid.open('sample', 'r') {|f| f.read } - assert_equal @data + new_data, all_data - end - - should "add multi-chunk-data" do - new_data = "2" * 5000 - - @grid.open('sample', 'w+') do |f| - f.write(new_data) - end - - all_data = @grid.open('sample', 'r') {|f| f.read } - assert_equal @data + new_data, all_data - end - end - - context "When writing chunks:" do - setup do - data = "B" * 50000 - @grid = Grid.new(@db) - @grid.open('sample', 'w', :chunk_size => 1000) do |f| - f.write data - end - end - - should "write the correct number of chunks" do - file = @files.find_one({:filename => 'sample'}) - chunks = @chunks.find({'files_id' => file['_id']}).to_a - assert_equal 50, chunks.length - end - end - - context "Positioning:" do - setup do - data = 'hello, world' + '1' * 5000 + 'goodbye!' + '2' * 1000 + '!' - @grid = Grid.new(@db) - @grid.open('hello', 'w', :chunk_size => 1000) do |f| - f.write data - end - end - - should "seek within chunks" do - @grid.open('hello', 'r') do |f| - f.seek(0) - assert_equal 'h', f.read(1) - f.seek(7) - assert_equal 'w', f.read(1) - f.seek(4) - assert_equal 'o', f.read(1) - f.seek(0) - f.seek(7, IO::SEEK_CUR) - assert_equal 'w', f.read(1) - f.seek(-1, IO::SEEK_CUR) - assert_equal ' ', f.read(1) - f.seek(-4, IO::SEEK_CUR) - assert_equal 'l', f.read(1) - f.seek(3, IO::SEEK_CUR) - assert_equal ',', f.read(1) - end - end - - should "seek between chunks" do - @grid.open('hello', 'r') do |f| - f.seek(1000) - assert_equal '11111', f.read(5) - - f.seek(5009) - assert_equal '111goodbye!222', f.read(14) - - f.seek(-1, IO::SEEK_END) - assert_equal '!', f.read(1) - f.seek(-6, IO::SEEK_END) - assert_equal '2', f.read(1) - end - end - - should "tell the current position" do - @grid.open('hello', 'r') do |f| - assert_equal 0, f.tell - - f.seek(999) - assert_equal 999, f.tell - end - end - - should "seek only in read mode" do - assert_raise GridError do - @grid.open('hello', 'w+') {|f| f.seek(0) } - end - end - end end diff --git a/test/test_grid_file_system.rb b/test/test_grid_file_system.rb new file mode 100644 index 0000000..e042cfd --- /dev/null +++ b/test/test_grid_file_system.rb @@ -0,0 +1,185 @@ +require 'test/test_helper' + +class GridTest < Test::Unit::TestCase + + def setup + @db ||= Connection.new(ENV['MONGO_RUBY_DRIVER_HOST'] || 'localhost', + ENV['MONGO_RUBY_DRIVER_PORT'] || Connection::DEFAULT_PORT).db('ruby-mongo-test') + @files = @db.collection('fs.files') + @chunks = @db.collection('fs.chunks') + end + + def teardown + @files.remove + @chunks.remove + end + + context "When reading:" do + setup do + @data = "CHUNKS" * 50000 + @grid = GridFileSystem.new(@db) + @grid.open('sample', 'w') do |f| + f.write @data + end + + @grid = GridFileSystem.new(@db) + end + + should "read sample data" do + data = @grid.open('sample', 'r') { |f| f.read } + assert_equal data.length, @data.length + end + + should "return an empty string if length is zero" do + data = @grid.open('sample', 'r') { |f| f.read(0) } + assert_equal '', data + end + + should "return the first n bytes" do + data = @grid.open('sample', 'r') {|f| f.read(288888) } + assert_equal 288888, data.length + assert_equal @data[0...288888], data + end + + should "return the first n bytes even with an offset" do + data = @grid.open('sample', 'r') do |f| + f.seek(1000) + f.read(288888) + end + assert_equal 288888, data.length + assert_equal @data[1000...289888], data + end + end + + context "When writing:" do + setup do + @data = "BYTES" * 50000 + @grid = GridFileSystem.new(@db) + @grid.open('sample', 'w') do |f| + f.write @data + end + end + + should "read sample data" do + data = @grid.open('sample', 'r') { |f| f.read } + assert_equal data.length, @data.length + end + + should "return the total number of bytes written" do + data = 'a' * 300000 + assert_equal 300000, @grid.open('write', 'w') {|f| f.write(data) } + end + + should "more read sample data" do + data = @grid.open('sample', 'r') { |f| f.read } + assert_equal data.length, @data.length + end + + should "raise exception if not opened for write" do + assert_raise GridError do + @grid.open('io', 'r') { |f| f.write('hello') } + end + end + + context "and when overwriting the file" do + setup do + @old = @grid.open('sample', 'r') + + @new_data = "DATA" * 1000 + @grid.open('sample', 'w') do |f| + f.write @new_data + end + + @new = @grid.open('sample', 'r') + end + + should "have a newer upload date" do + assert @new.upload_date > @old.upload_date + end + + should "have a different files_id" do + assert_not_equal @new.files_id, @old.files_id + end + + should "contain the new data" do + assert_equal @new_data, @new.read + end + end + end + + context "When writing chunks:" do + setup do + data = "B" * 50000 + @grid = GridFileSystem.new(@db) + @grid.open('sample', 'w', :chunk_size => 1000) do |f| + f.write data + end + end + + should "write the correct number of chunks" do + file = @files.find_one({:filename => 'sample'}) + chunks = @chunks.find({'files_id' => file['_id']}).to_a + assert_equal 50, chunks.length + end + end + + context "Positioning:" do + setup do + data = 'hello, world' + '1' * 5000 + 'goodbye!' + '2' * 1000 + '!' + @grid = GridFileSystem.new(@db) + @grid.open('hello', 'w', :chunk_size => 1000) do |f| + f.write data + end + end + + should "seek within chunks" do + @grid.open('hello', 'r') do |f| + f.seek(0) + assert_equal 'h', f.read(1) + f.seek(7) + assert_equal 'w', f.read(1) + f.seek(4) + assert_equal 'o', f.read(1) + f.seek(0) + f.seek(7, IO::SEEK_CUR) + assert_equal 'w', f.read(1) + f.seek(-1, IO::SEEK_CUR) + assert_equal ' ', f.read(1) + f.seek(-4, IO::SEEK_CUR) + assert_equal 'l', f.read(1) + f.seek(3, IO::SEEK_CUR) + assert_equal ',', f.read(1) + end + end + + should "seek between chunks" do + @grid.open('hello', 'r') do |f| + f.seek(1000) + assert_equal '11111', f.read(5) + + f.seek(5009) + assert_equal '111goodbye!222', f.read(14) + + f.seek(-1, IO::SEEK_END) + assert_equal '!', f.read(1) + f.seek(-6, IO::SEEK_END) + assert_equal '2', f.read(1) + end + end + + should "tell the current position" do + @grid.open('hello', 'r') do |f| + assert_equal 0, f.tell + + f.seek(999) + assert_equal 999, f.tell + end + end + + should "seek only in read mode" do + assert_raise GridError do + @grid.open('hello', 'w') {|f| f.seek(0) } + end + end + end +end diff --git a/test/test_grid_io.rb b/test/test_grid_io.rb index bb11f17..47f6dec 100644 --- a/test/test_grid_io.rb +++ b/test/test_grid_io.rb @@ -22,15 +22,14 @@ class GridIOTest < Test::Unit::TestCase end should "set default 256k chunk size" do - file = GridIO.new(@files, @chunks, @filename, @mode) + file = GridIO.new(@files, @chunks, @filename, @mode, false) assert_equal 256 * 1024, file.chunk_size end should "set chunk size" do - file = GridIO.new(@files, @chunks, @filename, @mode, :chunk_size => 1000) + file = GridIO.new(@files, @chunks, @filename, @mode, false, :chunk_size => 1000) assert_equal 1000, file.chunk_size end end - end