More experimental GridFS improvements

This commit is contained in:
Kyle Banker 2010-02-18 16:31:25 -05:00
parent 5d1d110280
commit 16ea148ab1
8 changed files with 328 additions and 249 deletions

View File

@ -1,7 +1,6 @@
#!/usr/bin/env ruby #!/usr/bin/env ruby
require 'rubygems' require 'rubygems'
require 'mongo' require 'mongo'
#require 'ruby-prof'
include Mongo include Mongo
include GridFS include GridFS
@ -16,9 +15,7 @@ mb = length / 1048576.0
t1 = Time.now t1 = Time.now
@grid = Grid.new(db) @grid = Grid.new(db)
@grid.open('mongodb-new.pdf', 'w') do |f| @id = @grid.put(sample_data, 'mongodb-new.pdf')
f.write(sample_data)
end
puts "Write: #{mb / (Time.now - t1)} mb/s" puts "Write: #{mb / (Time.now - t1)} mb/s"
t1 = Time.now t1 = Time.now
@ -29,14 +26,9 @@ puts "Write: #{mb / (Time.now - t1)} mb/s"
t1 = Time.now t1 = Time.now
@grid = Grid.new(db) @grid = Grid.new(db)
data = @grid.open('mongodb-new.pdf', 'r') do |f| data = @grid.get(@id).read
f.read
end
puts "Read new: #{mb / (Time.now - t1)} mb/s" puts "Read new: #{mb / (Time.now - t1)} mb/s"
file = db['fs.files'].find_one({:filename => 'mongodb-new.pdf'}) file = db['fs.files'].find_one({:filename => 'mongodb-new.pdf'})
p file
puts
p db['fs.chunks'].find({:files_id => file['_id']}, {:fields => ['files_id']}).to_a
t1 = Time.now t1 = Time.now
old_data = GridStore.open(db, 'mongodb.pdf', 'r') do |f| old_data = GridStore.open(db, 'mongodb.pdf', 'r') do |f|

View File

@ -61,3 +61,4 @@ require 'mongo/exceptions'
require 'mongo/gridfs' require 'mongo/gridfs'
require 'mongo/gridfs/grid' require 'mongo/gridfs/grid'
require 'mongo/gridfs/grid_io' require 'mongo/gridfs/grid_io'
require 'mongo/gridfs/grid_file_system'

View File

@ -14,34 +14,35 @@
# limitations under the License. # limitations under the License.
# ++ # ++
# GridFS is a specification for storing large objects in MongoDB.
# See the documentation for GridFS::GridStore
#
# @see GridFS::GridStore
#
# @core gridfs
module Mongo module Mongo
class Grid
DEFAULT_ROOT_COLLECTION = 'fs'
def initialize(db, root_collection=DEFAULT_ROOT_COLLECTION, opts={}) # WARNING: This class is part of a new, experimental GridFS API. Subject to change.
class Grid
DEFAULT_BUCKET_NAME = 'fs'
def initialize(db, bucket_name=DEFAULT_BUCKET_NAME)
check_params(db) check_params(db)
@db = db @db = db
@files = @db["#{root_collection}.files"] @files = @db["#{bucket_name}.files"]
@chunks = @db["#{root_collection}.chunks"] @chunks = @db["#{bucket_name}.chunks"]
@chunks.create_index([['files_id', Mongo::ASCENDING], ['n', Mongo::ASCENDING]])
end end
def open(filename, mode, opts={}) def put(data, filename, opts={})
file = GridIO.new(@files, @chunks, filename, mode, opts) file = GridIO.new(@files, @chunks, filename, 'w', false, opts=opts)
result = nil file.write(data)
begin file.close
if block_given? file.files_id
result = yield file end
end
ensure def get(id)
file.close GridIO.new(@files, @chunks, nil, 'r', false, :_id => id)
end end
result
def delete(id)
@files.remove({"_id" => id})
@chunks.remove({"_id" => id})
end end
private private

View File

@ -0,0 +1,51 @@
# --
# Copyright (C) 2008-2009 10gen Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ++
module Mongo

  # WARNING: This class is part of a new, experimental GridFS API. Subject to change.
  #
  # A filename-oriented view over a GridFS bucket. Files are opened by name
  # through #open; the id-based #put / #get / #delete inherited from Grid are
  # not yet supported here and raise NotImplementedError rather than silently
  # doing nothing.
  class GridFileSystem < Grid

    # Sets up the bucket through Grid#initialize, then makes sure the files
    # collection is indexed on filename (ascending) and uploadDate (descending).
    #
    # @param db the database handle, forwarded unchanged to Grid#initialize
    # @param bucket_name [String] prefix of the "<bucket>.files" and
    #   "<bucket>.chunks" collections
    def initialize(db, bucket_name=DEFAULT_BUCKET_NAME)
      super
      @files.create_index([['filename', 1], ['uploadDate', -1]])
    end

    # Opens the named file and returns a GridIO for it.
    #
    # Without a block the GridIO is returned and the caller is responsible for
    # calling #close on it. With a block, the GridIO is yielded, closed when the
    # block finishes (even if it raises), and the block's value is returned.
    #
    # @param filename [String] name of the file within the bucket
    # @param mode [String] file mode, passed straight through to GridIO
    # @param opts [Hash] extra options, passed straight through to GridIO
    # @return [GridIO, Object] the GridIO (no block) or the block's result
    def open(filename, mode, opts={})
      # The literal +true+ is GridIO's "filesystem" flag.
      file = GridIO.new(@files, @chunks, filename, mode, true, opts)
      return file unless block_given?

      begin
        yield file
      ensure
        file.close
      end
    end

    # Not implemented for the filesystem API yet.
    #
    # Accepts the same arguments as Grid#put so that callers written against
    # the parent class fail with a clear message instead of an ArgumentError
    # or a silently discarded write.
    #
    # @raise [NotImplementedError] always
    def put(data, filename, opts={})
      raise NotImplementedError,
        "GridFileSystem#put is not implemented; write with #open(filename, 'w') instead."
    end

    # Not implemented for the filesystem API yet.
    #
    # @raise [NotImplementedError] always
    def get(id)
      raise NotImplementedError,
        "GridFileSystem#get is not implemented; read with #open(filename, 'r') instead."
    end

    # Intended to delete all files matching the given criteria.
    # Not implemented yet.
    #
    # @raise [NotImplementedError] always
    def delete(criteria)
      raise NotImplementedError, "GridFileSystem#delete is not implemented."
    end
  end
end

View File

@ -15,25 +15,29 @@
# ++ # ++
module Mongo module Mongo
# WARNING: This is part of a new, experimental GridFS API. Subject to change.
class GridIO class GridIO
DEFAULT_CHUNK_SIZE = 256 * 1024 DEFAULT_CHUNK_SIZE = 256 * 1024
DEFAULT_CONTENT_TYPE = 'text/plain' DEFAULT_CONTENT_TYPE = 'binary/octet-stream'
attr_reader :content_type attr_reader :content_type, :chunk_size, :upload_date, :files_id, :filename, :metadata
attr_reader :chunk_size
# @options opts [Hash] :cond def initialize(files, chunks, filename, mode, filesystem, opts={})
def initialize(files, chunks, filename, mode, opts={})
@files = files @files = files
@chunks = chunks @chunks = chunks
@filename = filename @filename = filename
@mode = mode @mode = mode
@content_type = opts[:content_type] || DEFAULT_CONTENT_TYPE @content_type = opts[:content_type] || DEFAULT_CONTENT_TYPE
@chunk_size = opts[:chunk_size] || DEFAULT_CHUNK_SIZE @chunk_size = opts[:chunk_size] || DEFAULT_CHUNK_SIZE
@files_id = opts[:files_id] || Mongo::ObjectID.new @files_id = opts[:_id]
init_file(opts) case @mode
init_mode(opts) when 'r' then init_read(filesystem, opts)
when 'w' then init_write(opts)
else
raise GridError, "Invalid file mode #{@mode}. Valid options include 'r' and 'w'."
end
end end
# Read the data from the file. If a length is specified, will read from the # Read the data from the file. If a length is specified, will read from the
@ -57,6 +61,7 @@ module Mongo
end end
buf buf
end end
alias :data :read
# Write the given string (binary) data to the file. # Write the given string (binary) data to the file.
# #
@ -79,7 +84,7 @@ module Mongo
end end
chunk_available = @chunk_size - @chunk_position chunk_available = @chunk_size - @chunk_position
step_size = (to_write > chunk_available) ? chunk_available : to_write step_size = (to_write > chunk_available) ? chunk_available : to_write
@current_chunk['data'] = Binary.new(@current_chunk['data'].to_s << string[-to_write, step_size]) @current_chunk['data'] = Binary.new((@current_chunk['data'].to_s << string[-to_write, step_size]).unpack("c*"))
@chunk_position += step_size @chunk_position += step_size
to_write -= step_size to_write -= step_size
save_chunk(@current_chunk) save_chunk(@current_chunk)
@ -134,16 +139,16 @@ module Mongo
# @return [True] # @return [True]
def close def close
if @mode[0] == ?w if @mode[0] == ?w
if @upload_date @upload_date = Time.now.utc
@files.remove('_id' => @files_id)
else
@upload_date = Time.now
end
@files.insert(to_mongo_object) @files.insert(to_mongo_object)
end end
true true
end end
# Compact description of this object: the literal prefix "_id: " followed by
# the string form of @files_id.
#
# @return [String]
def inspect
  '_id: ' + @files_id.to_s
end
private private
def create_chunk(n) def create_chunk(n)
@ -184,49 +189,38 @@ module Mongo
end end
# Initialize based on whether the supplied file exists. # Initialize based on whether the supplied file exists.
def init_file(opts) def init_read(filesystem, opts)
selector = {'filename' => @filename} if filesystem
selector.merge(opts[:criteria]) if opts[:criteria] doc = @files.find({'filename' => @filename}, :sort => [["uploadDate", -1]], :limit => 1).next_document
doc = @files.find(selector).next_document raise GridError, "Could not open file with filename #{@filename}" unless doc
if doc
@files_id = doc['_id']
@content_type = doc['contentType']
@chunk_size = doc['chunkSize']
@upload_date = doc['uploadDate']
@aliases = doc['aliases']
@file_length = doc['length']
@metadata = doc['metadata']
@md5 = doc['md5']
else else
@files_id = Mongo::ObjectID.new doc = @files.find({'_id' => @files_id}).next_document
@content_type = opts[:content_type] || DEFAULT_CONTENT_TYPE raise GridError, "Could not open file with id #{@files_id}" unless doc
@chunk_size = opts[:chunk_size] || DEFAULT_CHUNK_SIZE
@length = 0
end end
@files_id = doc['_id']
@content_type = doc['contentType']
@chunk_size = doc['chunkSize']
@upload_date = doc['uploadDate']
@aliases = doc['aliases']
@file_length = doc['length']
@metadata = doc['metadata']
@md5 = doc['md5']
@filename = doc['filename']
@current_chunk = get_chunk(0)
@file_position = 0
end end
# Validates and sets up the class for the given file mode. # Validates and sets up the class for the given file mode.
def init_mode(opts) def init_write(opts)
case @mode @files_id = opts[:_id] || Mongo::ObjectID.new
when 'r' @content_type = opts[:content_type] || @content_type || DEFAULT_CONTENT_TYPE
@current_chunk = get_chunk(0) @chunk_size = opts[:chunk_size] || @chunk_size || DEFAULT_CHUNK_SIZE
@file_position = 0 @file_length = 0
when 'w' @metadata = opts[:metadata] if opts[:metadata]
@chunks.remove({'_files_id' => @files_id})
@metadata = opts[:metadata] if opts[:metadata] @current_chunk = create_chunk(0)
@chunks.create_index([['files_id', Mongo::ASCENDING], ['n', Mongo::ASCENDING]]) @file_position = 0
@current_chunk = create_chunk(0)
@file_position = 0
when 'w+'
@metadata = opts[:metadata] if opts[:metadata]
@chunks.create_index([['files_id', Mongo::ASCENDING], ['n', Mongo::ASCENDING]])
@current_chunk = get_chunk(last_chunk_number) || create_chunk(0)
@chunk_position = @current_chunk['data'].length
@file_position = @length
else
raise GridError, "Illegal file mode #{mode}. Valid options are 'r', 'w', and 'w+'."
end
end end
def to_mongo_object def to_mongo_object

View File

@ -5,8 +5,8 @@ class GridTest < Test::Unit::TestCase
def setup def setup
@db ||= Connection.new(ENV['MONGO_RUBY_DRIVER_HOST'] || 'localhost', @db ||= Connection.new(ENV['MONGO_RUBY_DRIVER_HOST'] || 'localhost',
ENV['MONGO_RUBY_DRIVER_PORT'] || Connection::DEFAULT_PORT).db('ruby-mongo-test') ENV['MONGO_RUBY_DRIVER_PORT'] || Connection::DEFAULT_PORT).db('ruby-mongo-test')
@files = @db.collection('fs.files') @files = @db.collection('test-bucket.files')
@chunks = @db.collection('fs.chunks') @chunks = @db.collection('test-bucket.chunks')
end end
def teardown def teardown
@ -14,178 +14,34 @@ class GridTest < Test::Unit::TestCase
@chunks.remove @chunks.remove
end end
context "When reading:" do context "A basic grid-stored file" do
setup do setup do
@data = "CHUNKS" * 50000 @data = "GRIDDATA" * 50000
@grid = Grid.new(@db) @grid = Grid.new(@db, 'test-bucket')
@grid.open('sample', 'w') do |f| @id = @grid.put(@data, 'sample', :metadata => {'app' => 'photos'})
f.write @data
end
@grid = Grid.new(@db)
end end
should "read sample data" do should "retrieve the stored data" do
data = @grid.open('sample', 'r') { |f| f.read } data = @grid.get(@id).data
assert_equal data.length, @data.length assert_equal @data, data
end end
should "return an empty string if length is zero" do should "store the filename" do
data = @grid.open('sample', 'r') { |f| f.read(0) } file = @grid.get(@id)
assert_equal '', data assert_equal 'sample', file.filename
end end
should "return the first n bytes" do should "store any relevant metadata" do
data = @grid.open('sample', 'r') {|f| f.read(288888) } file = @grid.get(@id)
assert_equal 288888, data.length assert_equal 'photos', file.metadata['app']
assert_equal @data[0...288888], data
end end
should "return the first n bytes even with an offset" do should "delete the file and any chunks" do
data = @grid.open('sample', 'r') do |f| @grid.delete(@id)
f.seek(1000)
f.read(288888)
end
assert_equal 288888, data.length
assert_equal @data[1000...289888], data
end
end
context "When writing:" do
setup do
@data = "BYTES" * 50000
@grid = Grid.new(@db)
@grid.open('sample', 'w') do |f|
f.write @data
end
end
should "read sample data" do
data = @grid.open('sample', 'r') { |f| f.read }
assert_equal data.length, @data.length
end
should "return the total number of bytes written" do
data = 'a' * 300000
assert_equal 300000, @grid.open('write', 'w') {|f| f.write(data) }
end
should "more read sample data" do
data = @grid.open('sample', 'r') { |f| f.read }
assert_equal data.length, @data.length
end
should "raise exception if not opened for write" do
assert_raise GridError do assert_raise GridError do
@grid.open('io', 'r') { |f| f.write('hello') } @grid.get(@id)
end end
end end
end end
context "When appending:" do
setup do
@data = "1"
@grid = Grid.new(@db)
@grid.open('sample', 'w', :chunk_size => 1000) do |f|
f.write @data
end
end
should "add data to the file" do
new_data = "2"
@grid.open('sample', 'w+') do |f|
f.write(new_data)
end
all_data = @grid.open('sample', 'r') {|f| f.read }
assert_equal @data + new_data, all_data
end
should "add multi-chunk-data" do
new_data = "2" * 5000
@grid.open('sample', 'w+') do |f|
f.write(new_data)
end
all_data = @grid.open('sample', 'r') {|f| f.read }
assert_equal @data + new_data, all_data
end
end
context "When writing chunks:" do
setup do
data = "B" * 50000
@grid = Grid.new(@db)
@grid.open('sample', 'w', :chunk_size => 1000) do |f|
f.write data
end
end
should "write the correct number of chunks" do
file = @files.find_one({:filename => 'sample'})
chunks = @chunks.find({'files_id' => file['_id']}).to_a
assert_equal 50, chunks.length
end
end
context "Positioning:" do
setup do
data = 'hello, world' + '1' * 5000 + 'goodbye!' + '2' * 1000 + '!'
@grid = Grid.new(@db)
@grid.open('hello', 'w', :chunk_size => 1000) do |f|
f.write data
end
end
should "seek within chunks" do
@grid.open('hello', 'r') do |f|
f.seek(0)
assert_equal 'h', f.read(1)
f.seek(7)
assert_equal 'w', f.read(1)
f.seek(4)
assert_equal 'o', f.read(1)
f.seek(0)
f.seek(7, IO::SEEK_CUR)
assert_equal 'w', f.read(1)
f.seek(-1, IO::SEEK_CUR)
assert_equal ' ', f.read(1)
f.seek(-4, IO::SEEK_CUR)
assert_equal 'l', f.read(1)
f.seek(3, IO::SEEK_CUR)
assert_equal ',', f.read(1)
end
end
should "seek between chunks" do
@grid.open('hello', 'r') do |f|
f.seek(1000)
assert_equal '11111', f.read(5)
f.seek(5009)
assert_equal '111goodbye!222', f.read(14)
f.seek(-1, IO::SEEK_END)
assert_equal '!', f.read(1)
f.seek(-6, IO::SEEK_END)
assert_equal '2', f.read(1)
end
end
should "tell the current position" do
@grid.open('hello', 'r') do |f|
assert_equal 0, f.tell
f.seek(999)
assert_equal 999, f.tell
end
end
should "seek only in read mode" do
assert_raise GridError do
@grid.open('hello', 'w+') {|f| f.seek(0) }
end
end
end
end end

View File

@ -0,0 +1,185 @@
require 'test/test_helper'
# Tests for the filename-based GridFileSystem API (open / read / write / seek).
#
# Deliberately named GridFileSystemTest rather than GridTest: the Grid test
# file already defines GridTest, and reusing that name would reopen the same
# class so that the two files' setup/teardown methods overwrite each other
# when the whole suite is loaded into one process.
class GridFileSystemTest < Test::Unit::TestCase

  def setup
    @db ||= Connection.new(ENV['MONGO_RUBY_DRIVER_HOST'] || 'localhost',
      ENV['MONGO_RUBY_DRIVER_PORT'] || Connection::DEFAULT_PORT).db('ruby-mongo-test')
    # GridFileSystem.new(@db) below uses the default bucket, hence 'fs.*'.
    @files  = @db.collection('fs.files')
    @chunks = @db.collection('fs.chunks')
  end

  def teardown
    @files.remove
    @chunks.remove
  end

  context "When reading:" do
    setup do
      @data = "CHUNKS" * 50000
      @grid = GridFileSystem.new(@db)
      @grid.open('sample', 'w') do |f|
        f.write @data
      end

      # Read back through a fresh instance rather than the one that wrote.
      @grid = GridFileSystem.new(@db)
    end

    should "read sample data" do
      data = @grid.open('sample', 'r') { |f| f.read }
      assert_equal @data.length, data.length
    end

    should "return an empty string if length is zero" do
      data = @grid.open('sample', 'r') { |f| f.read(0) }
      assert_equal '', data
    end

    should "return the first n bytes" do
      data = @grid.open('sample', 'r') {|f| f.read(288888) }
      assert_equal 288888, data.length
      assert_equal @data[0...288888], data
    end

    should "return the first n bytes even with an offset" do
      data = @grid.open('sample', 'r') do |f|
        f.seek(1000)
        f.read(288888)
      end
      assert_equal 288888, data.length
      assert_equal @data[1000...289888], data
    end
  end

  context "When writing:" do
    setup do
      @data = "BYTES" * 50000
      @grid = GridFileSystem.new(@db)
      @grid.open('sample', 'w') do |f|
        f.write @data
      end
    end

    should "read sample data" do
      data = @grid.open('sample', 'r') { |f| f.read }
      assert_equal @data.length, data.length
    end

    should "return the total number of bytes written" do
      data = 'a' * 300000
      assert_equal 300000, @grid.open('write', 'w') {|f| f.write(data) }
    end

    should "more read sample data" do
      data = @grid.open('sample', 'r') { |f| f.read }
      assert_equal @data.length, data.length
    end

    should "raise exception if not opened for write" do
      assert_raise GridError do
        @grid.open('io', 'r') { |f| f.write('hello') }
      end
    end

    context "and when overwriting the file" do
      setup do
        # Handle on the original version, opened before the second write.
        @old = @grid.open('sample', 'r')

        @new_data = "DATA" * 1000
        @grid.open('sample', 'w') do |f|
          f.write @new_data
        end

        # Handle opened after the second write.
        @new = @grid.open('sample', 'r')
      end

      should "have a newer upload date" do
        assert @new.upload_date > @old.upload_date
      end

      should "have a different files_id" do
        assert_not_equal @new.files_id, @old.files_id
      end

      should "contain the new data" do
        assert_equal @new_data, @new.read
      end
    end
  end

  context "When writing chunks:" do
    setup do
      data = "B" * 50000
      @grid = GridFileSystem.new(@db)
      @grid.open('sample', 'w', :chunk_size => 1000) do |f|
        f.write data
      end
    end

    should "write the correct number of chunks" do
      # 50000 bytes at a chunk size of 1000 bytes => 50 chunks.
      file   = @files.find_one({:filename => 'sample'})
      chunks = @chunks.find({'files_id' => file['_id']}).to_a
      assert_equal 50, chunks.length
    end
  end

  context "Positioning:" do
    setup do
      # 12 + 5000 + 8 + 1000 + 1 bytes, written with 1000-byte chunks so the
      # seeks below land both inside a chunk and across chunk boundaries.
      data = 'hello, world' + '1' * 5000 + 'goodbye!' + '2' * 1000 + '!'
      @grid = GridFileSystem.new(@db)
      @grid.open('hello', 'w', :chunk_size => 1000) do |f|
        f.write data
      end
    end

    should "seek within chunks" do
      @grid.open('hello', 'r') do |f|
        f.seek(0)
        assert_equal 'h', f.read(1)
        f.seek(7)
        assert_equal 'w', f.read(1)
        f.seek(4)
        assert_equal 'o', f.read(1)

        f.seek(0)
        f.seek(7, IO::SEEK_CUR)
        assert_equal 'w', f.read(1)
        f.seek(-1, IO::SEEK_CUR)
        assert_equal ' ', f.read(1)
        f.seek(-4, IO::SEEK_CUR)
        assert_equal 'l', f.read(1)
        f.seek(3, IO::SEEK_CUR)
        assert_equal ',', f.read(1)
      end
    end

    should "seek between chunks" do
      @grid.open('hello', 'r') do |f|
        f.seek(1000)
        assert_equal '11111', f.read(5)

        f.seek(5009)
        assert_equal '111goodbye!222', f.read(14)

        f.seek(-1, IO::SEEK_END)
        assert_equal '!', f.read(1)
        f.seek(-6, IO::SEEK_END)
        assert_equal '2', f.read(1)
      end
    end

    should "tell the current position" do
      @grid.open('hello', 'r') do |f|
        assert_equal 0, f.tell
        f.seek(999)
        assert_equal 999, f.tell
      end
    end

    should "seek only in read mode" do
      assert_raise GridError do
        @grid.open('hello', 'w') {|f| f.seek(0) }
      end
    end
  end
end

View File

@ -22,15 +22,14 @@ class GridIOTest < Test::Unit::TestCase
end end
should "set default 256k chunk size" do should "set default 256k chunk size" do
file = GridIO.new(@files, @chunks, @filename, @mode) file = GridIO.new(@files, @chunks, @filename, @mode, false)
assert_equal 256 * 1024, file.chunk_size assert_equal 256 * 1024, file.chunk_size
end end
should "set chunk size" do should "set chunk size" do
file = GridIO.new(@files, @chunks, @filename, @mode, :chunk_size => 1000) file = GridIO.new(@files, @chunks, @filename, @mode, false, :chunk_size => 1000)
assert_equal 1000, file.chunk_size assert_equal 1000, file.chunk_size
end end
end end
end end