prevent catastrophic failures

2011-08-31 17:24:43 -04:00 · 2011-08-31 17:24:43 -04:00 · 16809a69dc
commit 16809a69dc
parent 95cbf09313
3 changed files with 28 additions and 0 deletions
--- a/lib/hydra/master.rb
+++ b/lib/hydra/master.rb
@@ -201,6 +201,7 @@ module Hydra #:nodoc:
             @workers << worker if worker
           end
          if worker
+            dead_count = 0
            while true
              begin
                message = worker[:io].gets
@@ -209,6 +210,9 @@ module Hydra #:nodoc:
                # SSH gives us back echoes, so we need to ignore our own messages
                if message and !message.class.to_s.index("Worker").nil?
                  message.handle(self, worker)
+                else
+                  dead_count += 1
+                  raise IOError if dead_count > 100
                end
              rescue IOError
                trace "lost Worker [#{worker.inspect}]"
--- a/lib/hydra/runner.rb
+++ b/lib/hydra/runner.rb
@@ -11,6 +11,8 @@ module Hydra #:nodoc:
    traceable('RUNNER')

    DEFAULT_LOG_FILE = 'hydra-runner.log'
+    PING_COUNT_FAILURE_TIME = 5
+    WAIT_BETWEEN_PING = 0.1

    # Boot up a runner. It takes an IO object (generally a pipe from its
    # parent) to send it messages on which files to execute.
@@ -102,6 +104,7 @@ module Hydra #:nodoc:
            message.handle(self)
          else
            @io.write Ping.new
+            sleep WAIT_BETWEEN_PING
          end
        rescue IOError => ex
          trace "Runner lost Worker"
--- a/test/master_test.rb
+++ b/test/master_test.rb
@@ -273,6 +273,27 @@ class MasterTest < Test::Unit::TestCase

        assert_file_exists target_file
      end
+
+      should "not die horribly when the host cannot be reached" do
+        capture_stderr do # redirect stderr
+          @pid = Process.fork do
+            Hydra::Master.new(
+              :files => [test_file],
+              :autosort => false,
+              :listeners => [@master_listener],
+              :runner_listeners => [@runner_listener],
+              :workers => [{
+                :type => :ssh,
+                :connect => 'sdlsdkjfhadsfjsd',
+                :directory => remote_dir_path,
+                :runners => 1
+              }],
+              :verbose => false
+            )
+          end
+        end
+        Process.waitpid @pid
+      end
    end
  end