Merge pull request #710 from spiliopoulos/fix_issue_702

Fix issue 702
Selwin Ong 9 years ago committed by GitHub
commit 8da79c1aa7

@@ -507,13 +507,9 @@ class Worker(object):
self.log.debug('Sent heartbeat to prevent worker timeout. '
'Next one should arrive within {0} seconds.'.format(timeout))
def execute_job(self, job, queue):
def fork_work_horse(self, job, queue):
"""Spawns a work horse to perform the actual work and passes it a job.
The worker will wait for the work horse and make sure it executes
within the given timeout bounds, or will end the work horse with
SIGALRM.
"""
self.set_state('busy')
child_pid = os.fork()
os.environ['RQ_WORKER_ID'] = self.name
os.environ['RQ_JOB_ID'] = job.id
@@ -522,20 +518,65 @@ class Worker(object):
else:
self._horse_pid = child_pid
self.procline('Forked {0} at {1}'.format(child_pid, time.time()))
while True:
try:
os.waitpid(child_pid, 0)
self.set_state('idle')
break
except OSError as e:
# In case we encountered an OSError due to EINTR (which is
# caused by a SIGINT or SIGTERM signal during
# os.waitpid()), we simply ignore it and enter the next
# iteration of the loop, waiting for the child to end. In
# any other case, this is some other unexpected OS error,
# which we don't want to catch, so we re-raise those ones.
if e.errno != errno.EINTR:
raise
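The loop above is the standard EINTR-retry idiom: os.waitpid() can be interrupted by a signal before the child exits, in which case it raises OSError with errno.EINTR and simply needs to be called again. A minimal, self-contained sketch of the same pattern as a standalone helper (the name wait_for_child is hypothetical, not part of this patch):

    import errno
    import os

    def wait_for_child(pid):
        # Block until the child with the given pid exits, retrying whenever
        # os.waitpid() is interrupted by a signal (SIGINT, SIGTERM, ...).
        while True:
            try:
                return os.waitpid(pid, 0)  # returns (pid, raw exit status)
            except OSError as e:
                if e.errno != errno.EINTR:
                    raise  # any other OS error is unexpected; re-raise it

On Python 3.5+, PEP 475 retries interrupted system calls automatically, so the explicit loop mainly matters on Python 2, which rq still supported at the time of this patch.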
def monitor_work_horse(self, job):
"""The worker will monitor the work horse and make sure that it
either executes successfully or that the job's status is set to
failed.
"""
while True:
try:
_, ret_val = os.waitpid(self._horse_pid, 0)
if ret_val != os.EX_OK:
job_status = job.get_status()
if job_status is None:
# Job completed and its ttl has expired
break
if job_status not in [JobStatus.FINISHED, JobStatus.FAILED]:
with self.connection._pipeline() as pipeline:
self.handle_job_failure(
job=job,
pipeline=pipeline
)
try:
pipeline.execute()
except Exception:
pass
# Unhandled failure: move the job to the failed queue
self.log.warning(
'Moving job to {0!r} queue'.format(
self.failed_queue.name
)
)
self.failed_queue.quarantine(
job,
exc_info=(
"Work-horse process "
"was terminated unexpectedly"
)
)
break
except OSError as e:
# In case we encountered an OSError due to EINTR (which is
# caused by a SIGINT or SIGTERM signal during
# os.waitpid()), we simply ignore it and enter the next
# iteration of the loop, waiting for the child to end. In
# any other case, this is some other unexpected OS error,
# which we don't want to catch, so we re-raise those ones.
if e.errno != errno.EINTR:
raise
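Note that ret_val in monitor_work_horse is the raw status word returned as the second element of the os.waitpid() tuple, and os.EX_OK is 0: any non-zero value means the horse either exited with a non-zero code or was killed by a signal (as the new test does with SIGKILL). A sketch of how that status word can be decoded, for illustration only; the patch itself only needs the zero/non-zero distinction:

    import os

    def describe_horse_exit(status):
        # Decode the raw status word from os.waitpid().
        if os.WIFEXITED(status):
            return 'exited with code {0}'.format(os.WEXITSTATUS(status))
        if os.WIFSIGNALED(status):
            return 'terminated by signal {0}'.format(os.WTERMSIG(status))
        return 'stopped or resumed'  # WIFSTOPPED / WIFCONTINUED cases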
def execute_job(self, job, queue):
"""Spawns a work horse to perform the actual work and passes it a job.
The worker will wait for the work horse and make sure it executes
within the given timeout bounds, or will end the work horse with
SIGALRM.
"""
self.set_state('busy')
self.fork_work_horse(job, queue)
self.monitor_work_horse(job)
self.set_state('idle')
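With this split, execute_job reduces to fork-then-monitor, and each phase can be driven independently, which is exactly what the new test below relies on. A sketch of the calling pattern, assuming an already-dequeued job and queue on a worker w:

    w.fork_work_horse(job, queue)  # returns in the parent; w._horse_pid is now set
    w.monitor_work_horse(job)      # blocks until the horse exits, failing the job if needed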
def main_work_horse(self, job, queue):
"""This is the entry point of the newly spawned work horse."""
@@ -584,6 +625,27 @@ class Worker(object):
msg = 'Processing {0} from {1} since {2}'
self.procline(msg.format(job.func_name, job.origin, time.time()))
def handle_job_failure(
self,
job,
started_job_registry=None,
pipeline=None
):
"""Handles the failure of an executing job by:
1. Setting the job status to failed
2. Removing the job from the started_job_registry
3. Setting the worker's current job to None
"""
if started_job_registry is None:
started_job_registry = StartedJobRegistry(
job.origin,
self.connection
)
job.set_status(JobStatus.FAILED, pipeline=pipeline)
started_job_registry.remove(job, pipeline=pipeline)
self.set_current_job_id(None, pipeline=pipeline)
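handle_job_failure only queues its three updates onto the supplied pipeline; executing the pipeline is left to the caller, so the status change, registry removal, and current-job reset are sent to Redis together. A minimal sketch of the calling pattern, mirroring the monitor_work_horse hunk above:

    with worker.connection._pipeline() as pipeline:
        worker.handle_job_failure(job=job, pipeline=pipeline)
        pipeline.execute()  # all three updates are applied in one round trip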
def perform_job(self, job, queue):
"""Performs the actual work of a job. Will/should only be called
inside the work horse's process.
@@ -624,9 +686,11 @@ class Worker(object):
pipeline.execute()
except Exception:
job.set_status(JobStatus.FAILED, pipeline=pipeline)
started_job_registry.remove(job, pipeline=pipeline)
self.set_current_job_id(None, pipeline=pipeline)
self.handle_job_failure(
job=job,
started_job_registry=started_job_registry,
pipeline=pipeline
)
try:
pipeline.execute()
except Exception:

@@ -14,7 +14,8 @@ import subprocess
from tests import RQTestCase, slow
from tests.fixtures import (create_file, create_file_after_timeout,
div_by_zero, do_nothing, say_hello, say_pid,
run_dummy_heroku_worker, access_self)
run_dummy_heroku_worker, access_self,
long_running_job)
from tests.helpers import strip_microseconds
from rq import (get_failed_queue, Queue, SimpleWorker, Worker,
@@ -577,6 +578,10 @@ def kill_worker(pid, double_kill):
time.sleep(0.5)
os.kill(pid, signal.SIGTERM)
def wait_and_kill_work_horse(pid, time_to_wait=0.0):
time.sleep(time_to_wait)
os.kill(pid, signal.SIGKILL)
class TimeoutTestCase:
def setUp(self):
@@ -649,6 +654,30 @@ class WorkerShutdownTestCase(TimeoutTestCase, RQTestCase):
self.assertIsNotNone(shutdown_requested_date)
self.assertEqual(type(shutdown_requested_date).__name__, 'datetime')
@slow
def test_work_horse_death_sets_job_failed(self):
"""A worker with an ongoing job whose work horse dies unexpectedly (before
completing the job) should set the job's status to FAILED
"""
fooq = Queue('foo')
failed_q = get_failed_queue()
self.assertEqual(failed_q.count, 0)
self.assertEqual(fooq.count, 0)
w = Worker(fooq)
sentinel_file = '/tmp/.rq_sentinel_work_horse_death'
if os.path.exists(sentinel_file):
os.remove(sentinel_file)
fooq.enqueue(create_file_after_timeout, sentinel_file, 100)
job, queue = w.dequeue_job_and_maintain_ttl(5)
w.fork_work_horse(job, queue)
p = Process(target=wait_and_kill_work_horse, args=(w._horse_pid, 0.5))
p.start()
w.monitor_work_horse(job)
job_status = job.get_status()
p.join(1)
self.assertEqual(job_status, JobStatus.FAILED)
self.assertEqual(failed_q.count, 1)
self.assertEqual(fooq.count, 0)
def schedule_access_self():
q = Queue('default', connection=get_current_connection())
