Enhance worker termination logic (#1729)

* enhance worker termination logic and allow passing custom exc_info in failure callback

* handle ret_val None

* fix unbound variable

* typing

* docs

* Update exceptions.md

* rename

* typing

* rename

* Update exceptions.md

* revert test change
Branch: main
Rony Lutsky committed 2 years ago (via GitHub)
parent a985445486 · commit bba781d288

@@ -119,3 +119,28 @@ use a custom exception handler that doesn't fall through, for example:
 def black_hole(job, *exc_info):
     return False
 ```
+
+## Work Horse Killed Handler
+
+_New in version 1.13.0._
+
+In addition to job exception handler(s), RQ supports registering a handler for unexpected work horse termination.
+This handler is called when a work horse is unexpectedly terminated, for example due to OOM.
+
+This is how you set a work horse termination handler on an RQ worker:
+
+```python
+from my_handlers import my_work_horse_killed_handler
+
+w = Worker([q], work_horse_killed_handler=my_work_horse_killed_handler)
+```
+
+The handler itself is a function that takes the following parameters: `job`,
+`retpid`, `ret_val` and `rusage`:
+
+```python
+from resource import struct_rusage
+from rq.job import Job
+
+def my_work_horse_killed_handler(job: Job, retpid: int, ret_val: int, rusage: struct_rusage):
+    # do your thing here, for example set job.retries_left to 0
+    ...
+```

@@ -1,7 +1,9 @@
+import contextlib
 import errno
 import logging
 import os
 import random
+import resource
 import signal
 import socket
 import sys
@@ -231,6 +233,7 @@ class Worker:
         disable_default_exception_handler: bool = False,
         prepare_for_work: bool = True,
         serializer=None,
+        work_horse_killed_handler: Optional[Callable[[Job, int, int, resource.struct_rusage], None]] = None
     ):  # noqa
         self.default_result_ttl = default_result_ttl
         self.worker_ttl = default_worker_ttl
@@ -258,6 +261,7 @@ class Worker:
         self.validate_queues()
         self._ordered_queues = self.queues[:]
         self._exc_handlers: List[Callable] = []
+        self._work_horse_killed_handler = work_horse_killed_handler
         self._state: str = 'starting'
         self._is_horse: bool = False
@@ -549,18 +553,14 @@ class Worker:
         else:
             raise

-    def wait_for_horse(self) -> Tuple[Optional[int], Optional[int]]:
+    def wait_for_horse(self) -> Tuple[Optional[int], Optional[int], Optional[resource.struct_rusage]]:
         """Waits for the horse process to complete.
         Uses `0` as argument as to include "any child in the process group of the current process".
         """
-        pid = None
-        stat = None
-        try:
-            pid, stat = os.waitpid(self.horse_pid, 0)
-        except ChildProcessError:
-            # ChildProcessError: [Errno 10] No child processes
-            pass
-        return pid, stat
+        pid = stat = rusage = None
+        with contextlib.suppress(ChildProcessError):  # ChildProcessError: [Errno 10] No child processes
+            pid, stat, rusage = os.wait4(self.horse_pid, 0)
+        return pid, stat, rusage

     def request_force_stop(self, signum, frame):
         """Terminates the application (cold shutdown).
@@ -1058,12 +1058,12 @@ class Worker:
             job (Job): _description_
             queue (Queue): _description_
         """
-        ret_val = None
+        retpid = ret_val = rusage = None
         job.started_at = utcnow()
         while True:
             try:
                 with UnixSignalDeathPenalty(self.job_monitoring_interval, HorseMonitorTimeoutException):
-                    retpid, ret_val = self.wait_for_horse()
+                    retpid, ret_val, rusage = self.wait_for_horse()
                 break
             except HorseMonitorTimeoutException:
                 # Horse has not exited yet and is still running.
@@ -1109,14 +1109,14 @@ class Worker:
             job.ended_at = utcnow()

             # Unhandled failure: move the job to the failed queue
-            self.log.warning(
-                ('Moving job to FailedJobRegistry ' '(work-horse terminated unexpectedly; waitpid returned {})').format(
-                    ret_val
-                )
-            )
+            signal_msg = f" (signal {os.WTERMSIG(ret_val)})" if ret_val and os.WIFSIGNALED(ret_val) else ""
+            exc_string = f"Work-horse terminated unexpectedly; waitpid returned {ret_val}{signal_msg}; "
+            self.log.warning(f'Moving job to FailedJobRegistry ({exc_string})')
+
+            self.handle_work_horse_killed(job, retpid, ret_val, rusage)
             self.handle_job_failure(
-                job, queue=queue, exc_string="Work-horse was terminated unexpectedly " "(waitpid returned %s)" % ret_val
+                job, queue=queue,
+                exc_string=exc_string
             )

     def execute_job(self, job: 'Job', queue: 'Queue'):
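
For readers unfamiliar with the values that `os.wait4` returns to the hunks above (this note is editorial, not part of the diff): the second element is a raw wait status that the `os.WIF*` helpers decode, and the third is a `resource.struct_rusage` for the child. A minimal standalone sketch on Unix, forking a throwaway child just so there is something to wait on:

```python
import os

# Fork a child that exits immediately so we have a process to wait for.
child_pid = os.fork()
if child_pid == 0:  # child process
    os._exit(7)

pid, status, rusage = os.wait4(child_pid, 0)

if os.WIFSIGNALED(status):
    # Killed by a signal, e.g. SIGKILL from the OOM killer
    print(f"child {pid} was killed by signal {os.WTERMSIG(status)}")
elif os.WIFEXITED(status):
    print(f"child {pid} exited with code {os.WEXITSTATUS(status)}")

print(f"peak child RSS: {rusage.ru_maxrss}")  # kilobytes on Linux, bytes on macOS
```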
@@ -1323,7 +1323,7 @@ class Worker:
         with self.death_penalty_class(CALLBACK_TIMEOUT, JobTimeoutException, job_id=job.id):
             job.success_callback(job, self.connection, result)

-    def execute_failure_callback(self, job: 'Job'):
+    def execute_failure_callback(self, job: 'Job', *exc_info):
         """Executes failure_callback with timeout

         Args:
@@ -1332,7 +1332,7 @@ class Worker:
         self.log.debug(f"Running failure callbacks for {job.id}")
         job.heartbeat(utcnow(), CALLBACK_TIMEOUT)
         with self.death_penalty_class(CALLBACK_TIMEOUT, JobTimeoutException, job_id=job.id):
-            job.failure_callback(job, self.connection, *sys.exc_info())
+            job.failure_callback(job, self.connection, *exc_info)

     def perform_job(self, job: 'Job', queue: 'Queue') -> bool:
         """Performs the actual work of a job. Will/should only be called
@@ -1377,11 +1377,11 @@ class Worker:
             if job.failure_callback:
                 try:
-                    self.execute_failure_callback(job)
+                    self.execute_failure_callback(job, *exc_info)
                 except:  # noqa
-                    self.log.error('Worker %s: error while executing failure callback', self.key, exc_info=True)
                     exc_info = sys.exc_info()
                     exc_string = ''.join(traceback.format_exception(*exc_info))
+                    self.log.error('Worker %s: error while executing failure callback', self.key, exc_info=exc_info)
                     self.handle_job_failure(
                         job=job, exc_string=exc_string, queue=queue, started_job_registry=started_job_registry
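
As the hunks above show, the worker now forwards the exception info it already captured (`*exc_info`) into the failure callback machinery instead of re-reading `sys.exc_info()` there. A failure callback still receives the usual `(job, connection, type, value, traceback)` arguments; a hedged sketch, with made-up function and task names:

```python
from redis import Redis
from rq import Queue
from rq.job import Job


def report_failure(job: Job, connection: Redis, exc_type, exc_value, tb):
    # exc_type/exc_value/tb are the exc_info the worker captured for this job
    print(f"job {job.id} failed: {exc_type.__name__}: {exc_value}")


def flaky_task():
    raise RuntimeError("boom")


q = Queue(connection=Redis())
q.enqueue(flaky_task, on_failure=report_failure)
```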
@@ -1453,6 +1453,12 @@ class Worker:
         """Pops the latest exception handler off of the exc handler stack."""
         return self._exc_handlers.pop()

+    def handle_work_horse_killed(self, job, retpid, ret_val, rusage):
+        if self._work_horse_killed_handler is None:
+            return
+
+        self._work_horse_killed_handler(job, retpid, ret_val, rusage)
+
     def __eq__(self, other):
         """Equality does not take the database/connection into account"""
         if not isinstance(other, self.__class__):

@@ -1224,13 +1224,14 @@ class WorkerShutdownTestCase(TimeoutTestCase, RQTestCase):
         w.prepare_job_execution(job)
         w.fork_work_horse(job, queue)
         job.timeout = 5
         time.sleep(1)

         with open(sentinel_file) as f:
             subprocess_pid = int(f.read().strip())
         self.assertTrue(psutil.pid_exists(subprocess_pid))

-        w.monitor_work_horse(job, queue)
+        with mock.patch.object(w, 'handle_work_horse_killed', wraps=w.handle_work_horse_killed) as mocked:
+            w.monitor_work_horse(job, queue)
+            self.assertEqual(mocked.call_count, 1)

         fudge_factor = 1
         total_time = w.job_monitoring_interval + 65 + fudge_factor
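
The test uses `mock.patch.object(..., wraps=...)`, which replaces the method with a spy that records calls while still delegating to the real implementation. A small self-contained illustration of that pattern (the class here is invented for the example):

```python
from unittest import mock


class Greeter:
    def greet(self, name):
        return f"hello {name}"


g = Greeter()
with mock.patch.object(g, "greet", wraps=g.greet) as spied:
    assert g.greet("world") == "hello world"  # real behaviour still runs
    assert spied.call_count == 1              # and the call was recorded
```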
