Add failure callback call to started job registry cleanup (#1824)

* Add started job registry cleanup job failure callback call * WIP - need to fix test * fix test * rename, tests and docs * better log message * use class name * Update registry.py
3 years ago · 0ba3971d55
parent 95558fcc1d
commit 0ba3971d55
5 changed files with 52 additions and 13 deletions
--- a/docs/docs/exceptions.md
+++ b/docs/docs/exceptions.md
@ -144,3 +144,11 @@ def my_work_horse_killed_handler(job: Job, retpid: int, ret_val: int, rusage: st
    # do your thing here, for example set job.retries_left to 0 
 ```
 ## Built-in Exceptions
 RQ exceptions you may receive in your job failure callbacks:
 ### AbandonedJobError
 This error means an unfinished job was collected by another worker's maintenance task.  
 This usually happens when a worker is busy with a job and is terminated before it finishes that job.  
 Another worker collects this job and moves it to the FailedJobRegistry.
--- a/rq/exceptions.py
+++ b/rq/exceptions.py
@ -30,3 +30,7 @@ class ShutDownImminentException(Exception):
 class TimeoutFormatError(Exception):
    pass
 class AbandonedJobError(Exception):
    pass
--- a/rq/registry.py
+++ b/rq/registry.py
@ -1,22 +1,30 @@
 import calendar
 import logging
 import traceback
 from rq.serializers import resolve_serializer
 import time
 from datetime import datetime, timedelta, timezone
 from typing import TYPE_CHECKING, Any, List, Optional, Type, Union
 from .timeouts import JobTimeoutException, UnixSignalDeathPenalty
 if TYPE_CHECKING:
    from redis import Redis
    from redis.client import Pipeline
 from .utils import as_text
 from .connections import resolve_connection
-from .defaults import DEFAULT_FAILURE_TTL
+from .defaults import DEFAULT_FAILURE_TTL, CALLBACK_TIMEOUT
-from .exceptions import InvalidJobOperation, NoSuchJobError
+from .exceptions import InvalidJobOperation, NoSuchJobError, AbandonedJobError
 from .job import Job, JobStatus
 from .queue import Queue
 from .utils import backend_class, current_timestamp
 logger = logging.getLogger("rq.registry")
 class BaseRegistry:
    """
    Base implementation of a job registry, implemented in Redis sorted set.
@ -202,9 +210,10 @@ class StartedJobRegistry(BaseRegistry):
    """
    key_template = 'rq:wip:{0}'
    death_penalty_class = UnixSignalDeathPenalty
    def cleanup(self, timestamp: Optional[float] = None):
-        """Remove expired jobs from registry and add them to FailedJobRegistry.
+        """Remove abandoned jobs from registry and add them to FailedJobRegistry.
        Removes jobs with an expiry time earlier than timestamp, specified as
        seconds since the Unix epoch. timestamp defaults to call time if
@ -226,6 +235,14 @@ class StartedJobRegistry(BaseRegistry):
                    except NoSuchJobError:
                        continue
                    if job.failure_callback:
                        try:
                            with self.death_penalty_class(CALLBACK_TIMEOUT, JobTimeoutException, job_id=job.id):
                                job.failure_callback(job, self.connection,
                                                     AbandonedJobError, AbandonedJobError(), traceback.extract_stack())
                        except:  # noqa
                            logger.exception('Registry %s: error while executing failure callback', self.key)
                    retry = job.retries_left and job.retries_left > 0
                    if retry:
@ -233,8 +250,11 @@ class StartedJobRegistry(BaseRegistry):
                        job.retry(queue, pipeline)
                    else:
                        exc_string = f"due to {AbandonedJobError.__name__}"
                        logger.warning(f'{self.__class__.__name__} cleanup: Moving job to {FailedJobRegistry.__name__} '
                                       f'({exc_string})')
                        job.set_status(JobStatus.FAILED)
-                        job._exc_info = "Moved to FailedJobRegistry at %s" % datetime.now()
+                        job._exc_info = f"Moved to {FailedJobRegistry.__name__}, {exc_string}, at {datetime.now()}"
                        job.save(pipeline=pipeline, include_meta=False)
                        job.cleanup(ttl=-1, pipeline=pipeline)
                        failed_job_registry.add(job, job.failure_ttl)
--- a/rq/worker.py
+++ b/rq/worker.py
@ -1346,6 +1346,9 @@ class Worker:
        Args:
            job (Job): The Job
        """
        if not job.failure_callback:
            return
        self.log.debug(f"Running failure callbacks for {job.id}")
        job.heartbeat(utcnow(), CALLBACK_TIMEOUT)
        with self.death_penalty_class(CALLBACK_TIMEOUT, JobTimeoutException, job_id=job.id):
@ -1392,13 +1395,12 @@ class Worker:
            exc_info = sys.exc_info()
            exc_string = ''.join(traceback.format_exception(*exc_info))
-            if job.failure_callback:
+            try:
-                try:
+                self.execute_failure_callback(job, *exc_info)
-                    self.execute_failure_callback(job, *exc_info)
+            except:  # noqa
-                except:  # noqa
+                exc_info = sys.exc_info()
-                    exc_info = sys.exc_info()
+                exc_string = ''.join(traceback.format_exception(*exc_info))
-                    exc_string = ''.join(traceback.format_exception(*exc_info))
+                self.log.error('Worker %s: error while executing failure callback', self.key, exc_info=exc_info)
                    self.log.error('Worker %s: error while executing failure callback', self.key, exc_info=exc_info)
            self.handle_job_failure(
                job=job, exc_string=exc_string, queue=queue, started_job_registry=started_job_registry
--- a/tests/test_registry.py
+++ b/tests/test_registry.py
@ -1,9 +1,12 @@
 from datetime import datetime, timedelta
 from unittest import mock
 from unittest.mock import PropertyMock, ANY
 from rq.serializers import JSONSerializer
 from rq.utils import as_text
 from rq.defaults import DEFAULT_FAILURE_TTL
-from rq.exceptions import InvalidJobOperation
+from rq.exceptions import InvalidJobOperation, AbandonedJobError
 from rq.job import Job, JobStatus, requeue_job
 from rq.queue import Queue
 from rq.utils import current_timestamp
@ -161,7 +164,9 @@ class TestRegistry(RQTestCase):
        self.assertNotIn(job, failed_job_registry)
        self.assertIn(job, self.registry)
-        self.registry.cleanup()
+        with mock.patch.object(Job, 'failure_callback', PropertyMock()) as mocked:
            self.registry.cleanup()
            mocked.return_value.assert_any_call(job, self.testconn, AbandonedJobError, ANY, ANY)
        self.assertIn(job.id, failed_job_registry)
        self.assertNotIn(job, self.registry)
        job.refresh()