From f00905e8b4b7b3b3c3e0c4ec37f95d2e6d38b55e Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 6 May 2025 15:21:16 -0300 Subject: [PATCH 1/2] feat(collect): add error rate threshold --- data_registry/process_manager/task/collect.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/data_registry/process_manager/task/collect.py b/data_registry/process_manager/task/collect.py index e1e279de..ad5828e2 100644 --- a/data_registry/process_manager/task/collect.py +++ b/data_registry/process_manager/task/collect.py @@ -144,6 +144,9 @@ def get_status(self): url = f"https://{Site.objects.get_current().domain}{path}" logger.warning("%s has warnings: %s\n%s\n", self, url, "\n".join(messages)) + if scrapy_log.error_rate > 0.15: # 15% + raise UnexpectedError(f"The crawl had a {scrapy_log.error_rate} error rate") + return Task.Status.COMPLETED raise RecoverableError(f"Unable to find status of Scrapyd job {scrapyd_job_id}") From 5e36f69f7802ce266270541164fdb1d1c2a917c6 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 6 May 2025 20:25:25 -0300 Subject: [PATCH 2/2] fix: use IrrecoverableError instead of Unexpected for kingfisher collect --- data_registry/process_manager/task/collect.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_registry/process_manager/task/collect.py b/data_registry/process_manager/task/collect.py index ad5828e2..f81d160e 100644 --- a/data_registry/process_manager/task/collect.py +++ b/data_registry/process_manager/task/collect.py @@ -10,7 +10,7 @@ from django.urls import reverse from scrapyloganalyzer import ScrapyLogFile -from data_registry.exceptions import ConfigurationError, RecoverableError, UnexpectedError +from data_registry.exceptions import ConfigurationError, IrrecoverableError, RecoverableError, UnexpectedError from data_registry.models import Job, Task from data_registry.process_manager.util import TaskManager, skip_if_not_started from data_registry.util import CHANGE, scrapyd_url @@ -145,7 +145,7 @@ def get_status(self): logger.warning("%s has warnings: %s\n%s\n", self, url, "\n".join(messages)) if scrapy_log.error_rate > 0.15: # 15% - raise UnexpectedError(f"The crawl had a {scrapy_log.error_rate} error rate") + raise IrrecoverableError(f"The crawl had a {scrapy_log.error_rate} error rate") return Task.Status.COMPLETED