From 558d8483a40cc57971503ae0cd8c34e97b4ea98c Mon Sep 17 00:00:00 2001
From: Chris Arridge
Date: Wed, 13 May 2026 18:13:15 +0100
Subject: [PATCH 1/2] config: update flake8 config to match black formatting

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index a6c2f05e..d6dc595a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,4 +1,4 @@
 [flake8]
 exclude=build,.git,.ve,.hypothesis, ./venv
-ignore=E123,E128,E265,E501,W601,W293,E741
+ignore=E123,E128,E265,E501,W601,W293,E741,W503
 max-line-length = 119

From 8516511ccc98405d5366c2dde32659fb7c6bc21c Mon Sep 17 00:00:00 2001
From: Chris Arridge
Date: Wed, 29 Apr 2026 12:09:05 +0100
Subject: [PATCH 2/2] tests: tests_links: fix for when tests fail due to blocking by Cloudflare

This commit attempts to resolve link tests failing due to blocking by
Cloudflare. If a link check fails with 403 then the request is tried
again after a delay, and if it still fails with 403 the response is
interrogated to see if it has come from Cloudflare. If so, a warning is
printed and the test for that link is allowed to pass.

Every request carries a timeout, the streamed GET response is closed once
its headers have been inspected, and request errors are printed so that a
status code of 0 can be diagnosed.
---
NOTE(review): this series was rebuilt from a whitespace-collapsed copy.
The indentation of context/removed lines and the position of blank
context lines are inferred from the hunk counts - confirm against the
tree. The blob ids on the "index" line predate the review changes to the
added lines; regenerate with `git format-patch` before applying.

 tests/tests_links.py | 74 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 57 insertions(+), 17 deletions(-)

diff --git a/tests/tests_links.py b/tests/tests_links.py
index 019b8d32..71591aa7 100644
--- a/tests/tests_links.py
+++ b/tests/tests_links.py
@@ -1,5 +1,7 @@
 import os
 import requests
+import time
+import urllib3
 
 from tests.browser_test_case import BrowserTestCase
 
@@ -11,6 +13,9 @@
 prefix = os.path.join(os.path.dirname(__file__), "data")
 
 
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+
 @tag("link-runner")
 @override_settings(
     PROVENANCE_JSON=os.path.join(prefix, "data.json"), DISABLE_COOKIE_POPUP=True
@@ -21,6 +26,51 @@ def test_links(self):
 
         links_checked = {}
 
+        def is_blocked_by_cloudflare(response: requests.Response) -> bool:
+            """Return True if the response was served via Cloudflare (Server or CF-RAY header present)."""
+            return (
+                "cloudflare" in response.headers.get("Server", "").lower()
+                or "CF-RAY" in response.headers
+            )
+
+        def check_single_link(link: str) -> int:
+            """Return the HTTP status code for link; 0 on a request error, 200 if blocked by Cloudflare."""
+            # Some sites reject connection without a user agent.
+            headers = {
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
+            }
+            # requests has no default timeout; bound each request so a stalled site cannot hang the test.
+            timeout = 30
+
+            try:
+                # First, just try a HEAD request. If this is anything but a 403 then return the status code.
+                r = requests.head(link, headers=headers, verify=False, timeout=timeout)
+                if r.status_code != 403:
+                    return r.status_code
+
+                # We have received 403 Forbidden and need to try again. Delay by two seconds and try again
+                # with GET which might look less suspicious but using stream=True still has a low impact.
+                time.sleep(2)
+                # The context manager releases the streamed connection once the headers have been inspected.
+                with requests.get(
+                    link, headers=headers, verify=False, stream=True, timeout=timeout
+                ) as r:
+                    if r.status_code != 403:
+                        return r.status_code
+
+                    # Skip the test if we are blocked by Cloudflare.
+                    if is_blocked_by_cloudflare(r):
+                        print(f"{link} is blocked by Cloudflare - skipping")
+                        return 200
+
+                    return r.status_code
+
+            except Exception as e:
+                # Set status code to 0 (not a HTTP response code) so it gets displayed along with the other errors at the end.
+                # This is usually triggered by the request timing out.
+                print(f"{link}: {e}")
+                return 0
+
         def check_page_for_broken_links(page):
             # We use selenium for this kind of test because it's a convenient way to manipulate the dom
             self.get(page)
@@ -42,7 +92,8 @@ def check_page_for_broken_links(page):
             for a in self.browser.find_elements(By.TAG_NAME, "a"):
                 # Datatables quirk with empty tags, select2 quirk with same issue
                 if (
-                    a.get_attribute("aria-controls") or a.get_attribute("class") == "remove-select2-option"
+                    a.get_attribute("aria-controls")
+                    or a.get_attribute("class") == "remove-select2-option"
                 ):
                     continue
 
@@ -59,22 +110,8 @@ def check_page_for_broken_links(page):
 
             for link in links:
                 if link not in links_checked.keys():
-                    try:
-                        # Some sites reject connection without a user agent
-                        r = requests.head(
-                            link,
-                            headers={
-                                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
-                            },
-                            verify=False,
-                        )
-                        status_code = r.status_code
-                    except Exception as e:
-                        # Set status code to 0 (not a HTTP response code) so it gets displayed along with the other errors at the end.
-                        # This is usually triggered by the request timing out.
-                        print(e)
-                        status_code = 0
 
+                    status_code = check_single_link(link)
                     links_checked[link] = status_code
                     if status_code < 200 or status_code > 399:
                         broken = True
@@ -105,7 +142,10 @@ def check_page_for_broken_links(page):
         for page in pages_to_find_links:
             r = requests.head(f"{self.live_server_url}{page}")
             status_code = r.status_code
-            self.assertFalse((status_code < 200 or status_code > 399), f"{self.live_server_url}{page} error {status_code}")
+            self.assertFalse(
+                (status_code < 200 or status_code > 399),
+                f"{self.live_server_url}{page} error {status_code}",
+            )
 
         # Test the links on the pages
         for page in pages_to_find_links: