Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[flake8]
exclude=build,.git,.ve,.hypothesis, ./venv
ignore=E123,E128,E265,E501,W601,W293,E741
ignore=E123,E128,E265,E501,W601,W293,E741,W503
max-line-length = 119
74 changes: 57 additions & 17 deletions tests/tests_links.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import requests
import time
import urllib3

from tests.browser_test_case import BrowserTestCase

Expand All @@ -11,6 +13,9 @@
prefix = os.path.join(os.path.dirname(__file__), "data")


urllib3.disable_warnings()


@tag("link-runner")
@override_settings(
PROVENANCE_JSON=os.path.join(prefix, "data.json"), DISABLE_COOKIE_POPUP=True
Expand All @@ -21,6 +26,51 @@ def test_links(self):

links_checked = {}

def is_blocked_by_cloudflare(response: requests.Response) -> bool:
    """Report whether ``response`` carries Cloudflare's characteristic headers.

    The response counts as Cloudflare-served when its ``Server`` header
    mentions "cloudflare" (case-insensitively) or when a ``CF-RAY`` header
    is present at all.
    """
    server_banner = response.headers.get("Server", "").lower()
    return "cloudflare" in server_banner or "CF-RAY" in response.headers

def check_single_link(link: str, timeout: float = 30) -> int:
    """Return the HTTP status code obtained for ``link``.

    A HEAD request is tried first. If it answers 403, the link is retried
    after a short pause with a streamed GET. A 403 that still carries
    Cloudflare's headers is treated as a pass (200) because the link cannot
    be verified from an automated client.

    Args:
        link: The URL to check.
        timeout: Seconds to wait for each request before giving up. Without
            a timeout ``requests`` waits indefinitely on a stalled server.

    Returns:
        The HTTP status code, 200 for links skipped because of Cloudflare,
        or 0 (not an HTTP response code) if the request raised, so that it
        gets displayed along with the other errors at the end.
    """
    # Some sites reject connection without a user agent.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
    }

    try:
        # First, just try a HEAD request. If this is anything but a 403 then return the status code.
        r = requests.head(link, headers=headers, verify=False, timeout=timeout)
        if r.status_code != 403:
            return r.status_code

        # We have received 403 Forbidden and need to try again. Delay by two seconds and try again
        # with GET which might look less suspicious but using stream=True still has a low impact.
        time.sleep(2)

        # The context manager closes the streamed response; with stream=True the
        # connection is otherwise held open until the body is consumed.
        with requests.get(
            link, headers=headers, verify=False, stream=True, timeout=timeout
        ) as r:
            # Skip the test if we are blocked by Cloudflare.
            if r.status_code == 403 and is_blocked_by_cloudflare(r):
                print(f"{link} is blocked by Cloudflare - skipping")
                return 200

            return r.status_code

    except Exception as e:
        # Deliberately broad: any failure (usually the request timing out) is
        # reported as a broken link rather than aborting the whole link run.
        print(f"{link} could not be checked: {e}")
        return 0

def check_page_for_broken_links(page):
# We use selenium for this kind of test because it's a convenient way to manipulate the dom
self.get(page)
Expand All @@ -42,7 +92,8 @@ def check_page_for_broken_links(page):
for a in self.browser.find_elements(By.TAG_NAME, "a"):
# Datatables quirk with empty <a> tags, select2 quirk with same issue
if (
a.get_attribute("aria-controls") or a.get_attribute("class") == "remove-select2-option"
a.get_attribute("aria-controls")
or a.get_attribute("class") == "remove-select2-option"
):
continue

Expand All @@ -59,22 +110,8 @@ def check_page_for_broken_links(page):
for link in links:

if link not in links_checked.keys():
try:
# Some sites reject connection without a user agent
r = requests.head(
link,
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
},
verify=False,
)
status_code = r.status_code
except Exception as e:
# Set status code to 0 (not a HTTP response code) so it gets displayed along with the other errors at the end.
# This is usually triggered by the request timing out.
print(e)
status_code = 0

status_code = check_single_link(link)
links_checked[link] = status_code
if status_code < 200 or status_code > 399:
broken = True
Expand Down Expand Up @@ -105,7 +142,10 @@ def check_page_for_broken_links(page):
for page in pages_to_find_links:
r = requests.head(f"{self.live_server_url}{page}")
status_code = r.status_code
self.assertFalse((status_code < 200 or status_code > 399), f"{self.live_server_url}{page} error {status_code}")
self.assertFalse(
(status_code < 200 or status_code > 399),
f"{self.live_server_url}{page} error {status_code}",
)

# Test the links on the pages
for page in pages_to_find_links:
Expand Down
Loading