-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_citations.py
More file actions
118 lines (92 loc) · 4.52 KB
/
test_citations.py
File metadata and controls
118 lines (92 loc) · 4.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""Citation-policy enforcement tests for LabCraft."""
from __future__ import annotations
import json
from pathlib import Path
# Directory one level above the folder that contains this test module.
ROOT = Path(__file__).resolve().parents[1]

# Every JSON file directly inside <ROOT>/data/parameters, in sorted order.
PARAMETER_FILES = sorted(ROOT.joinpath("data", "parameters").glob("*.json"))

# The ground_truth.json of each immediate sub-directory of <ROOT>/task_data.
GROUND_TRUTH_FILES = sorted(ROOT.joinpath("task_data").glob("*/ground_truth.json"))

# Tier labels a citation is allowed to carry.
ALLOWED_TIERS = {"Copper", "Bronze", "Silver", "Gold"}

# Ordering used when comparing tiers: a larger rank satisfies a smaller one.
TIER_RANK = {
    "Copper": 1,
    "Bronze": 2,
    "Silver": 3,
    "Gold": 4,
}
def _load_json(path: Path):
    """Parse the JSON document stored at ``path`` and return the decoded value.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's locale default encoding (non-ASCII author names or titles
    would otherwise decode differently, or fail, on some systems).

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)
def _assert_citation_shape(citation):
    """Assert that a single citation record has the required fields.

    Checks, in order:
      * ``tier`` is one of ``ALLOWED_TIERS``;
      * ``tier_justification`` is not blank;
      * at least one of ``doi`` / ``canonical_url`` is present and truthy;
      * a "Gold" citation has ``citation_count_approx`` of at least 100
        (a missing count is treated as 0).
    """
    assert citation["tier"] in ALLOWED_TIERS
    assert citation["tier_justification"].strip()
    assert any(citation.get(field) for field in ("doi", "canonical_url"))

    # Only Gold citations carry the citation-count requirement.
    if citation["tier"] != "Gold":
        return
    assert citation.get("citation_count_approx", 0) >= 100
def _tier_satisfies(citation, minimum_tier_required):
    """Return True when the citation's tier ranks at or above the required tier.

    Both tier names are looked up in ``TIER_RANK``; a name that is not a key
    there raises ``KeyError``.
    """
    achieved_rank = TIER_RANK[citation["tier"]]
    required_rank = TIER_RANK[minimum_tier_required]
    return required_rank <= achieved_rank
def _iter_ground_truth_citation_blocks(payload):
    """Yield every citation-bearing entry of a ground-truth payload.

    Entries of the ``decision_points`` list are produced first, followed by
    the values of the ``failure_diagnosis_map`` mapping. A key that is absent
    from ``payload`` simply contributes nothing.
    """
    for decision_point in payload.get("decision_points", []):
        yield decision_point
    for failure_item in payload.get("failure_diagnosis_map", {}).values():
        yield failure_item
def _source_reference_tokens(citation):
    """Build the lowercase strings that identify a citation in a SOURCES.md.

    For a DOI the result contains the bare identifier (any leading
    ``https://doi.org/`` or ``http://doi.org/`` removed) and the same
    identifier prefixed with ``doi.org/``. For a canonical URL it contains the
    URL without trailing slashes and that URL without its ``http(s)://``
    scheme. Callers search for these tokens in lowercased text.

    Args:
        citation: Mapping that may provide ``doi`` and/or ``canonical_url``.

    Returns:
        A list of non-empty tokens, DOI tokens first; empty when the citation
        provides no usable DOI or URL.
    """
    tokens = []

    doi = citation.get("doi")
    if doi:
        bare_doi = doi.lower().strip()
        bare_doi = bare_doi.removeprefix("https://doi.org/")
        bare_doi = bare_doi.removeprefix("http://doi.org/")
        # A blank DOI, or one consisting only of the resolver prefix, must not
        # produce the bare "doi.org/" token: it would match any text that
        # contains a DOI link and make the documentation check vacuous.
        if bare_doi:
            tokens.extend([bare_doi, "doi.org/{}".format(bare_doi)])

    canonical_url = citation.get("canonical_url")
    if canonical_url:
        full_url = canonical_url.lower().strip().rstrip("/")
        schemeless = full_url.removeprefix("https://").removeprefix("http://")
        # Same reasoning for URLs: a value that is only a scheme (for example
        # "https://") would otherwise leave "https:" as a match-anything token.
        if schemeless and schemeless not in ("https:", "http:"):
            tokens.append(full_url)
            tokens.append(schemeless)

    return tokens
def test_parameter_files_exist():
    """Fail when no parameter JSON file was discovered.

    Checks that loop over ``PARAMETER_FILES`` pass vacuously on an empty list,
    so an empty discovery result is reported here, together with the glob
    pattern that was searched.
    """
    assert PARAMETER_FILES, "no parameter files matched {}".format(
        ROOT / "data" / "parameters" / "*.json"
    )
def test_parameter_records_have_valid_citations():
    """Check the citations attached to every record of every parameter file.

    Each record must list at least one citation, every citation must pass
    ``_assert_citation_shape``, at least one citation must reach the record's
    ``minimum_tier_required``, and the record's ``tier_satisfied`` flag must
    be exactly ``True``.
    """
    for parameter_file in PARAMETER_FILES:
        records = _load_json(parameter_file).get("parameters", [])
        for record in records:
            cited = record.get("citations", [])
            assert cited, "{} missing citations".format(record["parameter_name"])

            for entry in cited:
                _assert_citation_shape(entry)

            required_tier = record["minimum_tier_required"]
            assert any(_tier_satisfies(entry, required_tier) for entry in cited)
            assert record.get("tier_satisfied") is True
def test_ground_truth_decision_points_have_citations():
    """Check the citations attached to every ground-truth decision point.

    At least one ground-truth file must have been discovered. Each decision
    point must list at least one citation, every citation must pass
    ``_assert_citation_shape``, and at least one citation must reach the
    decision point's ``minimum_tier_required``.
    """
    assert GROUND_TRUTH_FILES
    for ground_truth_file in GROUND_TRUTH_FILES:
        decision_points = _load_json(ground_truth_file).get("decision_points", [])
        for point in decision_points:
            cited = point.get("citations", [])
            assert cited, "{} missing citations".format(point["id"])

            for entry in cited:
                _assert_citation_shape(entry)

            required_tier = point["minimum_tier_required"]
            assert any(_tier_satisfies(entry, required_tier) for entry in cited)
def test_failure_maps_have_citations():
    """Check the citations attached to every failure-diagnosis entry.

    For each ground-truth file, every item of ``failure_diagnosis_map`` must
    list at least one citation and every citation must pass
    ``_assert_citation_shape``.
    """
    for ground_truth_file in GROUND_TRUTH_FILES:
        failure_map = _load_json(ground_truth_file).get("failure_diagnosis_map", {})
        for failure_id, failure_item in failure_map.items():
            cited = failure_item.get("citations", [])
            assert cited, "{} missing citations".format(failure_id)

            for entry in cited:
                _assert_citation_shape(entry)
def test_sources_file_documents_rejected_sources():
    """Check that every task directory has a SOURCES.md naming rejected sources.

    Each directory directly under ``<ROOT>/task_data`` must contain a
    ``SOURCES.md`` whose text includes the phrase ``Rejected Sources``.
    """
    for task_dir in sorted((ROOT / "task_data").glob("*")):
        # glob("*") also returns plain files (README, .DS_Store, ...); those
        # are not task directories and can never contain a SOURCES.md.
        if not task_dir.is_dir():
            continue

        sources_path = task_dir / "SOURCES.md"
        assert sources_path.exists(), "{} is missing".format(
            sources_path.relative_to(ROOT)
        )

        # Read as UTF-8 explicitly so the check does not depend on the locale.
        content = sources_path.read_text(encoding="utf-8")
        assert "Rejected Sources" in content, (
            "{} does not mention 'Rejected Sources'".format(
                sources_path.relative_to(ROOT)
            )
        )
def test_ground_truth_citations_are_documented_in_sources_files():
    """Check that each ground-truth citation is referenced in its SOURCES.md.

    For every ground-truth file, the ``SOURCES.md`` in the same directory is
    read and lowercased. Every citation yielded through
    ``_iter_ground_truth_citation_blocks`` must produce at least one reference
    token (from its DOI or canonical URL), and at least one of those tokens
    must occur in the lowercased SOURCES.md text.
    """
    for path in GROUND_TRUTH_FILES:
        sources_path = path.parent / "SOURCES.md"
        # Report a missing file as a named assertion failure rather than a
        # bare FileNotFoundError from read_text().
        assert sources_path.exists(), "{} is missing".format(
            sources_path.relative_to(ROOT)
        )
        # Decode as UTF-8 explicitly so matching does not depend on the locale.
        sources_text = sources_path.read_text(encoding="utf-8").lower()

        payload = _load_json(path)
        for block in _iter_ground_truth_citation_blocks(payload):
            for citation in block.get("citations", []):
                tokens = _source_reference_tokens(citation)
                assert tokens, "{} citation needs a DOI or canonical URL".format(
                    citation.get("title", "<untitled>")
                )
                assert any(token in sources_text for token in tokens), (
                    "{} citation is not documented in {}".format(
                        citation.get("title", "<untitled>"),
                        sources_path.relative_to(ROOT),
                    )
                )