diff --git a/.github/workflows/extractRQs.yml b/.github/workflows/extractRQs.yml
index d0f4bb6..b5dc91b 100644
--- a/.github/workflows/extractRQs.yml
+++ b/.github/workflows/extractRQs.yml
@@ -20,6 +20,8 @@ jobs:
run: pip install rdflib
- name: Extract
run: python scripts/transformDotTtlToDotSparql.py
+ - name: Lint headers
+ run: python scripts/lint_headers.py
- name: Commit new .rq files
run: |
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5717ef9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.planning/
diff --git a/A. Metadata/authors.rq b/A. Metadata/authors.rq
index 38a57c8..1d73858 100644
--- a/A. Metadata/authors.rq
+++ b/A. Metadata/authors.rq
@@ -1,3 +1,8 @@
+# title: Authors of All Pathways
+# category: Metadata
+# description: Lists all pathway authors with their name, homepage, and ORCID,
+# along with the number of pathways each author created.
+
PREFIX dc:
PREFIX foaf:
diff --git a/A. Metadata/datacounts/averageDatanodes.rq b/A. Metadata/datacounts/averageDatanodes.rq
index 88f62b2..37a263b 100644
--- a/A. Metadata/datacounts/averageDatanodes.rq
+++ b/A. Metadata/datacounts/averageDatanodes.rq
@@ -1,3 +1,8 @@
+# title: Average Data Nodes per Pathway
+# category: Metadata
+# description: Calculates the average, minimum, and maximum number of data nodes per
+# pathway in WikiPathways.
+
SELECT (AVG(?no) AS ?avg)
(MIN(?no) AS ?min)
(MAX(?no) AS ?max)
diff --git a/A. Metadata/datacounts/averageGeneProducts.rq b/A. Metadata/datacounts/averageGeneProducts.rq
index 4b04573..c462696 100644
--- a/A. Metadata/datacounts/averageGeneProducts.rq
+++ b/A. Metadata/datacounts/averageGeneProducts.rq
@@ -1,3 +1,8 @@
+# title: Average Gene Products per Pathway
+# category: Metadata
+# description: Calculates the average, minimum, and maximum number of gene products
+# per pathway in WikiPathways.
+
SELECT (AVG(?no) AS ?avg)
(MIN(?no) AS ?min)
(MAX(?no) AS ?max)
diff --git a/A. Metadata/datacounts/averageInteractions.rq b/A. Metadata/datacounts/averageInteractions.rq
index 11e4d75..0451d1f 100644
--- a/A. Metadata/datacounts/averageInteractions.rq
+++ b/A. Metadata/datacounts/averageInteractions.rq
@@ -1,3 +1,8 @@
+# title: Average Interactions per Pathway
+# category: Metadata
+# description: Calculates the average, minimum, and maximum number of interactions
+# per pathway in WikiPathways.
+
SELECT (AVG(?no) AS ?avg)
(MIN(?no) AS ?min)
(MAX(?no) AS ?max)
diff --git a/A. Metadata/datacounts/averageMetabolites.rq b/A. Metadata/datacounts/averageMetabolites.rq
index 5936678..8ae4ac4 100644
--- a/A. Metadata/datacounts/averageMetabolites.rq
+++ b/A. Metadata/datacounts/averageMetabolites.rq
@@ -1,3 +1,8 @@
+# title: Average Metabolites per Pathway
+# category: Metadata
+# description: Calculates the average, minimum, and maximum number of metabolites per
+# pathway in WikiPathways.
+
SELECT (AVG(?no) AS ?avg)
(MIN(?no) AS ?min)
(MAX(?no) AS ?max)
diff --git a/A. Metadata/datacounts/averageProteins.rq b/A. Metadata/datacounts/averageProteins.rq
index 7dd1832..c054598 100644
--- a/A. Metadata/datacounts/averageProteins.rq
+++ b/A. Metadata/datacounts/averageProteins.rq
@@ -1,3 +1,8 @@
+# title: Average Proteins per Pathway
+# category: Metadata
+# description: Calculates the average, minimum, and maximum number of proteins per
+# pathway in WikiPathways.
+
SELECT (AVG(?no) AS ?avg)
(MIN(?no) AS ?min)
(MAX(?no) AS ?max)
diff --git a/A. Metadata/datacounts/countDataNodes.rq b/A. Metadata/datacounts/countDataNodes.rq
index 39776f5..dc89f4b 100644
--- a/A. Metadata/datacounts/countDataNodes.rq
+++ b/A. Metadata/datacounts/countDataNodes.rq
@@ -1,3 +1,7 @@
+# title: Count of Data Nodes
+# category: Metadata
+# description: Counts the total number of data nodes in WikiPathways.
+
SELECT DISTINCT count(?DataNodes) as ?DataNodeCount
WHERE {
?DataNodes a wp:DataNode .
diff --git a/A. Metadata/datacounts/countGeneProducts.rq b/A. Metadata/datacounts/countGeneProducts.rq
index d801061..bb70fe9 100644
--- a/A. Metadata/datacounts/countGeneProducts.rq
+++ b/A. Metadata/datacounts/countGeneProducts.rq
@@ -1,3 +1,7 @@
+# title: Count of Gene Products
+# category: Metadata
+# description: Counts the total number of gene products in WikiPathways.
+
SELECT DISTINCT count(?geneProduct) as ?GeneProductCount
WHERE {
?geneProduct a wp:GeneProduct .
diff --git a/A. Metadata/datacounts/countInteractions.rq b/A. Metadata/datacounts/countInteractions.rq
index 6986d60..6c44bd3 100644
--- a/A. Metadata/datacounts/countInteractions.rq
+++ b/A. Metadata/datacounts/countInteractions.rq
@@ -1,3 +1,7 @@
+# title: Count of Interactions
+# category: Metadata
+# description: Counts the total number of interactions in WikiPathways.
+
SELECT DISTINCT count(?Interaction) as ?InteractionCount
WHERE {
?Interaction a wp:Interaction .
diff --git a/A. Metadata/datacounts/countMetabolites.rq b/A. Metadata/datacounts/countMetabolites.rq
index fe74f13..c20fc83 100644
--- a/A. Metadata/datacounts/countMetabolites.rq
+++ b/A. Metadata/datacounts/countMetabolites.rq
@@ -1,3 +1,7 @@
+# title: Count of Metabolites
+# category: Metadata
+# description: Counts the total number of metabolites in WikiPathways.
+
SELECT DISTINCT count(?Metabolite) as ?MetaboliteCount
WHERE {
?Metabolite a wp:Metabolite .
diff --git a/A. Metadata/datacounts/countPathways.rq b/A. Metadata/datacounts/countPathways.rq
index 28d1bf3..a2e36ec 100644
--- a/A. Metadata/datacounts/countPathways.rq
+++ b/A. Metadata/datacounts/countPathways.rq
@@ -1,3 +1,7 @@
+# title: Count of Pathways
+# category: Metadata
+# description: Counts the total number of pathways in WikiPathways.
+
SELECT DISTINCT count(?Pathway) as ?PathwayCount
WHERE {
?Pathway a wp:Pathway, skos:Collection .
diff --git a/A. Metadata/datacounts/countProteins.rq b/A. Metadata/datacounts/countProteins.rq
index 758277f..fa7d13d 100644
--- a/A. Metadata/datacounts/countProteins.rq
+++ b/A. Metadata/datacounts/countProteins.rq
@@ -1,3 +1,7 @@
+# title: Count of Proteins
+# category: Metadata
+# description: Counts the total number of proteins in WikiPathways.
+
SELECT DISTINCT count(?protein) as ?ProteinCount
WHERE {
?protein a wp:Protein .
diff --git a/A. Metadata/datacounts/countSignalingPathways.rq b/A. Metadata/datacounts/countSignalingPathways.rq
index b81151d..a917c51 100644
--- a/A. Metadata/datacounts/countSignalingPathways.rq
+++ b/A. Metadata/datacounts/countSignalingPathways.rq
@@ -1,3 +1,8 @@
+# title: Count of Signaling Pathways
+# category: Metadata
+# description: Counts the total number of signaling pathways in WikiPathways by
+# filtering on the signaling pathway ontology tag.
+
SELECT count(distinct ?pathway) as ?pathwaycount
WHERE {
?tag1 a owl:Class ;
diff --git a/A. Metadata/datacounts/linkoutCounts.rq b/A. Metadata/datacounts/linkoutCounts.rq
index dc0efcf..2734e2d 100644
--- a/A. Metadata/datacounts/linkoutCounts.rq
+++ b/A. Metadata/datacounts/linkoutCounts.rq
@@ -1,3 +1,8 @@
+# title: External Linkout Counts
+# category: Metadata
+# description: Counts the number of distinct entities linked to each external database
+# (ChEBI, ChemSpider, HMDB, PubChem, Ensembl, NCBI Gene, HGNC, Rhea, UniProt).
+
SELECT ?pred (COUNT(DISTINCT ?entity) AS ?count) WHERE {
VALUES ?pred {
# metabolites
diff --git a/A. Metadata/datasources/WPforChemSpider.rq b/A. Metadata/datasources/WPforChemSpider.rq
index c96ceac..5869cfc 100644
--- a/A. Metadata/datasources/WPforChemSpider.rq
+++ b/A. Metadata/datasources/WPforChemSpider.rq
@@ -1,4 +1,7 @@
-#List of WikiPathways for ChemSpider identifiers
+# title: WikiPathways for ChemSpider Identifiers
+# category: Data Sources
+# description: Lists pathways containing metabolites with ChemSpider identifiers,
+# showing the pathway title and extracted ChemSpider ID.
select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (fn:substring(?csId,36) as ?chemspider) where {
?gene a wp:Metabolite ;
diff --git a/A. Metadata/datasources/WPforEnsembl.rq b/A. Metadata/datasources/WPforEnsembl.rq
index 721f26d..9a303c5 100644
--- a/A. Metadata/datasources/WPforEnsembl.rq
+++ b/A. Metadata/datasources/WPforEnsembl.rq
@@ -1,11 +1,14 @@
-#List of WikiPathways for Ensembl identifiers
-
-select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (fn:substring(?ensId,32) as ?ensembl) where {
- ?gene a wp:GeneProduct ;
- dcterms:identifier ?id ;
- dcterms:isPartOf ?pathwayRes ;
- wp:bdbEnsembl ?ensId .
- ?pathwayRes a wp:Pathway ;
- dcterms:identifier ?wpid ;
- dc:title ?title .
-}
+# title: WikiPathways for Ensembl Identifiers
+# category: Data Sources
+# description: Lists pathways containing gene products with Ensembl identifiers,
+# showing the pathway title and extracted Ensembl ID.
+
+select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (fn:substring(?ensId,32) as ?ensembl) where {
+ ?gene a wp:GeneProduct ;
+ dcterms:identifier ?id ;
+ dcterms:isPartOf ?pathwayRes ;
+ wp:bdbEnsembl ?ensId .
+ ?pathwayRes a wp:Pathway ;
+ dcterms:identifier ?wpid ;
+ dc:title ?title .
+}
diff --git a/A. Metadata/datasources/WPforHGNC.rq b/A. Metadata/datasources/WPforHGNC.rq
index 6d2b66f..a0cd6d8 100644
--- a/A. Metadata/datasources/WPforHGNC.rq
+++ b/A. Metadata/datasources/WPforHGNC.rq
@@ -1,4 +1,7 @@
-#List of WikiPathways for HGNC symbols
+# title: WikiPathways for HGNC Symbols
+# category: Data Sources
+# description: Lists pathways containing gene products with HGNC symbol identifiers,
+# showing the pathway title and extracted HGNC symbol.
select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (fn:substring(?hgncId,37) as ?HGNC) where {
?gene a wp:GeneProduct ;
diff --git a/A. Metadata/datasources/WPforHMDB.rq b/A. Metadata/datasources/WPforHMDB.rq
index 800bf8f..0a1d18f 100644
--- a/A. Metadata/datasources/WPforHMDB.rq
+++ b/A. Metadata/datasources/WPforHMDB.rq
@@ -1,4 +1,7 @@
-#ist of WikiPathways for HMDB identifiers
+# title: WikiPathways for HMDB Identifiers
+# category: Data Sources
+# description: Lists pathways containing metabolites with HMDB identifiers, showing
+# the pathway title and extracted HMDB ID.
select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (fn:substring(?hmdbId,29) as ?hmdb) where {
?gene a wp:Metabolite ;
diff --git a/A. Metadata/datasources/WPforNCBI.rq b/A. Metadata/datasources/WPforNCBI.rq
index 66a49f2..04dbec4 100644
--- a/A. Metadata/datasources/WPforNCBI.rq
+++ b/A. Metadata/datasources/WPforNCBI.rq
@@ -1,4 +1,7 @@
-#List of WikiPathways for NCBI Gene identifiers
+# title: WikiPathways for NCBI Gene Identifiers
+# category: Data Sources
+# description: Lists pathways containing gene products with NCBI Gene identifiers,
+# showing the pathway title and extracted NCBI Gene ID.
select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (fn:substring(?ncbiGeneId,33) as ?NCBIGene) where {
?gene a wp:GeneProduct ;
diff --git a/A. Metadata/datasources/WPforPubChemCID.rq b/A. Metadata/datasources/WPforPubChemCID.rq
index f4055fc..138b660 100644
--- a/A. Metadata/datasources/WPforPubChemCID.rq
+++ b/A. Metadata/datasources/WPforPubChemCID.rq
@@ -1,11 +1,14 @@
-#List of WikiPathways for PubChem CID identifiers
-
-select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (fn:substring(?cid,46) as ?PubChem) where {
- ?gene a wp:Metabolite ;
- dcterms:identifier ?id ;
- dcterms:isPartOf ?pathwayRes ;
- wp:bdbPubChem ?cid .
- ?pathwayRes a wp:Pathway ;
- dcterms:identifier ?wpid ;
- dc:title ?title .
-}
\ No newline at end of file
+# title: WikiPathways for PubChem CID Identifiers
+# category: Data Sources
+# description: Lists pathways containing metabolites with PubChem compound identifiers,
+# showing the pathway title and extracted PubChem CID.
+
+select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (fn:substring(?cid,46) as ?PubChem) where {
+ ?gene a wp:Metabolite ;
+ dcterms:identifier ?id ;
+ dcterms:isPartOf ?pathwayRes ;
+ wp:bdbPubChem ?cid .
+ ?pathwayRes a wp:Pathway ;
+ dcterms:identifier ?wpid ;
+ dc:title ?title .
+}
diff --git a/A. Metadata/linksets.rq b/A. Metadata/linksets.rq
index 6d5d7a0..e5bca9f 100644
--- a/A. Metadata/linksets.rq
+++ b/A. Metadata/linksets.rq
@@ -1,3 +1,8 @@
+# title: Linksets Overview
+# category: Metadata
+# description: Returns all VoID linksets in the WikiPathways RDF with their title,
+# creation date, and license information.
+
SELECT DISTINCT ?dataset (str(?titleLit) as ?title) ?date ?license
WHERE {
?dataset a void:Linkset ;
diff --git a/A. Metadata/metadata.rq b/A. Metadata/metadata.rq
index 9c55307..713f667 100644
--- a/A. Metadata/metadata.rq
+++ b/A. Metadata/metadata.rq
@@ -1,7 +1,12 @@
+# title: Dataset Metadata
+# category: Metadata
+# description: Returns all VoID datasets in the WikiPathways RDF with their title,
+# creation date, and license information.
+
SELECT DISTINCT ?dataset (str(?titleLit) as ?title) ?date ?license
WHERE {
?dataset a void:Dataset ;
dcterms:title ?titleLit ;
dcterms:license ?license ;
pav:createdOn ?date .
-}
\ No newline at end of file
+}
diff --git a/A. Metadata/prefixes.rq b/A. Metadata/prefixes.rq
index e652d55..5ff94e0 100644
--- a/A. Metadata/prefixes.rq
+++ b/A. Metadata/prefixes.rq
@@ -1,3 +1,8 @@
+# title: SPARQL Prefixes
+# category: Metadata
+# description: Lists all namespace prefixes declared in the WikiPathways SPARQL
+# endpoint via SHACL prefix declarations.
+
PREFIX sh:
PREFIX xsd:
@@ -6,4 +11,4 @@ SELECT ?prefix ?namespace WHERE {
sh:prefix ?prefix ;
sh:namespace ?namespace
] .
-}
\ No newline at end of file
+}
diff --git a/A. Metadata/species/PWsforSpecies.rq b/A. Metadata/species/PWsforSpecies.rq
index 113ef8f..ddbeb0f 100644
--- a/A. Metadata/species/PWsforSpecies.rq
+++ b/A. Metadata/species/PWsforSpecies.rq
@@ -1,8 +1,14 @@
+# title: Pathways for a Species
+# category: Metadata
+# description: Lists all pathways for a given species, returning the WikiPathways
+# identifier and page URL. Default species is Mus musculus.
+# param: species | string | Mus musculus | Species
+
SELECT DISTINCT ?wpIdentifier ?pathway ?page
WHERE {
?pathway dc:title ?title .
?pathway foaf:page ?page .
?pathway dc:identifier ?wpIdentifier .
- ?pathway wp:organismName "Mus musculus" . #Replace "Mus musculus" with other species: "Homo sapiens", "Rattus norvegicus", "Danio rerio"
+ ?pathway wp:organismName "{{species}}" .
}
ORDER BY ?wpIdentifier
diff --git a/A. Metadata/species/countDataNodePerSpecies.rq b/A. Metadata/species/countDataNodePerSpecies.rq
index 97aea3f..6b0b896 100644
--- a/A. Metadata/species/countDataNodePerSpecies.rq
+++ b/A. Metadata/species/countDataNodePerSpecies.rq
@@ -1,3 +1,8 @@
+# title: Data Nodes per Species
+# category: Metadata
+# description: Counts the number of distinct data nodes per species in WikiPathways,
+# ordered by count descending.
+
select (count(distinct ?datanode) as ?count) (str(?label) as ?species) where {
?datanode a wp:DataNode ;
dcterms:isPartOf ?pw .
diff --git a/A. Metadata/species/countGeneProductsPerSpecies.rq b/A. Metadata/species/countGeneProductsPerSpecies.rq
index 33fe557..6aa6f4c 100644
--- a/A. Metadata/species/countGeneProductsPerSpecies.rq
+++ b/A. Metadata/species/countGeneProductsPerSpecies.rq
@@ -1,3 +1,8 @@
+# title: Gene Products per Species
+# category: Metadata
+# description: Counts the number of distinct gene products per species in WikiPathways,
+# ordered by count descending.
+
select (count(distinct ?gene) as ?count) (str(?label) as ?species) where {
?gene a wp:GeneProduct ;
dcterms:isPartOf ?pw .
diff --git a/A. Metadata/species/countMetabolitesPerSpecies.rq b/A. Metadata/species/countMetabolitesPerSpecies.rq
index 3897da6..4ed0c82 100644
--- a/A. Metadata/species/countMetabolitesPerSpecies.rq
+++ b/A. Metadata/species/countMetabolitesPerSpecies.rq
@@ -1,3 +1,8 @@
+# title: Metabolites per Species
+# category: Metadata
+# description: Counts the number of distinct metabolites per species in WikiPathways,
+# ordered by count descending.
+
select (str(?label) as ?species) (count(distinct ?metabolite) as ?count) where {
?metabolite a wp:Metabolite ;
dcterms:isPartOf ?pw .
diff --git a/A. Metadata/species/countPathwaysPerSpecies.rq b/A. Metadata/species/countPathwaysPerSpecies.rq
index 7300184..f495d1b 100644
--- a/A. Metadata/species/countPathwaysPerSpecies.rq
+++ b/A. Metadata/species/countPathwaysPerSpecies.rq
@@ -1,3 +1,8 @@
+# title: Pathways per Species
+# category: Metadata
+# description: Counts the number of pathways per species in WikiPathways, returning
+# the species name, organism URI, and pathway count.
+
SELECT DISTINCT (str(?label) as ?name) ?organism (count(?pw) as ?pathwayCount)
WHERE {
?pw dc:title ?title ;
diff --git a/A. Metadata/species/countProteinsPerSpecies.rq b/A. Metadata/species/countProteinsPerSpecies.rq
index 11b912a..9fc7bf0 100644
--- a/A. Metadata/species/countProteinsPerSpecies.rq
+++ b/A. Metadata/species/countProteinsPerSpecies.rq
@@ -1,3 +1,8 @@
+# title: Proteins per Species
+# category: Metadata
+# description: Counts the number of distinct proteins per species in WikiPathways,
+# ordered by count descending.
+
select (count(distinct ?protein) as ?count) (str(?label) as ?species) where {
?protein a wp:Protein ;
dcterms:isPartOf ?pw .
diff --git a/B. Communities/AOP/allPathways.rq b/B. Communities/AOP/allPathways.rq
index e9d9a35..175b357 100644
--- a/B. Communities/AOP/allPathways.rq
+++ b/B. Communities/AOP/allPathways.rq
@@ -1,3 +1,7 @@
+# title: AOP Community Pathways
+# category: Communities
+# description: Lists all pathways tagged with the AOP community curation tag.
+
PREFIX wp:
PREFIX dc:
PREFIX cur:
@@ -7,4 +11,4 @@ WHERE {
?pathway wp:ontologyTag cur:AOP ;
a wp:Pathway ;
dc:title ?title .
-}
\ No newline at end of file
+}
diff --git a/B. Communities/AOP/allProteins.rq b/B. Communities/AOP/allProteins.rq
index 2cc9987..6e02f2f 100644
--- a/B. Communities/AOP/allProteins.rq
+++ b/B. Communities/AOP/allProteins.rq
@@ -1,3 +1,7 @@
+# title: AOP Community Proteins
+# category: Communities
+# description: Lists all proteins found in AOP community pathways.
+
SELECT DISTINCT ?pathway (str(?label) as ?Protein)
WHERE {
?pathway wp:ontologyTag cur:AOP ;
diff --git a/B. Communities/CIRM Stem Cell Pathways/allPathways.rq b/B. Communities/CIRM Stem Cell Pathways/allPathways.rq
index 8f9752c..cfecf50 100644
--- a/B. Communities/CIRM Stem Cell Pathways/allPathways.rq
+++ b/B. Communities/CIRM Stem Cell Pathways/allPathways.rq
@@ -1,3 +1,7 @@
+# title: CIRM Stem Cell Pathways
+# category: Communities
+# description: Lists all pathways tagged with the CIRM Stem Cell community curation tag.
+
SELECT DISTINCT ?pathway (str(?title) as ?PathwayTitle)
WHERE {
?pathway wp:ontologyTag cur:CIRM_Related ;
diff --git a/B. Communities/CIRM Stem Cell Pathways/allProteins.rq b/B. Communities/CIRM Stem Cell Pathways/allProteins.rq
index 367a6c7..16e021b 100644
--- a/B. Communities/CIRM Stem Cell Pathways/allProteins.rq
+++ b/B. Communities/CIRM Stem Cell Pathways/allProteins.rq
@@ -1,3 +1,7 @@
+# title: CIRM Stem Cell Proteins
+# category: Communities
+# description: Lists all proteins found in CIRM Stem Cell community pathways.
+
SELECT DISTINCT ?pathway (str(?label) as ?Protein)
WHERE {
?pathway wp:ontologyTag cur:CIRM_Related ;
diff --git a/B. Communities/COVID19/allPathways.rq b/B. Communities/COVID19/allPathways.rq
index 5088812..9dc1e50 100644
--- a/B. Communities/COVID19/allPathways.rq
+++ b/B. Communities/COVID19/allPathways.rq
@@ -1,3 +1,7 @@
+# title: COVID-19 Community Pathways
+# category: Communities
+# description: Lists all pathways tagged with the COVID-19 community curation tag.
+
SELECT DISTINCT ?pathway (str(?title) as ?PathwayTitle)
WHERE {
?pathway wp:ontologyTag cur:COVID19 ;
diff --git a/B. Communities/COVID19/allProteins.rq b/B. Communities/COVID19/allProteins.rq
index e576ae1..bddb677 100644
--- a/B. Communities/COVID19/allProteins.rq
+++ b/B. Communities/COVID19/allProteins.rq
@@ -1,3 +1,7 @@
+# title: COVID-19 Community Proteins
+# category: Communities
+# description: Lists all proteins found in COVID-19 community pathways.
+
SELECT DISTINCT ?pathway (str(?label) as ?Protein)
WHERE {
?pathway wp:ontologyTag cur:COVID19 ;
diff --git a/B. Communities/Inborn Errors of Metabolism/allMetabolicPWs.rq b/B. Communities/Inborn Errors of Metabolism/allMetabolicPWs.rq
index ea5dd27..69e40db 100644
--- a/B. Communities/Inborn Errors of Metabolism/allMetabolicPWs.rq
+++ b/B. Communities/Inborn Errors of Metabolism/allMetabolicPWs.rq
@@ -1,3 +1,8 @@
+# title: Inborn Errors of Metabolism Metabolic Pathways
+# category: Communities
+# description: Retrieves pathways classified under metabolic pathway ontology terms,
+# filtering by label to find metabolic pathway annotations.
+
SELECT distinct ?pathway ?label ?tag
WHERE {
?tag1 a owl:Class ;
diff --git a/B. Communities/Inborn Errors of Metabolism/allPathways.rq b/B. Communities/Inborn Errors of Metabolism/allPathways.rq
index 0dc3ac8..fc60d19 100644
--- a/B. Communities/Inborn Errors of Metabolism/allPathways.rq
+++ b/B. Communities/Inborn Errors of Metabolism/allPathways.rq
@@ -1,3 +1,7 @@
+# title: Inborn Errors of Metabolism Pathways
+# category: Communities
+# description: Lists all pathways tagged with the Inborn Errors of Metabolism (IEM) community curation tag.
+
SELECT DISTINCT ?pathway (str(?title) as ?PathwayTitle)
WHERE {
?pathway wp:ontologyTag cur:IEM ;
diff --git a/B. Communities/Inborn Errors of Metabolism/allProteins.rq b/B. Communities/Inborn Errors of Metabolism/allProteins.rq
index f5f0bb2..0fdfdb8 100644
--- a/B. Communities/Inborn Errors of Metabolism/allProteins.rq
+++ b/B. Communities/Inborn Errors of Metabolism/allProteins.rq
@@ -1,3 +1,7 @@
+# title: Inborn Errors of Metabolism Proteins
+# category: Communities
+# description: Lists all proteins found in Inborn Errors of Metabolism (IEM) community pathways.
+
SELECT DISTINCT ?pathway (str(?label) as ?Protein)
WHERE {
?pathway wp:ontologyTag cur:IEM ;
diff --git a/B. Communities/Inborn Errors of Metabolism/countMetabolicPWs.rq b/B. Communities/Inborn Errors of Metabolism/countMetabolicPWs.rq
index 03f8ffe..b1c7317 100644
--- a/B. Communities/Inborn Errors of Metabolism/countMetabolicPWs.rq
+++ b/B. Communities/Inborn Errors of Metabolism/countMetabolicPWs.rq
@@ -1,3 +1,8 @@
+# title: Count of IEM Metabolic Pathways
+# category: Communities
+# description: Counts the total number of pathways classified under metabolic pathway
+# ontology terms.
+
SELECT count(distinct ?pathway) as ?pathwaycount
WHERE {
?tag1 a owl:Class ;
diff --git a/B. Communities/Inborn Errors of Metabolism/countProteinsMetabolitesRheaDiseases.rq b/B. Communities/Inborn Errors of Metabolism/countProteinsMetabolitesRheaDiseases.rq
index e2b25f1..fa1ee61 100644
--- a/B. Communities/Inborn Errors of Metabolism/countProteinsMetabolitesRheaDiseases.rq
+++ b/B. Communities/Inborn Errors of Metabolism/countProteinsMetabolitesRheaDiseases.rq
@@ -1,4 +1,9 @@
-#Prefixes required which might not be available in the SPARQL endpoint by default
+# title: IEM Proteins, Metabolites, Rhea, and Diseases
+# category: Communities
+# description: Summarizes IEM community pathways with counts of proteins, metabolites,
+# Rhea reaction annotations, missing Rhea IDs, and linked OMIM disease identifiers
+# per pathway.
+
PREFIX wp:
PREFIX rdfs:
PREFIX dcterms:
diff --git a/B. Communities/Lipids/LIPIDMAPS_Federated.rq b/B. Communities/Lipids/LIPIDMAPS_Federated.rq
index d993ac5..8522a60 100644
--- a/B. Communities/Lipids/LIPIDMAPS_Federated.rq
+++ b/B. Communities/Lipids/LIPIDMAPS_Federated.rq
@@ -1,4 +1,9 @@
-#Pathways describing the biology of oxygenated hydrocarbons (LMFA12)
+# title: LIPID MAPS Federated Query
+# category: Communities
+# description: Retrieves lipid names, formulas, and associated pathways for a specific
+# LIPID MAPS category by querying the LIPID MAPS SPARQL endpoint. May be slower due to
+# external endpoint dependency.
+
PREFIX chebi:
SELECT ?lipid ?name ?formula ?lmid (GROUP_CONCAT(?wpid_;separator=", ") AS ?pathway)
diff --git a/B. Communities/Lipids/LipidClassesTotal.rq b/B. Communities/Lipids/LipidClassesTotal.rq
index e195239..33530b1 100644
--- a/B. Communities/Lipids/LipidClassesTotal.rq
+++ b/B. Communities/Lipids/LipidClassesTotal.rq
@@ -1,11 +1,19 @@
+# title: Total Lipid Classes
+# category: Communities
+# description: Counts the number of individual lipids in a specific LIPID MAPS subclass
+# across pathways of the selected species. Use the lipidClass parameter to query a
+# different subclass (FA, GL, GP, SP, ST, PR, SL, PK).
+# param: species | string | Homo sapiens | Species
+# param: lipidClass | enum:FA,GL,GP,SP,ST,PR,SL,PK | FA | LIPID MAPS Class
+
SELECT count(DISTINCT ?lipidID) as ?IndividualLipidsPerClass
WHERE { ?metabolite a wp:Metabolite ;
dcterms:identifier ?id ;
dcterms:isPartOf ?pathwayRes ;
wp:bdbLipidMaps ?lipidID . #Metabolite DataNodes need to have a LIPID MAPS ID, for this query to count correctly (some lipids might be missed due to missing Xrefs)
?pathwayRes a wp:Pathway ;
- wp:organismName "Homo sapiens"; #Filter for a species (ommit when querying all pathways available for all species)
+ wp:organismName "{{species}}"; #Filter for a species (ommit when querying all pathways available for all species)
dcterms:identifier ?wpid ;
dc:title ?title .
- FILTER regex(str(?lipidID), "FA" ). #Filter for a LIPID MAPS ID subclass: 'FA' Fatty Acids ; 'GL' Glycerolipid ; 'GP' Glycerophospholipid ; 'SP' Sphingolipids ; 'ST' Sterol lipids ; 'PR' Prenol Lipids ; 'SL' Saccharolipids ; 'PK' Polyketides
+ FILTER regex(str(?lipidID), "{{lipidClass}}" ). #Filter for a LIPID MAPS ID subclass: 'FA' Fatty Acids ; 'GL' Glycerolipid ; 'GP' Glycerophospholipid ; 'SP' Sphingolipids ; 'ST' Sterol lipids ; 'PR' Prenol Lipids ; 'SL' Saccharolipids ; 'PK' Polyketides
}
diff --git a/B. Communities/Lipids/LipidsClassesCountPerPathway.rq b/B. Communities/Lipids/LipidsClassesCountPerPathway.rq
index 63601ac..b249fa7 100644
--- a/B. Communities/Lipids/LipidsClassesCountPerPathway.rq
+++ b/B. Communities/Lipids/LipidsClassesCountPerPathway.rq
@@ -1,13 +1,20 @@
+# title: Lipid Classes Count per Pathway
+# category: Communities
+# description: Counts the number of lipids in a specific LIPID MAPS subclass per pathway
+# for the selected species, ordered by count. Set lipidClass to query a different subclass.
+# param: species | string | Homo sapiens | Species
+# param: lipidClass | enum:FA,GL,GP,SP,ST,PR,SL,PK | FA | LIPID MAPS Class
+
SELECT DISTINCT ?pathwayRes (str(?wpid) AS ?pathway) (str(?title) AS ?pathwayTitle) (count(DISTINCT ?lipidID) AS ?Class_LipidsInPWs)
WHERE { ?metabolite a wp:Metabolite ;
dcterms:identifier ?id ;
dcterms:isPartOf ?pathwayRes ;
wp:bdbLipidMaps ?lipidID . #Metabolite DataNodes need to have a LIPID MAPS ID, for this query to count correctly (some lipids might be missed due to missing Xrefs)
?pathwayRes a wp:Pathway ;
- wp:organismName "Homo sapiens" ; #Filter for a species (ommit when querying all pathways available for all species)
+ wp:organismName "{{species}}" ; #Filter for a species (ommit when querying all pathways available for all species)
dcterms:identifier ?wpid ;
dc:title ?title .
- FILTER regex(str(?lipidID), "FA" ). #Filter for a LIPID MAPS ID subclass: 'FA' Fatty Acids ; 'GL' Glycerolipid ; 'GP' Glycerophospholipid ; 'SP' Sphingolipids ; 'ST' Sterol lipids ; 'PR' Prenol Lipids ; 'SL' Saccharolipids ; 'PK' Polyketides
+ FILTER regex(str(?lipidID), "{{lipidClass}}" ). #Filter for a LIPID MAPS ID subclass: 'FA' Fatty Acids ; 'GL' Glycerolipid ; 'GP' Glycerophospholipid ; 'SP' Sphingolipids ; 'ST' Sterol lipids ; 'PR' Prenol Lipids ; 'SL' Saccharolipids ; 'PK' Polyketides
}
ORDER BY DESC(?Class_LipidsInPWs)
diff --git a/B. Communities/Lipids/LipidsCountPerPathway.rq b/B. Communities/Lipids/LipidsCountPerPathway.rq
index 75f5406..5ea0466 100644
--- a/B. Communities/Lipids/LipidsCountPerPathway.rq
+++ b/B. Communities/Lipids/LipidsCountPerPathway.rq
@@ -1,3 +1,9 @@
+# title: Lipids Count per Pathway
+# category: Communities
+# description: Counts the total number of lipids with LIPID MAPS identifiers per pathway
+# for the selected species, ordered by count.
+# param: species | string | Homo sapiens | Species
+
prefix lipidmaps: #IRI can be used to create URLs from identifiers in line 7
select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (count(distinct ?lipidID) AS ?LipidsInPWs)
where {
@@ -6,7 +12,7 @@ where {
dcterms:isPartOf ?pathwayRes ; #Define metabolites are part of a pathway
wp:bdbLipidMaps ?lipidID . #Find the LIPID MAPS identifier for a certain metabolite
?pathwayRes a wp:Pathway ; #Define what is a pathway
- wp:organismName "Homo sapiens" ; #Filter pathways on species Human
+ wp:organismName "{{species}}" ; #Filter pathways on species Human
dcterms:identifier ?wpid ; #Obtain identifier of pathway
dc:title ?title . #Obtain title of pathway
}
diff --git a/B. Communities/Lipids/allPathways.rq b/B. Communities/Lipids/allPathways.rq
index 8db0ced..9c9042d 100644
--- a/B. Communities/Lipids/allPathways.rq
+++ b/B. Communities/Lipids/allPathways.rq
@@ -1,3 +1,7 @@
+# title: Lipids Community Pathways
+# category: Communities
+# description: Lists all pathways tagged with the Lipids community curation tag.
+
SELECT DISTINCT ?pathway (str(?title) as ?PathwayTitle)
WHERE {
?pathway wp:ontologyTag cur:Lipids ;
diff --git a/B. Communities/Lipids/allProteins.rq b/B. Communities/Lipids/allProteins.rq
index 0d68bbd..dee48f6 100644
--- a/B. Communities/Lipids/allProteins.rq
+++ b/B. Communities/Lipids/allProteins.rq
@@ -1,3 +1,7 @@
+# title: Lipids Community Proteins
+# category: Communities
+# description: Lists all proteins found in Lipids community pathways.
+
SELECT DISTINCT ?pathway (str(?label) as ?Protein)
WHERE {
?pathway wp:ontologyTag cur:Lipids ;
diff --git a/B. Communities/RareDiseases/allPathways.rq b/B. Communities/RareDiseases/allPathways.rq
index d00228f..b2f2568 100644
--- a/B. Communities/RareDiseases/allPathways.rq
+++ b/B. Communities/RareDiseases/allPathways.rq
@@ -1,3 +1,7 @@
+# title: Rare Diseases Community Pathways
+# category: Communities
+# description: Lists all pathways tagged with the Rare Diseases community curation tag.
+
SELECT DISTINCT ?pathway (str(?title) as ?PathwayTitle)
WHERE {
?pathway wp:ontologyTag cur:RareDiseases ;
diff --git a/B. Communities/RareDiseases/allProteins.rq b/B. Communities/RareDiseases/allProteins.rq
index 7d15f83..5dfc8f2 100644
--- a/B. Communities/RareDiseases/allProteins.rq
+++ b/B. Communities/RareDiseases/allProteins.rq
@@ -1,3 +1,7 @@
+# title: Rare Diseases Community Proteins
+# category: Communities
+# description: Lists all proteins found in Rare Diseases community pathways.
+
SELECT DISTINCT ?pathway (str(?label) as ?Protein)
WHERE {
?pathway wp:ontologyTag cur:RareDiseases ;
diff --git a/B. Communities/Reactome/getPathways.rq b/B. Communities/Reactome/getPathways.rq
index be5611b..c8f7b19 100644
--- a/B. Communities/Reactome/getPathways.rq
+++ b/B. Communities/Reactome/getPathways.rq
@@ -1,3 +1,7 @@
+# title: Reactome Pathways
+# category: Communities
+# description: Lists all pathways tagged with the Reactome Approved curation tag.
+
SELECT DISTINCT ?pathway (str(?titleLit) as ?title)
WHERE {
?pathway wp:ontologyTag cur:Reactome_Approved ;
diff --git a/B. Communities/Reactome/refsReactomeAndWP.rq b/B. Communities/Reactome/refsReactomeAndWP.rq
index 6e2f146..e4a49a2 100644
--- a/B. Communities/Reactome/refsReactomeAndWP.rq
+++ b/B. Communities/Reactome/refsReactomeAndWP.rq
@@ -1,3 +1,8 @@
+# title: References in Both Reactome and WikiPathways
+# category: Communities
+# description: Counts publication references that appear in both Reactome-approved and
+# WikiPathways Analysis Collection pathways.
+
SELECT (COUNT(DISTINCT ?pubmed) AS ?count)
WHERE {
?pubmed a wp:PublicationReference .
diff --git a/B. Communities/Reactome/refsReactomeNotWP.rq b/B. Communities/Reactome/refsReactomeNotWP.rq
index 9ea9796..ea3b1f5 100644
--- a/B. Communities/Reactome/refsReactomeNotWP.rq
+++ b/B. Communities/Reactome/refsReactomeNotWP.rq
@@ -1,3 +1,8 @@
+# title: References in Reactome but Not WikiPathways
+# category: Communities
+# description: Counts publication references found in Reactome-approved pathways but not
+# description: in the WikiPathways Analysis Collection.
+
SELECT (COUNT(DISTINCT ?pubmed) AS ?count)
WHERE {
?pubmed a wp:PublicationReference .
diff --git a/B. Communities/Reactome/refsWPNotReactome.rq b/B. Communities/Reactome/refsWPNotReactome.rq
index 380e272..e59578a 100644
--- a/B. Communities/Reactome/refsWPNotReactome.rq
+++ b/B. Communities/Reactome/refsWPNotReactome.rq
@@ -1,3 +1,8 @@
+# title: References in WikiPathways but Not Reactome
+# category: Communities
+# description: Counts publication references found in the WikiPathways Analysis Collection
+# description: but not in Reactome-approved pathways.
+
SELECT (COUNT(DISTINCT ?pubmed) AS ?count)
WHERE {
?pubmed a wp:PublicationReference .
diff --git a/B. Communities/WormBase/allPathways.rq b/B. Communities/WormBase/allPathways.rq
index 36082c6..c9cb6f7 100644
--- a/B. Communities/WormBase/allPathways.rq
+++ b/B. Communities/WormBase/allPathways.rq
@@ -1,3 +1,7 @@
+# title: WormBase Community Pathways
+# category: Communities
+# description: Lists all pathways tagged with the WormBase Approved community curation tag.
+
SELECT DISTINCT ?pathway (str(?title) as ?PathwayTitle)
WHERE {
?pathway wp:ontologyTag cur:WormBase_Approved ;
diff --git a/B. Communities/WormBase/allProteins.rq b/B. Communities/WormBase/allProteins.rq
index 0239f7a..2384a06 100644
--- a/B. Communities/WormBase/allProteins.rq
+++ b/B. Communities/WormBase/allProteins.rq
@@ -1,3 +1,7 @@
+# title: WormBase Community Proteins
+# category: Communities
+# description: Lists all proteins found in WormBase Approved community pathways.
+
SELECT DISTINCT ?pathway (str(?label) as ?Protein)
WHERE {
?pathway wp:ontologyTag cur:WormBase_Approved ;
diff --git a/C. Collaborations/AOP-Wiki/MetaboliteInAOP-Wiki.rq b/C. Collaborations/AOP-Wiki/MetaboliteInAOP-Wiki.rq
index 8fe9714..fd77d30 100644
--- a/C. Collaborations/AOP-Wiki/MetaboliteInAOP-Wiki.rq
+++ b/C. Collaborations/AOP-Wiki/MetaboliteInAOP-Wiki.rq
@@ -1,10 +1,17 @@
-PREFIX aopo:
-PREFIX cheminf:
+# title: Metabolites in AOP-Wiki
+# category: Collaborations
+# description: Finds metabolites in pathways of the selected species that are linked to
+# description: stressors in AOP-Wiki by querying the AOP-Wiki SPARQL endpoint via ChEBI
+# description: identifiers. May be slower due to external endpoint dependency.
+# param: species | string | Homo sapiens | Species
-SELECT DISTINCT (str(?title) as ?pathwayName) ?chemical ?ChEBI ?ChemicalName ?mappedid ?LinkedStressor
+PREFIX aopo:
+PREFIX cheminf:
+
+SELECT DISTINCT (str(?title) as ?pathwayName) ?chemical ?ChEBI ?ChemicalName ?mappedid ?LinkedStressor
WHERE {
- ?pathway a wp:Pathway ; wp:organismName "Homo sapiens"; dcterms:identifier ?WPID ; dc:title ?title .
+ ?pathway a wp:Pathway ; wp:organismName "{{species}}"; dcterms:identifier ?WPID ; dc:title ?title .
?chemical a wp:Metabolite; dcterms:isPartOf ?pathway; wp:bdbChEBI ?mappedid .
SERVICE {
?mappedid a cheminf:000407; cheminf:000407 ?ChEBI .
diff --git a/C. Collaborations/MetaNetX/reactionID_mapping.rq b/C. Collaborations/MetaNetX/reactionID_mapping.rq
index a356ffa..019e648 100644
--- a/C. Collaborations/MetaNetX/reactionID_mapping.rq
+++ b/C. Collaborations/MetaNetX/reactionID_mapping.rq
@@ -1,4 +1,9 @@
-#Prefixes required which might not be available in the SPARQL endpoint by default
+# title: MetaNetX Reaction ID Mapping
+# category: Collaborations
+# description: Maps Rhea reaction IDs from a WikiPathways pathway to MetaNetX reaction
+# description: identifiers by querying the MetaNetX SPARQL endpoint. May be slower due to external
+# description: endpoint dependency.
+
PREFIX wp:
PREFIX rdfs:
PREFIX dcterms:
diff --git a/C. Collaborations/MolMeDB/ONEpubchem_MANYpathways.rq b/C. Collaborations/MolMeDB/ONEpubchem_MANYpathways.rq
index 13f6ae9..fbf7c49 100644
--- a/C. Collaborations/MolMeDB/ONEpubchem_MANYpathways.rq
+++ b/C. Collaborations/MolMeDB/ONEpubchem_MANYpathways.rq
@@ -1,3 +1,10 @@
+# title: Pathways for a PubChem Compound (MolMeDB)
+# category: Collaborations
+# description: Finds all pathways of the selected species containing a specific MolMeDB
+# description: compound by resolving its PubChem identifier through the MolMeDB SPARQL
+# description: endpoint. May be slower due to external endpoint dependency.
+# param: species | string | Homo sapiens | Species
+
SELECT DISTINCT ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) ((substr(str(?COMPOUND),46)) as ?PubChem) WHERE
{
SERVICE {
@@ -11,7 +18,7 @@ SELECT DISTINCT ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTit
wp:bdbPubChem ?COMPOUND .
?pathwayRes a wp:Pathway ;
- wp:organismName "Homo sapiens";
+ wp:organismName "{{species}}";
dcterms:identifier ?wpid ;
dc:title ?title .
}
diff --git a/C. Collaborations/MolMeDB/SUBSETpathways_ONEpubchem.rq b/C. Collaborations/MolMeDB/SUBSETpathways_ONEpubchem.rq
index 9f6e1fd..91b71c0 100644
--- a/C. Collaborations/MolMeDB/SUBSETpathways_ONEpubchem.rq
+++ b/C. Collaborations/MolMeDB/SUBSETpathways_ONEpubchem.rq
@@ -1,3 +1,10 @@
+# title: PubChem Compound in Pathway Subset (MolMeDB)
+# category: Collaborations
+# description: Checks a subset of pathways for the presence of a specific MolMeDB
+# description: compound by querying the MolMeDB SPARQL endpoint. Uses nested federation with both
+# description: MolMeDB and WikiPathways endpoints. May be slower due to external endpoint dependency.
+# param: species | string | Homo sapiens | Species
+
SELECT DISTINCT ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) ((substr(str(?COMPOUND),46)) as ?PubChem) WHERE {
SERVICE {
SERVICE {
@@ -9,7 +16,7 @@ SELECT DISTINCT ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTit
wp:bdbPubChem ?COMPOUND .
?pathwayRes a wp:Pathway ;
- wp:organismName "Homo sapiens" ;
+ wp:organismName "{{species}}" ;
dcterms:identifier ?wpid ;
dc:title ?title .
}
diff --git a/C. Collaborations/neXtProt/ProteinCellularLocation.rq b/C. Collaborations/neXtProt/ProteinCellularLocation.rq
index 85ec675..56eb06f 100644
--- a/C. Collaborations/neXtProt/ProteinCellularLocation.rq
+++ b/C. Collaborations/neXtProt/ProteinCellularLocation.rq
@@ -1,3 +1,9 @@
+# title: Protein Cellular Location via neXtProt
+# category: Collaborations
+# description: Retrieves subcellular locations for proteins in Rett syndrome pathways
+# description: by querying the neXtProt SPARQL endpoint for gold-quality cellular component
+# description: annotations. May be slower due to external endpoint dependency.
+
PREFIX :
select distinct ?pathwayname ?entry str(?gen) (group_concat(distinct str(?loclab); SEPARATOR = ",") as ?locations) where {
{?geneProduct a wp:Protein}
diff --git a/C. Collaborations/neXtProt/ProteinMitochondria.rq b/C. Collaborations/neXtProt/ProteinMitochondria.rq
index 2bf6379..799cd95 100644
--- a/C. Collaborations/neXtProt/ProteinMitochondria.rq
+++ b/C. Collaborations/neXtProt/ProteinMitochondria.rq
@@ -1,3 +1,9 @@
+# title: Mitochondrial Proteins via neXtProt
+# category: Collaborations
+# description: Identifies mitochondrial proteins in Rett syndrome pathways by querying
+# description: the neXtProt SPARQL endpoint for gold-quality mitochondrion localization annotations.
+# description: May be slower due to external endpoint dependency.
+
PREFIX :
PREFIX cv:
diff --git a/C. Collaborations/smallMolecules_Rhea_IDSM/molecularSimularity_Reactions.rq b/C. Collaborations/smallMolecules_Rhea_IDSM/molecularSimularity_Reactions.rq
index c2e632a..4187157 100644
--- a/C. Collaborations/smallMolecules_Rhea_IDSM/molecularSimularity_Reactions.rq
+++ b/C. Collaborations/smallMolecules_Rhea_IDSM/molecularSimularity_Reactions.rq
@@ -1,3 +1,9 @@
+# title: Molecular Similarity Reactions via Rhea and IDSM
+# category: Collaborations
+# description: Finds structurally similar compounds for reaction sources and targets in a
+# description: pathway by querying the IDSM structure search service for molecular similarity. May be
+# description: slower due to external endpoint dependency.
+
PREFIX owl:
PREFIX ebi:
PREFIX sachem:
diff --git a/D. General/GenesofPathway.rq b/D. General/GenesofPathway.rq
index f040b00..5d5e7ca 100644
--- a/D. General/GenesofPathway.rq
+++ b/D. General/GenesofPathway.rq
@@ -1,8 +1,14 @@
+# title: Genes of a Pathway
+# category: General
+# description: Lists all gene products in a given pathway, returning the pathway
+# description: identifier and gene product labels.
+# param: pathwayId | string | WP1560 | Pathway ID
+
select distinct ?pathway (str(?label) as ?geneProduct) where {
- ?geneProduct a wp:GeneProduct .
+ ?geneProduct a wp:GeneProduct .
?geneProduct rdfs:label ?label .
?geneProduct dcterms:isPartOf ?pathwayRev .
?pathwayRev a wp:Pathway .
?pathwayRev dc:identifier ?pathway .
- ?pathwayRev dcterms:identifier "WP1560" . #Replace "WP1560" with WP ID of interest
+ ?pathwayRev dcterms:identifier "{{pathwayId}}" .
}
diff --git a/D. General/InteractionsofPathway.rq b/D. General/InteractionsofPathway.rq
index cf65977..6e61a67 100644
--- a/D. General/InteractionsofPathway.rq
+++ b/D. General/InteractionsofPathway.rq
@@ -1,8 +1,14 @@
+# title: Interactions of a Pathway
+# category: General
+# description: Returns all interactions in a given pathway along with the
+# description: participating data nodes and their labels.
+# param: pathwayId | string | WP1425 | Pathway ID
+
SELECT DISTINCT ?pathway ?interaction ?participants ?DataNodeLabel
WHERE {
?pathway a wp:Pathway ;
- dc:identifier .
+ dc:identifier .
?interaction dcterms:isPartOf ?pathway ;
a wp:Interaction ;
wp:participants ?participants .
diff --git a/D. General/MetabolitesofPathway.rq b/D. General/MetabolitesofPathway.rq
index f4f2497..cbf2e27 100644
--- a/D. General/MetabolitesofPathway.rq
+++ b/D. General/MetabolitesofPathway.rq
@@ -1,7 +1,13 @@
+# title: Metabolites of a Pathway
+# category: General
+# description: Lists all metabolites in a given pathway, returning the pathway
+# description: identifier and metabolite labels.
+# param: pathwayId | string | WP1560 | Pathway ID
+
select distinct ?pathway (str(?label) as ?Metabolite) where {
- ?Metabolite a wp:Metabolite ;
+ ?Metabolite a wp:Metabolite ;
rdfs:label ?label ;
dcterms:isPartOf ?pathway .
?pathway a wp:Pathway ;
- dcterms:identifier "WP1560" . #Replace "WP1560" with WP ID of interest
+ dcterms:identifier "{{pathwayId}}" .
}
diff --git a/D. General/OntologyofPathway.rq b/D. General/OntologyofPathway.rq
index f4a715f..9dba2e0 100644
--- a/D. General/OntologyofPathway.rq
+++ b/D. General/OntologyofPathway.rq
@@ -1,9 +1,15 @@
-SELECT (?o as ?pwOntologyTerm) (str(?titleLit) as ?title) ?pathway
+# title: Ontology Terms of a Pathway
+# category: General
+# description: Retrieves all ontology tags associated with a given pathway,
+# description: returning the ontology term URI, pathway title, and identifier.
+# param: pathwayId | string | WP1560 | Pathway ID
+
+SELECT (?o as ?pwOntologyTerm) (str(?titleLit) as ?title) ?pathway
WHERE {
?pathwayRDF wp:ontologyTag ?o ;
dc:identifier ?pathway ;
dc:title ?titleLit ;
- dcterms:identifier "WP1560" . #Replace "WP1560" with WP ID of interest
+ dcterms:identifier "{{pathwayId}}" .
FILTER (! regex(str(?pathway), "group"))
}
diff --git a/E. Literature/allPathwayswithPubMed.rq b/E. Literature/allPathwayswithPubMed.rq
index 1716dee..76e6299 100644
--- a/E. Literature/allPathwayswithPubMed.rq
+++ b/E. Literature/allPathwayswithPubMed.rq
@@ -1,6 +1,11 @@
-SELECT DISTINCT ?pathway ?pubmed
-WHERE
- {?pubmed a wp:PublicationReference .
+# title: All Pathways with PubMed References
+# category: Literature
+# description: Lists pathways that have associated PubMed publication references,
+# description: returning pathway and PubMed identifiers ordered by pathway (first 50 rows only).
+
+SELECT DISTINCT ?pathway ?pubmed
+WHERE
+ {?pubmed a wp:PublicationReference .
?pubmed dcterms:isPartOf ?pathway }
ORDER BY ?pathway
LIMIT 50
diff --git a/E. Literature/allReferencesForInteraction.rq b/E. Literature/allReferencesForInteraction.rq
index b44b619..f4af000 100644
--- a/E. Literature/allReferencesForInteraction.rq
+++ b/E. Literature/allReferencesForInteraction.rq
@@ -1,6 +1,13 @@
+# title: All References for an Interaction
+# category: Literature
+# description: Returns all publication references for interactions in a given
+# description: pathway, including references attached to both the interaction itself and its
+# description: participating data nodes.
+# param: pathwayId | string | WP5200 | Pathway ID
+
SELECT DISTINCT ?pathway ?interaction ?pubmed ?partnerref WHERE {
?pathway a wp:Pathway ;
- dc:identifier .
+ dc:identifier .
?interaction dcterms:isPartOf ?pathway ;
a wp:Interaction ;
wp:participants ?partner;
diff --git a/E. Literature/countRefsPerPW.rq b/E. Literature/countRefsPerPW.rq
index 95a6891..c014b65 100644
--- a/E. Literature/countRefsPerPW.rq
+++ b/E. Literature/countRefsPerPW.rq
@@ -1,5 +1,10 @@
+# title: Reference Count per Pathway
+# category: Literature
+# description: Counts the number of PubMed publication references per pathway,
+# description: sorted by descending reference count.
+
SELECT DISTINCT ?pathway COUNT(?pubmed) AS ?numberOfReferences
-WHERE
- {?pubmed a wp:PublicationReference .
+WHERE
+ {?pubmed a wp:PublicationReference .
?pubmed dcterms:isPartOf ?pathway }
-ORDER BY DESC(?numberOfReferences)
+ORDER BY DESC(?numberOfReferences)
diff --git a/E. Literature/referencesForInteraction.rq b/E. Literature/referencesForInteraction.rq
index 64ab62c..2e37acb 100644
--- a/E. Literature/referencesForInteraction.rq
+++ b/E. Literature/referencesForInteraction.rq
@@ -1,8 +1,14 @@
+# title: References for an Interaction
+# category: Literature
+# description: Returns publication references directly attached to interactions in a
+# description: given pathway, together with the pathway and interaction they belong to.
+# param: pathwayId | string | WP5200 | Pathway ID
+
SELECT DISTINCT ?pathway ?interaction ?pubmed
WHERE {
?pathway a wp:Pathway ;
- dc:identifier . #filter for one pathway
+ dc:identifier . #filter for one pathway
?interaction dcterms:isPartOf ?pathway ;
a wp:Interaction ;
dcterms:references ?pubmed ;
diff --git a/E. Literature/referencesForSpecificInteraction.rq b/E. Literature/referencesForSpecificInteraction.rq
index 3d3aaff..8237dfa 100644
--- a/E. Literature/referencesForSpecificInteraction.rq
+++ b/E. Literature/referencesForSpecificInteraction.rq
@@ -1,8 +1,15 @@
+# title: References for a Specific Interaction
+# category: Literature
+# description: Returns publication references for a single interaction identified by
+# description: both a pathway and a specific participant URI.
+# param: pathwayId | string | WP5200 | Pathway ID
+# param: proteinId | string | P35498 | UniProt Protein ID
+
SELECT DISTINCT ?pathway ?interaction ?pubmed WHERE {
?pathway a wp:Pathway .
- ?pathway dc:identifier . #filter for pathway
+ ?pathway dc:identifier . #filter for pathway
?interaction dcterms:isPartOf ?pathway .
?interaction a wp:Interaction .
- ?interaction wp:participants . #filter for interaction
+ ?interaction wp:participants . #filter for interaction
?interaction dcterms:references ?pubmed .
} LIMIT 100
diff --git a/F. Datadump/CyTargetLinkerLinksetInput.rq b/F. Datadump/CyTargetLinkerLinksetInput.rq
index cf0ae34..fc124b8 100644
--- a/F. Datadump/CyTargetLinkerLinksetInput.rq
+++ b/F. Datadump/CyTargetLinkerLinksetInput.rq
@@ -1,3 +1,9 @@
+# title: CyTargetLinker Linkset Input
+# category: Data Export
+# description: Exports pathway-gene associations formatted as input for
+# description: CyTargetLinker, a Cytoscape app for link set analysis. Returns pathway names and
+# description: IDs paired with HGNC gene symbols and Entrez Gene IDs.
+
select distinct (str(?title) as ?PathwayName) (str(?wpid) as ?PathwayID) (fn:substring(?genename,37) as ?GeneName) (fn:substring(?ncbiGeneId,34) as ?GeneID) where {
?gene a wp:DataNode ;
dcterms:identifier ?id ;
diff --git a/F. Datadump/dumpOntologyAndPW.rq b/F. Datadump/dumpOntologyAndPW.rq
index 410959a..77fbc53 100644
--- a/F. Datadump/dumpOntologyAndPW.rq
+++ b/F. Datadump/dumpOntologyAndPW.rq
@@ -1,3 +1,8 @@
+# title: Ontology and Pathway Data Export
+# category: Data Export
+# description: Exports pathway metadata including page URLs, titles, species, identifiers,
+# description: and associated ontology tags for bulk download.
+
SELECT DISTINCT ?depicts (str(?titleLit) as ?title) (str(?speciesLabelLit) as ?speciesLabel) ?identifier ?ontology
WHERE {
?pathway foaf:page ?depicts .
diff --git a/F. Datadump/dumpPWsofSpecies.rq b/F. Datadump/dumpPWsofSpecies.rq
index 01020a6..ea37d83 100644
--- a/F. Datadump/dumpPWsofSpecies.rq
+++ b/F. Datadump/dumpPWsofSpecies.rq
@@ -1,8 +1,14 @@
+# title: Pathways by Species Data Export
+# category: Data Export
+# description: Exports all pathways for a given species, returning identifiers,
+# description: titles, and page URLs ordered by pathway ID.
+# param: species | string | Mus musculus | Species
+
SELECT DISTINCT ?wpIdentifier ?pathway ?title ?page
WHERE {
?pathway dc:title ?title ;
foaf:page ?page ;
dc:identifier ?wpIdentifier ;
- wp:organismName "Mus musculus" .
+ wp:organismName "{{species}}" .
}
ORDER BY ?wpIdentifier
diff --git a/G. Curation/MetabolitesDoubleMappingWikidata.rq b/G. Curation/MetabolitesDoubleMappingWikidata.rq
index e266d47..99fb6cc 100644
--- a/G. Curation/MetabolitesDoubleMappingWikidata.rq
+++ b/G. Curation/MetabolitesDoubleMappingWikidata.rq
@@ -1,4 +1,7 @@
-# Finding double mappings to Wikidata for metabolites:
+# title: Metabolites with Duplicate Wikidata Mappings
+# category: Curation
+# description: Detects metabolites that are mapped to more than one Wikidata
+# description: identifier, listing all duplicate mappings per metabolite.
PREFIX wdt:
diff --git a/G. Curation/MetabolitesNotClassified.rq b/G. Curation/MetabolitesNotClassified.rq
index ef60820..b75febe 100644
--- a/G. Curation/MetabolitesNotClassified.rq
+++ b/G. Curation/MetabolitesNotClassified.rq
@@ -1,4 +1,7 @@
-#Metabolites not classified as such
+# title: Unclassified Metabolites
+# category: Curation
+# description: Finds data nodes with a data source annotation that are not classified
+# description: as metabolites, grouped by data source with counts sorted descending.
prefix wp:
prefix rdfs:
diff --git a/G. Curation/MetabolitesWithoutLinkWikidata.rq b/G. Curation/MetabolitesWithoutLinkWikidata.rq
index 0ad8ae6..4e5e2d2 100644
--- a/G. Curation/MetabolitesWithoutLinkWikidata.rq
+++ b/G. Curation/MetabolitesWithoutLinkWikidata.rq
@@ -1,4 +1,7 @@
-#Metabolites without a link to Wikidata
+# title: Metabolites Without Wikidata Links
+# category: Curation
+# description: Lists metabolites that have no Wikidata identifier mapping, useful for
+# description: identifying gaps in cross-database linkage.
PREFIX wdt:
diff --git a/G. Curation/PWsWithoutDatanodes.rq b/G. Curation/PWsWithoutDatanodes.rq
index 7e1f0d9..6c94c22 100644
--- a/G. Curation/PWsWithoutDatanodes.rq
+++ b/G. Curation/PWsWithoutDatanodes.rq
@@ -1,4 +1,7 @@
-#Pathways without (annotated) datanodes
+# title: Pathways Without Data Nodes
+# category: Curation
+# description: Finds pathways that contain no data nodes, indicating empty or
+# description: incomplete pathway entries that may need curation.
prefix wp:
prefix rdfs:
diff --git a/G. Curation/PWsWithoutRef.rq b/G. Curation/PWsWithoutRef.rq
index 2073eb7..2442109 100644
--- a/G. Curation/PWsWithoutRef.rq
+++ b/G. Curation/PWsWithoutRef.rq
@@ -1,4 +1,7 @@
-#Pathways without literature references
+# title: Pathways Without References
+# category: Curation
+# description: Lists pathways that have no associated publication references,
+# description: returning species, title, and pathway identifier sorted alphabetically.
SELECT (STR(?speciesLabelLit) AS ?species) (STR(?titleLit) AS ?title) ?pathway WHERE {
?pathway a wp:Pathway ; dc:title ?titleLit ; wp:organismName ?speciesLabelLit .
diff --git a/G. Curation/countPWsMetabolitesOccurSorted.rq b/G. Curation/countPWsMetabolitesOccurSorted.rq
index 5fccacf..ac5bffb 100644
--- a/G. Curation/countPWsMetabolitesOccurSorted.rq
+++ b/G. Curation/countPWsMetabolitesOccurSorted.rq
@@ -1,4 +1,7 @@
-#Sorting the metabolites by the number of pathways they occur in
+# title: Metabolites Ranked by Pathway Occurrence Count
+# category: Curation
+# description: Counts how many pathways each metabolite appears in, filtered to
+# description: metabolites without a Wikidata mapping, sorted by descending pathway count.
PREFIX wdt:
diff --git a/G. Curation/countPWsWithoutRef.rq b/G. Curation/countPWsWithoutRef.rq
index b726bb7..e6c09d2 100644
--- a/G. Curation/countPWsWithoutRef.rq
+++ b/G. Curation/countPWsWithoutRef.rq
@@ -1,3 +1,8 @@
+# title: Count of Pathways Without References
+# category: Curation
+# description: Returns the total number of pathways that have no associated
+# description: publication references.
+
SELECT count(DISTINCT ?pathway) WHERE {
?pathway a wp:Pathway ; dc:title ?titleLit ; wp:organismName ?speciesLabelLit .
MINUS { ?pubmed a wp:PublicationReference .
diff --git a/H. Chemistry/IDSM_similaritySearch.rq b/H. Chemistry/IDSM_similaritySearch.rq
index b26ab26..1b7bac5 100644
--- a/H. Chemistry/IDSM_similaritySearch.rq
+++ b/H. Chemistry/IDSM_similaritySearch.rq
@@ -1,3 +1,11 @@
+# title: IDSM Chemical Similarity Search
+# category: Chemistry
+# description: Finds structurally similar ChEBI compounds for source and target
+# description: metabolites in a pathway's directed interactions via the IDSM/ChEBI structure
+# description: search service (idsm.elixir-czech.cz). May be slower due to external endpoint
+# description: dependency.
+# param: pathwayId | string | WP4225 | Pathway ID
+
PREFIX owl:
PREFIX ebi:
PREFIX sachem:
@@ -8,14 +16,14 @@ PREFIX sso:
PREFIX rh:
PREFIX rdfs:
PREFIX xsd:
-SELECT distinct ((substr(str(?chebioSrc),32)) as ?SourceOrigin) ((substr(str(?similarSrc),32)) as ?SourceSimilar) ((substr(str(?chebioTgt),32)) as ?TargetOrigin) ((substr(str(?similarTgt),32)) as ?TargetSimilar) #?reaction
+SELECT distinct ((substr(str(?chebioSrc),32)) as ?SourceOrigin) ((substr(str(?similarSrc),32)) as ?SourceSimilar) ((substr(str(?chebioTgt),32)) as ?TargetOrigin) ((substr(str(?similarTgt),32)) as ?TargetSimilar) #?reaction
WHERE {
?interaction dcterms:isPartOf ?pathway ; a wp:Conversion ;
wp:source ?source ;
wp:target ?target .
?source wp:bdbChEBI ?chebiSrc .
?target wp:bdbChEBI ?chebiTgt .
- ?pathway dcterms:identifier "WP4225".
+ ?pathway dcterms:identifier "{{pathwayId}}".
BIND(iri(concat("http://purl.obolibrary.org/obo/CHEBI_", substr(str(?chebiSrc),37))) AS ?chebioSrc)
BIND(iri(concat("http://purl.obolibrary.org/obo/CHEBI_", substr(str(?chebiTgt),37))) AS ?chebioTgt)
#IDSM
diff --git a/H. Chemistry/smiles.rq b/H. Chemistry/smiles.rq
index 7566849..22f53e3 100644
--- a/H. Chemistry/smiles.rq
+++ b/H. Chemistry/smiles.rq
@@ -1,3 +1,8 @@
+# title: SMILES for Metabolites
+# category: Chemistry
+# description: Retrieves SMILES chemical structure notations for metabolites via
+# description: their Wikidata links.
+
PREFIX cheminf:
SELECT ?mol ?smilesDepict WHERE {
diff --git a/HEADER_CONVENTIONS.md b/HEADER_CONVENTIONS.md
new file mode 100644
index 0000000..ac9ec4b
--- /dev/null
+++ b/HEADER_CONVENTIONS.md
@@ -0,0 +1,180 @@
+# Header Conventions Guide
+
+Definitive reference for `.rq` file header format in the WikiPathways SPARQL query collection. All enrichment work (Phases 2-4) must follow these rules.
+
+## Header Format Overview
+
+- Headers are comment lines (`#`) at the **top** of `.rq` files
+- The header block ends at the **first blank line**
+- One blank line separates headers from the SPARQL query body
+- Fields use the format `# field: value`
+
+## Field Order
+
+Headers must appear in this order:
+
+```
+# title: [value]
+# category: [value]
+# description: [value]
+# description: [continued value if multi-line]
+# keywords: [optional, comma-separated]
+# param: [optional, pipe-delimited]
+```
+
+**Required fields:** `title`, `category`, `description`
+**Optional fields:** `keywords`, `param`
+
+## Field Specifications
+
+### `# title:` (required)
+
+One line. Clear, human-readable display name for the SNORQL UI.
+
+- Derived from query purpose, not the filename
+- Use title case
+- Keep concise (under ~60 characters)
+
+| Good | Bad |
+|-----------------------------------|--------------------------|
+| `# title: All Pathways for Species` | `# title: allPathwaysBySpecies` |
+| `# title: Gene-Pathway Associations` | `# title: query1` |
+
+### `# category:` (required)
+
+One line. Exactly one value from the controlled vocabulary in `categories.json`.
+
+Valid values: Metadata, Data Sources, Communities, Collaborations, General, Literature, Data Export, Curation, Chemistry, DSMN, Authors.
+
+The category is determined by the query's directory location. See `categories.json` for the directory-to-category mapping.
+
+### `# description:` (required)
+
+Explains what the query does and what results to expect.
+
+**Single-line:**
+```
+# description: Lists all pathways in the WikiPathways database.
+```
+
+**Multi-line:** Repeat the `# description:` prefix on each continuation line. This is required because the SNORQL parser collects all lines matching the `# description:` prefix. Bare continuation lines (e.g., `# continued text`) are NOT captured by the UI.
+
+```
+# description: Lists all pathways tagged with the AOP community.
+# description: Returns pathway identifiers, titles, and organism.
+```
+
+**Federated queries** (those containing `SERVICE` clauses) should mention federation and potential performance impact:
+```
+# description: Retrieves compound mappings from MetaNetX via federation.
+# description: Uses a federated SERVICE call; may be slow depending on endpoint availability.
+```
+
+### `# keywords:` (optional, future)
+
+Comma-separated values on one line. NOT currently rendered by the SNORQL UI but included for future compatibility.
+
+```
+# keywords: pathways, species, metadata
+```
+
+### `# param:` (optional, Phase 4)
+
+Pipe-delimited format for parameterized queries:
+
+```
+# param: name | type | defaultValue | label
+```
+
+**Supported types:**
+- `string` -- free-text input
+- `uri` -- expects a URI value
+- `enum:val1,val2,val3` -- dropdown selection
+
+Multiple parameters use multiple `# param:` lines.
+
+## SNORQL Parser Behavior
+
+The SNORQL parser scans **all lines** in the file for field-prefixed patterns, not just leading lines. This means:
+
+1. `# title:`, `# category:`, `# description:`, and `# param:` prefixes must **only** appear in the header block
+2. Inline SPARQL comments elsewhere in the file must **not** use these exact prefixes
+3. Use alternative phrasing for inline comments (e.g., `# Note: this filters by species` instead of `# description: this filters by species`)
+
+## Existing Comments Handling
+
+During enrichment (Phase 2+):
+
+- **Descriptive comments** at the top of `.rq` files should be interpreted and absorbed into `# description:` headers
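+- **Inline usage hints** (e.g., `# Replace "WP1560" with WP ID of interest`) remain as inline comments BELOW the header block; they are not folded into the description. Once the hard-coded value is replaced by a `{{placeholder}}` with a matching `# param:` line (Phase 4), the hint is obsolete and is removed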
+- **Existing `# title:` or `# description:` lines** that already follow the conventions are kept as-is
+
+## TTL Metadata Mapping
+
+For queries with `.ttl` source files, the following mapping applies. This is documented for future reference; TTL metadata extraction is NOT implemented in Phase 1.
+
+| TTL Field | Header Field | Notes |
+|--------------------|-------------------|--------------------------------------------|
+| `rdfs:label` | `# title:` | If present; otherwise derive from filename |
+| `rdfs:comment` | `# description:` | May need splitting into multiple lines |
+| `schema:keywords` | `# keywords:` | NOT mapped to `# category:` |
+| (folder location) | `# category:` | Always derived from directory, never TTL |
+
+## Complete Examples
+
+### Example 1: Minimal query (title + category + description)
+
+```sparql
+# title: All Pathways
+# category: General
+# description: Returns all pathways in the WikiPathways database with their titles and organisms.
+
+SELECT DISTINCT ?pathway ?title ?organism
+WHERE {
+ ?pathway a wp:Pathway ;
+ dc:title ?title ;
+ wp:organismName ?organism .
+}
+ORDER BY ?title
+```
+
+### Example 2: Multi-line description
+
+```sparql
+# title: AOP Community Pathways
+# category: Communities
+# description: Lists all pathways tagged with the Adverse Outcome Pathway (AOP) community.
+# description: Returns pathway identifiers, titles, and last revision dates.
+# description: Useful for tracking AOP-related content in WikiPathways.
+
+SELECT ?pathway ?title ?date
+WHERE {
+ ?pathway a wp:Pathway ;
+ dc:title ?title ;
+ dcterms:subject cur:AOP ;
+ pav:lastRefreshedOn ?date .
+}
+ORDER BY DESC(?date)
+```
+
+### Example 3: Parameterized query (Phase 4)
+
+```sparql
+# title: Pathways by Species
+# category: General
+# description: Returns all pathways for a given species.
+# param: species | enum:Homo sapiens,Mus musculus,Rattus norvegicus,... | Homo sapiens | Species
+
+SELECT ?pathway ?title
+WHERE {
+ ?pathway a wp:Pathway ;
+ dc:title ?title ;
+ wp:organismName "{{species}}" .
+}
+ORDER BY ?title
+```
+
+---
+
+*Reference document for WikiPathways SPARQL query header enrichment.*
+*Controlled category vocabulary: see `categories.json`.*
diff --git a/I. DirectedSmallMoleculesNetwork (DSMN)/controlling duplicate mappings from Wikidata.rq b/I. DirectedSmallMoleculesNetwork (DSMN)/controlling duplicate mappings from Wikidata.rq
index 0bc6003..4eeb83e 100644
--- a/I. DirectedSmallMoleculesNetwork (DSMN)/controlling duplicate mappings from Wikidata.rq
+++ b/I. DirectedSmallMoleculesNetwork (DSMN)/controlling duplicate mappings from Wikidata.rq
@@ -1,3 +1,8 @@
+# title: Controlling Duplicate Mappings from Wikidata
+# category: DSMN
+# description: Detects metabolites mapped to multiple Wikidata identifiers as a
+# description: quality control step in the DSMN workflow.
+
### Part 1: ###
#Required prefixes for querying WikiPathways content in Blazegraph
PREFIX gpml:
diff --git a/I. DirectedSmallMoleculesNetwork (DSMN)/extracting directed metabolic reactions.rq b/I. DirectedSmallMoleculesNetwork (DSMN)/extracting directed metabolic reactions.rq
index 53d0931..c165db5 100644
--- a/I. DirectedSmallMoleculesNetwork (DSMN)/extracting directed metabolic reactions.rq
+++ b/I. DirectedSmallMoleculesNetwork (DSMN)/extracting directed metabolic reactions.rq
@@ -1,12 +1,19 @@
+# title: Extracting Directed Metabolic Reactions
+# category: DSMN
+# description: Extracts directed metabolite-to-metabolite interactions from pathways of the
+# description: selected species in the AnalysisCollection, returning source and target
+# description: identifiers, interaction types, and Rhea IDs as part of the DSMN workflow.
+# param: species | string | Homo sapiens | Species
+
### Part 1: ###
-SELECT DISTINCT ?interaction ?sourceDb ?targetDb ?mimtype
-?pathway (str(?titleLit) as ?title)
+SELECT DISTINCT ?interaction ?sourceDb ?targetDb ?mimtype
+?pathway (str(?titleLit) as ?title)
?sourceCHEBI ?targetDbCHEBI ?sourceHMDB ?targetDbHMDB ?InteractionID
WHERE {
### Part 2: ###
?pathway a wp:Pathway ;
- wp:organismName "Homo sapiens" ;
+ wp:organismName "{{species}}" ;
dc:title ?titleLit .
### Part 3A: ###
diff --git a/I. DirectedSmallMoleculesNetwork (DSMN)/extracting ontologies and references for metabolic reactions.rq b/I. DirectedSmallMoleculesNetwork (DSMN)/extracting ontologies and references for metabolic reactions.rq
index 7a91a0e..595c345 100644
--- a/I. DirectedSmallMoleculesNetwork (DSMN)/extracting ontologies and references for metabolic reactions.rq
+++ b/I. DirectedSmallMoleculesNetwork (DSMN)/extracting ontologies and references for metabolic reactions.rq
@@ -1,9 +1,16 @@
+# title: Extracting Ontologies and References for Metabolic Reactions
+# category: DSMN
+# description: Retrieves ontology annotations, curation status, and literature
+# description: references for directed metabolic reactions in pathways of the selected
+# description: species as part of the DSMN workflow.
+# param: species | string | Homo sapiens | Species
+
### Part 1: ###
-SELECT DISTINCT ?interaction ?sourceDb ?targetDb ?PWOnt ?DiseaseOnt
+SELECT DISTINCT ?interaction ?sourceDb ?targetDb ?PWOnt ?DiseaseOnt
?curationstatus ?InteractionRef ?PWref ?sourceLit ?targetLit
WHERE {
?pathway a wp:Pathway ;
- wp:organismName "Homo sapiens";
+ wp:organismName "{{species}}";
dc:title ?titleLit .
?interaction dcterms:isPartOf ?pathway ;
a wp:DirectedInteraction ;
diff --git a/I. DirectedSmallMoleculesNetwork (DSMN)/extracting protein titles and identifiers for metabolic reactions.rq b/I. DirectedSmallMoleculesNetwork (DSMN)/extracting protein titles and identifiers for metabolic reactions.rq
index 0ec618e..10efc47 100644
--- a/I. DirectedSmallMoleculesNetwork (DSMN)/extracting protein titles and identifiers for metabolic reactions.rq
+++ b/I. DirectedSmallMoleculesNetwork (DSMN)/extracting protein titles and identifiers for metabolic reactions.rq
@@ -1,9 +1,16 @@
+# title: Extracting Protein Titles and Identifiers for Metabolic Reactions
+# category: DSMN
+# description: Extracts catalyzing proteins for directed metabolic reactions in
+# AnalysisCollection pathways of the selected species (default: Homo sapiens),
+# returning Ensembl identifiers and protein names as part of the DSMN workflow.
+# param: species | string | Homo sapiens | Species
+
### Part 1: ###
-SELECT DISTINCT ?interaction ?sourceDb ?targetDb ?proteinDBWPs ?proteinName
-WHERE {
+SELECT DISTINCT ?interaction ?sourceDb ?targetDb ?proteinDBWPs ?proteinName
+WHERE {
?pathway a wp:Pathway ;
wp:ontologyTag cur:AnalysisCollection ;
-wp:organismName "Homo sapiens";
+wp:organismName "{{species}}";
dc:title ?titleLit .
?interaction dcterms:isPartOf ?pathway ;
a wp:DirectedInteraction ;
diff --git a/J. Authors/authorsOfAPathway.rq b/J. Authors/authorsOfAPathway.rq
index 0093a44..2e6caf8 100644
--- a/J. Authors/authorsOfAPathway.rq
+++ b/J. Authors/authorsOfAPathway.rq
@@ -1,10 +1,16 @@
+# title: Authors of a Pathway
+# category: Authors
+# description: Lists all authors of a given pathway in ordinal order, returning
+# name, ORCID, homepage, and pathway version.
+# param: pathwayId | string | WP4846 | Pathway ID
+
PREFIX dc:
PREFIX foaf:
PREFIX wpq:
PREFIX pav:
SELECT ?pathway ?version ?ordinal ?author_ ?name ?orcid ?page WHERE {
- VALUES ?pathway { }
+ VALUES ?pathway { }
?author_ a foaf:Person ;
wp:hasAuthorship ?authorship .
?authorship ^wp:hasAuthorship ?pathway ;
diff --git a/J. Authors/contributors.rq b/J. Authors/contributors.rq
index c59dafd..4519e6e 100644
--- a/J. Authors/contributors.rq
+++ b/J. Authors/contributors.rq
@@ -1,3 +1,8 @@
+# title: All Contributors
+# category: Authors
+# description: Counts the number of pathways each first author (ordinal 1)
+# contributes to, sorted by descending pathway count.
+
PREFIX dc:
PREFIX foaf:
PREFIX wpq:
diff --git a/J. Authors/firstAuthors.rq b/J. Authors/firstAuthors.rq
index a442bdd..5789a35 100644
--- a/J. Authors/firstAuthors.rq
+++ b/J. Authors/firstAuthors.rq
@@ -1,3 +1,8 @@
+# title: First Authors of Pathways
+# category: Authors
+# description: Lists the first author (ordinal 1) of each pathway, ordered by
+# pathway version number.
+
PREFIX dc:
PREFIX foaf:
PREFIX wpq:
diff --git a/J. Authors/pathwayCountWithAtLeastXAuthors.rq b/J. Authors/pathwayCountWithAtLeastXAuthors.rq
index 2026e4e..fb93921 100644
--- a/J. Authors/pathwayCountWithAtLeastXAuthors.rq
+++ b/J. Authors/pathwayCountWithAtLeastXAuthors.rq
@@ -1,3 +1,8 @@
+# title: Pathways with Multiple Authors
+# category: Authors
+# description: Counts how many pathways have at least N authors for each author
+# ordinal position, showing the distribution of author counts across pathways.
+
PREFIX dc:
PREFIX wpq:
diff --git a/categories.json b/categories.json
new file mode 100644
index 0000000..ed9710e
--- /dev/null
+++ b/categories.json
@@ -0,0 +1,50 @@
+{
+ "categories": {
+ "Metadata": [
+ "A. Metadata/",
+ "A. Metadata/datacounts/",
+ "A. Metadata/species/"
+ ],
+ "Data Sources": [
+ "A. Metadata/datasources/"
+ ],
+ "Communities": [
+ "B. Communities/AOP/",
+ "B. Communities/CIRM Stem Cell Pathways/",
+ "B. Communities/COVID19/",
+ "B. Communities/Inborn Errors of Metabolism/",
+ "B. Communities/Lipids/",
+ "B. Communities/RareDiseases/",
+ "B. Communities/Reactome/",
+ "B. Communities/WormBase/"
+ ],
+ "Collaborations": [
+ "C. Collaborations/AOP-Wiki/",
+ "C. Collaborations/MetaNetX/",
+ "C. Collaborations/MolMeDB/",
+ "C. Collaborations/neXtProt/",
+ "C. Collaborations/smallMolecules_Rhea_IDSM/"
+ ],
+ "General": [
+ "D. General/"
+ ],
+ "Literature": [
+ "E. Literature/"
+ ],
+ "Data Export": [
+ "F. Datadump/"
+ ],
+ "Curation": [
+ "G. Curation/"
+ ],
+ "Chemistry": [
+ "H. Chemistry/"
+ ],
+ "DSMN": [
+ "I. DirectedSmallMoleculesNetwork (DSMN)/"
+ ],
+ "Authors": [
+ "J. Authors/"
+ ]
+ }
+}
diff --git a/scripts/lint_headers.py b/scripts/lint_headers.py
new file mode 100644
index 0000000..627302d
--- /dev/null
+++ b/scripts/lint_headers.py
@@ -0,0 +1,73 @@
+"""CI lint script: validates required headers on all .rq query files."""
+
+import pathlib
+import re
+import sys
+
+ROOT = pathlib.Path(__file__).resolve().parent.parent
+EXCLUDED_DIRS = {".planning", ".git", ".github", "scripts", "tests"}
+
+REQUIRED_FIELDS = ["title", "category", "description"]
+FIELD_PATTERNS = {
+ field: re.compile(rf"^# {field}: .+") for field in REQUIRED_FIELDS
+}
+
+
+def find_rq_files():
+    """Collect every query file under ROOT, in sorted order.
+
+    A file is left out when the first component of its ROOT-relative path is
+    one of EXCLUDED_DIRS.
+    """
+    def is_query(path):
+        head = path.relative_to(ROOT).parts[:1]
+        return not (head and head[0] in EXCLUDED_DIRS)
+    return [path for path in sorted(ROOT.rglob("*.rq")) if is_query(path)]
+
+
+def parse_header(filepath):
+    """Return the leading '#' lines of an .rq file, line endings removed."""
+    header = []
+    with open(filepath, encoding="utf-8") as handle:
+        for raw in handle:
+            text = raw.rstrip("\n\r")
+            if not text.startswith("#"):
+                # First blank or non-comment line ends the header block.
+                break
+            header.append(text)
+    return header
+
+
+def lint_file(filepath):
+    """Report which required header fields an .rq file lacks.
+
+    Each field in REQUIRED_FIELDS must match its FIELD_PATTERNS regex on at
+    least one header line. Returns one message per missing field, in
+    REQUIRED_FIELDS order; the list is empty when the file passes.
+    """
+    header_lines = parse_header(filepath)
+    shown_path = filepath.relative_to(ROOT)
+    def present(name):
+        return any(FIELD_PATTERNS[name].match(text) for text in header_lines)
+    missing = [name for name in REQUIRED_FIELDS if not present(name)]
+    return [f"{shown_path}: missing '# {name}:' header" for name in missing]
+
+
+def main():
+    """Run the header lint over every query file and exit with its status.
+
+    Prints one 'ERROR:' line per problem and exits 1; with no problems it
+    prints a summary and exits 0.
+    """
+    targets = find_rq_files()
+    problems = [msg for target in targets for msg in lint_file(target)]
+
+    if not problems:
+        print(f"OK: {len(targets)} files passed lint check")
+        sys.exit(0)
+    print("\n".join(f"ERROR: {msg}" for msg in problems))
+    sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/transformDotTtlToDotSparql.py b/scripts/transformDotTtlToDotSparql.py
index 01e755c..821061e 100644
--- a/scripts/transformDotTtlToDotSparql.py
+++ b/scripts/transformDotTtlToDotSparql.py
@@ -2,29 +2,76 @@
import glob
from rdflib import Graph
-# Path to Turtle files
-ttl_files_path = '**/*.ttl'
-# Get the list of .ttl files
-ttl_files = glob.glob(ttl_files_path, recursive=True)
+def extract_header(filepath):
+    """Return the structured metadata header at the top of an .rq file.
+
+    Collects the leading run of '#' lines (ending at the first blank or
+    non-comment line) and returns it newline-terminated. Returns "" when the
+    file is missing or the run holds no '# title:'-style field, so that plain
+    SPARQL comments opening a query are not re-emitted above the fresh query.
+    """
+    fields = ("# title:", "# category:", "# description:", "# keywords:", "# param:")
+    header_lines = []
+    try:
+        with open(filepath, encoding="utf-8") as f:
+            for line in f:
+                stripped = line.rstrip("\n")
+                if not stripped.startswith("#"):
+                    break
+                header_lines.append(stripped)
+    except FileNotFoundError:
+        return ""
+
+    if any(line.startswith(fields) for line in header_lines):
+        return "\n".join(header_lines) + "\n"
+    return ""
-# Process each Turtle file
-for i in ttl_files:
- fn = os.path.basename(i)[0:-4] # extract name without extension
- sparql = i[0:-4] + ".rq" # create .rq filename
+
+def process_ttl_file(ttl_path):
+ """Parse a .ttl file and write the extracted SPARQL to a .rq file.
+
+ If the .rq file already exists and has a comment header block, that
+ header is preserved above the regenerated SPARQL content. If the TTL
+ contains no SPARQL query, the .rq file is not touched.
+ """
+ rq_path = ttl_path[:-4] + ".rq"
+ fn = os.path.basename(ttl_path)[:-4]
print("file: " + fn)
-
- # Open .ttl file to write
+
+ header = extract_header(rq_path)
+
g = Graph()
- g.parse(i)
+ g.parse(ttl_path)
- with open(sparql, 'w') as sparql_file:
- knows_query = """prefix sh:
+ knows_query = """prefix sh:
SELECT DISTINCT ?query ?sparql
WHERE {
?query sh:select | sh:ask | sh:construct ?sparql .
}"""
- qres = g.query(knows_query)
- for row in qres:
- sparql_file.write(f"{row.sparql}")
+ qres = g.query(knows_query)
+ sparql_content = ""
+ for row in qres:
+ sparql_content += str(row.sparql)
+
+ if not sparql_content.strip():
+ print(f" WARNING: No SPARQL found in {ttl_path}, skipping .rq write")
+ return
+
+ with open(rq_path, "w", encoding="utf-8") as sparql_file:
+ if header:
+ sparql_file.write(header + "\n")
+ sparql_file.write(sparql_content)
+
+
+# Path to Turtle files
+ttl_files_path = '**/*.ttl'
+
+if __name__ == "__main__":
+ # Get the list of .ttl files
+ ttl_files = glob.glob(ttl_files_path, recursive=True)
+
+ # Process each Turtle file
+ for i in ttl_files:
+ process_ttl_file(i)
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..85f5a38
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1 @@
+# Shared fixtures for CI script tests
diff --git a/tests/fixtures/sample.ttl b/tests/fixtures/sample.ttl
new file mode 100644
index 0000000..3150da6
--- /dev/null
+++ b/tests/fixtures/sample.ttl
@@ -0,0 +1,13 @@
+@prefix ex: .
+@prefix rdf: .
+@prefix rdfs: .
+@prefix schema: .
+@prefix sh: .
+
+ex:sample a sh:SPARQLExecutable,
+ sh:SPARQLSelectExecutable ;
+ rdfs:comment "A sample query for testing."@en ;
+ sh:prefixes _:sparql_examples_prefixes ;
+ sh:select """SELECT ?x WHERE { ?x a ?type }""" ;
+ schema:target ;
+ schema:keywords "test" .
diff --git a/tests/fixtures/sample_empty.ttl b/tests/fixtures/sample_empty.ttl
new file mode 100644
index 0000000..89ec874
--- /dev/null
+++ b/tests/fixtures/sample_empty.ttl
@@ -0,0 +1,6 @@
+@prefix ex: .
+@prefix rdf: .
+@prefix rdfs: .
+
+ex:empty a rdfs:Resource ;
+ rdfs:comment "A TTL file with no SPARQL query." .
diff --git a/tests/fixtures/sample_no_header.rq b/tests/fixtures/sample_no_header.rq
new file mode 100644
index 0000000..18813b5
--- /dev/null
+++ b/tests/fixtures/sample_no_header.rq
@@ -0,0 +1 @@
+SELECT ?old WHERE { ?old a ?type }
diff --git a/tests/fixtures/sample_with_header.rq b/tests/fixtures/sample_with_header.rq
new file mode 100644
index 0000000..43cf1bb
--- /dev/null
+++ b/tests/fixtures/sample_with_header.rq
@@ -0,0 +1,5 @@
+# title: Sample Query
+# category: Metadata
+# description: A test query.
+
+SELECT ?old WHERE { ?old a ?type }
diff --git a/tests/test_categories.py b/tests/test_categories.py
new file mode 100644
index 0000000..a0745fc
--- /dev/null
+++ b/tests/test_categories.py
@@ -0,0 +1,96 @@
+"""Validate the controlled category vocabulary against the filesystem."""
+
+import json
+import os
+import pathlib
+
+import pytest
+
+ROOT = pathlib.Path(__file__).resolve().parent.parent
+CATEGORIES_FILE = ROOT / "categories.json"
+
+EXCLUDED_DIRS = {".planning", ".git", ".github", "scripts", "tests"}
+
+
+def load_categories():
+    """Return the parsed contents of categories.json (read as UTF-8)."""
+    return json.loads(CATEGORIES_FILE.read_text(encoding="utf-8"))
+
+
+def find_rq_directories():
+    """Return the set of ROOT-relative directories that contain .rq files.
+
+    Paths use forward slashes and a trailing slash on every platform, the
+    same spelling categories.json uses for its folder entries.
+    """
+    dirs = set()
+    for rq_file in ROOT.rglob("*.rq"):
+        rel = rq_file.parent.relative_to(ROOT)
+        if not (rel.parts and rel.parts[0] in EXCLUDED_DIRS):
+            dirs.add(rel.as_posix() + "/")  # str(rel) would use backslashes on Windows
+    return dirs
+
+
+def all_mapped_dirs(data):
+    """Return the union of the folder lists of every category in data.
+
+    data is the parsed categories.json mapping with a "categories" key.
+    """
+    return set().union(*data["categories"].values())
+
+
+def category_for_dir(data, directory):
+    """Return the first category whose folder list holds directory, else None."""
+    owners = (
+        name for name, members in data["categories"].items() if directory in members
+    )
+    return next(owners, None)
+
+
+class TestCategoriesJSON:
+    """Checks categories.json against its expected shape and the query folders."""
+
+    def test_valid_json_and_structure(self):
+        """The file parses and maps category names to lists of 'dir/' strings."""
+        data = load_categories()
+        assert "categories" in data
+        categories = data["categories"]
+        assert isinstance(categories, dict)
+        for name, folders in categories.items():
+            assert isinstance(name, str)
+            assert isinstance(folders, list)
+            for folder in folders:
+                assert isinstance(folder, str)
+                assert folder.endswith("/"), f"Folder path must end with /: {folder}"
+
+    def test_exactly_11_categories(self):
+        """The controlled vocabulary holds exactly 11 names."""
+        names = list(load_categories()["categories"].keys())
+        assert len(names) == 11, (
+            f"Expected 11 categories, got {len(names)}: {names}"
+        )
+
+    def test_all_directories_covered(self):
+        """Set difference: no query folder on disk is absent from the mapping."""
+        mapped = all_mapped_dirs(load_categories())
+        unmapped = find_rq_directories() - mapped
+        assert not unmapped, (
+            f"Directories with .rq files not in any category: {sorted(unmapped)}"
+        )
+
+    def test_no_orphan_directories(self):
+        """Per-directory form of the coverage check, failing on the first gap."""
+        mapped = all_mapped_dirs(load_categories())
+        # Sorted so the directory named in a failure is deterministic.
+        for directory in sorted(find_rq_directories()):
+            assert directory in mapped, (
+                f"Directory '{directory}' contains .rq files but is not mapped"
+            )
+
+    def test_datasources_maps_to_data_sources(self):
+        """The datasources/ subfolder belongs to 'Data Sources', not 'Metadata'."""
+        target = "A. Metadata/datasources/"
+        found = category_for_dir(load_categories(), target)
+        assert found == "Data Sources", (
+            f"Expected 'Data Sources' but got '{found}' for A. Metadata/datasources/"
+        )
diff --git a/tests/test_ci_script.py b/tests/test_ci_script.py
new file mode 100644
index 0000000..5f14638
--- /dev/null
+++ b/tests/test_ci_script.py
@@ -0,0 +1,121 @@
+"""Tests for the CI TTL-to-SPARQL extraction script with header preservation."""
+
+import os
+import shutil
+import sys
+
+import pytest
+
+FIXTURES = os.path.join(os.path.dirname(__file__), "fixtures")
+
+# Add project root to path so we can import the script module
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from scripts.transformDotTtlToDotSparql import extract_header, process_ttl_file
+
+
+def _copy_fixture(src_name, dst_dir, dst_name=None):
+    """Copy fixture src_name into dst_dir (as dst_name if given); return its path."""
+    source = os.path.join(FIXTURES, src_name)
+    # A falsy dst_name (None or "") means: keep the fixture's own file name.
+    target = os.path.join(dst_dir, dst_name if dst_name else src_name)
+    shutil.copy2(source, target)
+    return target
+
+
+class TestHeaderPreservation:
+    """Test 1: Header block is preserved when .rq is regenerated from .ttl."""
+
+    def test_preserves_existing_headers(self, tmp_path):
+        ttl = _copy_fixture("sample.ttl", tmp_path)
+        _copy_fixture("sample_with_header.rq", tmp_path, "sample.rq")
+
+        process_ttl_file(str(ttl))
+
+        content = (tmp_path / "sample.rq").read_text(encoding="utf-8")  # closes the file
+        assert content.startswith("# title: Sample Query\n")
+        assert "# category: Metadata" in content
+        assert "# description: A test query." in content
+        # SPARQL should be the new one from the TTL, not the old one
+        assert "SELECT ?x WHERE { ?x a ?type }" in content
+        assert "SELECT ?old" not in content
+
+
+class TestNoHeader:
+    """Test 2: .rq with no headers stays headerless after regeneration."""
+
+    def test_no_phantom_header_injected(self, tmp_path):
+        ttl = _copy_fixture("sample.ttl", tmp_path)
+        _copy_fixture("sample_no_header.rq", tmp_path, "sample.rq")
+
+        process_ttl_file(str(ttl))
+
+        content = (tmp_path / "sample.rq").read_text(encoding="utf-8")  # closes the file
+        assert not content.startswith("#")
+        assert "SELECT ?x WHERE { ?x a ?type }" in content
+
+
+class TestNoExistingRq:
+    """Test 3: When no .rq exists, one is created with just SPARQL."""
+
+    def test_creates_rq_from_scratch(self, tmp_path):
+        ttl = _copy_fixture("sample.ttl", tmp_path)
+        rq_path = tmp_path / "sample.rq"
+        assert not rq_path.exists()
+
+        process_ttl_file(str(ttl))
+
+        assert rq_path.exists()
+        content = rq_path.read_text(encoding="utf-8")  # closes the file
+        assert "SELECT ?x WHERE { ?x a ?type }" in content
+        assert not content.startswith("#")
+
+
+class TestSparqlCorrectness:
+    """Test 4: Extracted SPARQL matches expected output (regression test)."""
+
+    def test_exact_sparql_extraction(self, tmp_path):
+        ttl = _copy_fixture("sample.ttl", tmp_path)
+
+        process_ttl_file(str(ttl))
+
+        content = (tmp_path / "sample.rq").read_text(encoding="utf-8")  # closes the file
+        assert content.strip() == "SELECT ?x WHERE { ?x a ?type }"
+
+
+class TestBlankLineSeparator:
+    """Test 5: Exactly one blank line separates header block from SPARQL."""
+
+    def test_single_blank_line_between_header_and_sparql(self, tmp_path):
+        ttl = _copy_fixture("sample.ttl", tmp_path)
+        _copy_fixture("sample_with_header.rq", tmp_path, "sample.rq")
+
+        process_ttl_file(str(ttl))
+
+        content = (tmp_path / "sample.rq").read_text(encoding="utf-8")
+        lines = content.split("\n")
+        # Index of the last '#' line, i.e. the final line of the header
+        header_end = -1
+        for idx, line in enumerate(lines):
+            if line.startswith("#"):
+                header_end = idx
+        assert header_end >= 0, "Expected a header block in the regenerated file"
+        # Line after last header should be blank, then SPARQL
+        assert lines[header_end + 1] == "", "Expected blank line after header"
+        assert lines[header_end + 2].startswith("SELECT"), "Expected SPARQL after blank line"
+
+
+class TestErrorGuard:
+    """Test 6: Empty TTL (no SPARQL query) does not overwrite existing .rq."""
+
+    def test_does_not_overwrite_on_empty_sparql(self, tmp_path):
+        ttl = _copy_fixture("sample_empty.ttl", tmp_path)
+        rq_path = tmp_path / "sample_empty.rq"
+        # Create a pre-existing .rq; written and read back with the same encoding.
+        original = "SELECT ?existing WHERE { ?existing a ?type }\n"
+        rq_path.write_text(original, encoding="utf-8")
+
+        process_ttl_file(str(ttl))
+
+        content = rq_path.read_text(encoding="utf-8")
+        assert "SELECT ?existing" in content, "Existing .rq should not be overwritten"
diff --git a/tests/test_headers.py b/tests/test_headers.py
new file mode 100644
index 0000000..66f2346
--- /dev/null
+++ b/tests/test_headers.py
@@ -0,0 +1,176 @@
+"""Validate the header block of every .rq file (title, category, description)."""
+
+import json
+import pathlib
+import re
+
+import pytest
+
+ROOT = pathlib.Path(__file__).resolve().parent.parent
+CATEGORIES_FILE = ROOT / "categories.json"
+
+EXCLUDED_DIRS = {".planning", ".git", ".github", "scripts", "tests"}
+
+
+def find_rq_files():
+    """Return the sorted .rq paths under ROOT that belong to the query collection.
+
+    Files whose top-level directory is in EXCLUDED_DIRS (tests/ and other
+    non-query directories) are skipped.
+    """
+    def excluded(path):
+        top = path.relative_to(ROOT).parts[:1]
+        return bool(top) and top[0] in EXCLUDED_DIRS
+    return [path for path in sorted(ROOT.rglob("*.rq")) if not excluded(path)]
+
+
+def parse_header(filepath):
+    """Extract header block from an .rq file.
+
+    The header block is the consecutive sequence of lines starting with '#'
+    at the top of the file, ending at the first blank line or non-comment line.
+    Returns the header lines as strings: line endings removed, leading '#' kept.
+    """
+    lines = []
+    with open(filepath, encoding="utf-8") as f:
+        for line in f:
+            stripped = line.rstrip("\n\r")
+            if stripped.startswith("#"):
+                lines.append(stripped)
+            else:
+                break
+    return lines
+
+
+def load_valid_categories():
+    """Read categories.json and return its category names as a set."""
+    with open(CATEGORIES_FILE, encoding="utf-8") as handle:
+        categories = json.load(handle)["categories"]
+    return set(categories.keys())
+
+
+# Collect files once at module level for parametrization
+_RQ_FILES = find_rq_files()
+_RQ_PARAMS = [
+ pytest.param(f, id=str(f.relative_to(ROOT))) for f in _RQ_FILES
+]
+
+
+@pytest.mark.parametrize("rq_file", _RQ_PARAMS)
+def test_all_rq_have_title(rq_file):
+    """The header block of each .rq file holds a non-empty '# title: ' field."""
+    header_lines = parse_header(rq_file)
+    # One matching header line is enough; later duplicates are not inspected.
+    has_title = any(re.match(r"^# title: .+", text) for text in header_lines)
+    assert has_title, (
+        f"Missing '# title:' header in {rq_file.relative_to(ROOT)}"
+    )
+
+
+@pytest.mark.parametrize("rq_file", _RQ_PARAMS)
+def test_all_rq_have_valid_category(rq_file):
+    """Each .rq header names a category, and every named one is in categories.json.
+
+    Surrounding whitespace of the value is ignored before the lookup.
+    """
+    header_lines = parse_header(rq_file)
+    allowed = load_valid_categories()
+    matches = (re.match(r"^# category: (.+)", text) for text in header_lines)
+    declared = [found.group(1).strip() for found in matches if found]
+    assert declared, (
+        f"Missing '# category:' header in {rq_file.relative_to(ROOT)}"
+    )
+    unknown = [name for name in declared if name not in allowed]
+    # Report the first offending value, as a per-item loop would.
+    assert not unknown, (
+        f"Invalid category '{unknown[0]}' in {rq_file.relative_to(ROOT)}. "
+        f"Valid categories: {sorted(allowed)}"
+    )
+
+
+@pytest.mark.parametrize("rq_file", _RQ_PARAMS)
+def test_all_rq_have_description(rq_file):
+    """The header block of each .rq file holds a non-empty '# description: ' field."""
+    found = False
+    for text in parse_header(rq_file):
+        found = found or bool(re.match(r"^# description: .+", text))
+    assert found, (
+        f"Missing '# description:' header in {rq_file.relative_to(ROOT)}"
+    )
+
+
+def test_titles_are_unique():
+    """No two '# title: ' values in the collection are equal after stripping.
+
+    On failure the message maps each repeated title to the files using it.
+    """
+    title_re = re.compile(r"^# title: (.+)")
+    files_by_title = {}
+    for rq_file in _RQ_FILES:
+        rel = str(rq_file.relative_to(ROOT))
+        for text in parse_header(rq_file):
+            found = title_re.match(text)
+            if found:
+                files_by_title.setdefault(found.group(1).strip(), []).append(rel)
+    duplicates = {
+        title: users for title, users in files_by_title.items() if len(users) > 1
+    }
+    assert not duplicates, (
+        f"Duplicate titles found: {duplicates}"
+    )
+
+
+def test_header_field_order():
+    """Where present, the first title, category and description lines come in that order.
+
+    Positions in failure messages are 0-based indices into the header block.
+    """
+    def first_index(prefix_re, header_lines):
+        """Index of the first line matching prefix_re, or None."""
+        hits = (i for i, text in enumerate(header_lines) if re.match(prefix_re, text))
+        return next(hits, None)
+
+    for rq_file in _RQ_FILES:
+        header = parse_header(rq_file)
+        title_idx = first_index(r"^# title: ", header)
+        cat_idx = first_index(r"^# category: ", header)
+        desc_idx = first_index(r"^# description: ", header)
+
+        # A pair is compared only when both of its fields occur in the header.
+        if None not in (title_idx, cat_idx):
+            assert title_idx < cat_idx, (
+                f"In {rq_file.relative_to(ROOT)}: title (line {title_idx}) "
+                f"must appear before category (line {cat_idx})"
+            )
+        if None not in (cat_idx, desc_idx):
+            assert cat_idx < desc_idx, (
+                f"In {rq_file.relative_to(ROOT)}: category (line {cat_idx}) "
+                f"must appear before description (line {desc_idx})"
+            )
+
+
+def test_blank_line_separator():
+    """A header carrying structured fields is followed by a blank line.
+
+    Files whose leading comment block has no title/category/description/
+    keywords/param field are not checked, nor is a file that ends with its header.
+    """
+    field_re = re.compile(r"^# (title|category|description|keywords|param): ")
+    for rq_file in _RQ_FILES:
+        if not any(field_re.match(text) for text in parse_header(rq_file)):
+            continue
+        with open(rq_file, encoding="utf-8") as handle:
+            lines = handle.read().split("\n")
+        # Index of the first line that is not a '#' line, i.e. just past the header.
+        header_end = next(
+            (i for i, text in enumerate(lines) if not text.startswith("#")),
+            len(lines),
+        )
+        # Nothing follows the header, so there is no line to inspect.
+        if header_end >= len(lines):
+            continue
+        # Whitespace-only counts as blank.
+        assert lines[header_end].strip() == "", (
+            f"In {rq_file.relative_to(ROOT)}: expected blank line after "
+            f"header block at line {header_end + 1}, got: '{lines[header_end]}'"
+        )