From efbf24fa7f321444692af94268f959ddde8d17ab Mon Sep 17 00:00:00 2001 From: raphaelgazzotti Date: Mon, 16 Jun 2025 13:48:33 +0200 Subject: [PATCH 1/5] First draft of OpenMINDS schemas translation into OWL format. --- .github/workflows/build.yml | 46 ++++++++++++ build.py | 24 ++++++ pipeline/__init__.py | 0 pipeline/translator.py | 141 ++++++++++++++++++++++++++++++++++++ pipeline/utils.py | 29 ++++++++ requirements.txt | 2 + 6 files changed, 242 insertions(+) create mode 100644 .github/workflows/build.yml create mode 100644 build.py create mode 100644 pipeline/__init__.py create mode 100644 pipeline/translator.py create mode 100644 pipeline/utils.py create mode 100644 requirements.txt diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000..cff4c6f2 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,46 @@ +name: openMINDS_OWL_build_pipeline + +on: + push: + branches: + - pipeline + workflow_dispatch: # This triggers the workflow when a webhook is received + +jobs: + build: + runs-on: ubuntu-latest + steps: + + - name: Checkout Repository + uses: actions/checkout@v3 + + - name: Set up Python 3.11 + uses: actions/setup-python@v2 + with: + python-version: 3.11 + + - name: Run build + run: | + mu install -r requirements.txt + python build.py + + - name: Checkout main branch + uses: actions/checkout@v3 + with: + ref: main + path: main + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Push to main + run: | + cp -R target/* main + cd main + git config --global user.email "support@openmetadatainitiative.org" + git config --global user.name "openMINDS pipeline" + if [[ $(git add . --dry-run | wc -l) -gt 0 ]]; then + git add . + git commit -m "build triggered by ${{ github.event_name }}" + git push -f + else + echo "Nothing to commit" + fi diff --git a/build.py b/build.py new file mode 100644 index 00000000..35fd1ba4 --- /dev/null +++ b/build.py @@ -0,0 +1,24 @@ +import os.path +import shutil + +from pipeline.translator import OWLSchemaBuilder +from pipeline.utils import clone_sources, SchemaLoader + +print("***************************************") +print(f"Triggering the generation of OWL for openMINDS") +print("***************************************") + +# Step 1 - clone central repository in main branch to get the latest sources +clone_sources() +schema_loader = SchemaLoader() +if os.path.exists("target"): + shutil.rmtree("target") + +for schema_version in schema_loader.get_schema_versions(): + + # Step 2 - find all involved schemas for the current version + schemas_file_paths = schema_loader.find_schemas(schema_version) + + for schema_file_path in schemas_file_paths: + # Step 3 - translate and build each openMINDS schema as JSON-Schema + OWLSchemaBuilder(schema_file_path, schema_loader.schemas_sources).build() diff --git a/pipeline/__init__.py b/pipeline/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pipeline/translator.py b/pipeline/translator.py new file mode 100644 index 00000000..cb0d60bc --- /dev/null +++ b/pipeline/translator.py @@ -0,0 +1,141 @@ +import os +import os.path + +from pipeline.utils import load_json + +from rdflib import Graph, Namespace, Literal, URIRef, BNode +from rdflib.collection import Collection +from rdflib.namespace import FOAF, RDF, RDFS, OWL, XSD +from typing import List, Dict + +class OWLSchemaBuilder(object): + def __init__(self, schema_file_path:str, root_path:str): + _relative_path_without_extension = schema_file_path[len(root_path)+1:].replace(".schema.omi.json", "").split("/") + self.version = _relative_path_without_extension[0] + self.relative_path_without_extension = _relative_path_without_extension[1:] + self.graph = Graph() + self.properties_file = load_json(os.path.join(os.path.realpath("."), "sources", "vocab", "properties.json")) + self._schema_payload = load_json(schema_file_path) + self.class_uri = URIRef(self._schema_payload["_type"]) + + def _target_file_without_extension(self) -> str: + return os.path.join(self.version, "/".join(self.relative_path_without_extension)) + + def _restriction_mutiple_range(self, prop_uri:URIRef, prop_range:List, prop_spec:Dict, required:bool): + restriction = BNode() + self.graph.add((self.class_uri, RDFS.subClassOf, restriction)) + self.graph.add((restriction, RDF.type, OWL.Restriction)) + self.graph.add((restriction, OWL.onProperty, prop_uri)) + if len(prop_range) == 1: + self.graph.add((restriction, OWL.allValuesFrom, URIRef(prop_range[0]))) + else: + all_values_from = BNode() + self.graph.add((restriction, OWL.allValuesFrom, all_values_from)) + union_list_node = BNode() + Collection(self.graph, union_list_node, [URIRef(embedded_type) for embedded_type in prop_range]) + self.graph.add((all_values_from, RDF.type, OWL.Class)) + self.graph.add((all_values_from, OWL.unionOf, union_list_node)) + if ('type' in prop_spec and prop_spec['type'] != 'array') or ('type' not in prop_spec): + self.graph.add((restriction, OWL.maxCardinality, Literal(1, datatype=XSD.nonNegativeInteger))) + #elif ('type' in prop_spec and prop_spec['type'] != 'array') and (len(linked_types := prop_spec.get('_linkedTypes') or []) == 1 or len(embedded_types := prop_spec.get('_embeddedTypes') or []) == 1): + # self.graph.add((prop_uri, RDFS.range, URIRef((linked_types or embedded_types)[0]))) + + if required: + self.graph.add((self.class_uri, RDFS.subClassOf, restriction)) + self.graph.add((restriction, RDF.type, OWL.Restriction)) + self.graph.add((restriction, OWL.onProperty, URIRef(prop_uri))) + self.graph.add((restriction, OWL.minCardinality, Literal(1, datatype=XSD.nonNegativeInteger))) + + def _translate_property_specifications(self, prop_uri:URIRef, prop_spec:Dict, required:bool): + prop_uri = URIRef(prop_uri) + if '_linkedTypes' in prop_spec: + self.graph.add((prop_uri, RDF.type, OWL.ObjectProperty)) + self._restriction_mutiple_range(prop_uri, prop_spec['_linkedTypes'], prop_spec, required) + + elif '_embeddedTypes' in prop_spec: + self.graph.add((prop_uri, RDF.type, OWL.ObjectProperty)) + self._restriction_mutiple_range(prop_uri, prop_spec['_embeddedTypes'], prop_spec, required) + + elif 'type' in prop_spec and prop_spec['type'] in ['string', 'number', 'array']: + self.graph.add((prop_uri, RDF.type, OWL.DatatypeProperty)) + if prop_spec['type'] == 'string': + # TODO include list of _formats and other datatypes + if '_formats' in prop_spec and 'date' in prop_spec['_formats'] and len(prop_spec['_formats']) == 1: + self.graph.add((prop_uri, RDFS.range, XSD.date)) + else: # IRI not represented in OWL as datatype (xsd:anyURI not suitable) + self.graph.add((prop_uri, RDFS.range, XSD.string)) + elif prop_spec['type'] == 'number': + self.graph.add((prop_uri, RDFS.range, XSD.decimal)) + elif prop_spec['type'] == 'array': + self.graph.add((prop_uri, RDFS.range, RDF.List)) + restriction = BNode() + self.graph.add((self.class_uri, RDFS.subClassOf, restriction)) + self.graph.add((restriction, RDF.type, OWL.Restriction)) + self.graph.add((restriction, OWL.onProperty, prop_uri)) + self.graph.add((restriction, OWL.maxCardinality, Literal(1, datatype=XSD.nonNegativeInteger))) + + if required: + restriction = BNode() + self.graph.add((self.class_uri, RDFS.subClassOf, restriction)) + self.graph.add((restriction, RDF.type, OWL.Restriction)) + self.graph.add((restriction, OWL.onProperty, URIRef(prop_uri))) + self.graph.add((restriction, OWL.minCardinality, Literal(1, datatype=XSD.nonNegativeInteger))) + pass + + self.graph.add((prop_uri, RDFS.label, Literal(prop_spec['label']))) + if 'description' in prop_spec: + self.graph.add((prop_uri, RDFS.comment, Literal(prop_spec['description']))) + #self.graph.add((prop_uri, FOAF.name, Literal(prop_spec['name']))) + + return + + def translate(self): + self.graph.add((self.class_uri, RDF.type, OWL.Class)) + self.graph.add((self.class_uri, RDFS.label, Literal(self._schema_payload['label']))) + if 'description' in self._schema_payload: + self.graph.add((self.class_uri, RDFS.comment, Literal(self._schema_payload['description']))) + #self.graph.add((self.class_uri, FOAF.name, Literal(self._schema_payload['name']))) + + if "properties" in self._schema_payload and self._schema_payload["properties"]: + for prop_uri, prop_spec in self._schema_payload['properties'].items(): + required = False + if "required" in self._schema_payload and self._schema_payload["required"]: + if prop_uri in self._schema_payload['required']: + required=True + self._translate_property_specifications(prop_uri, prop_spec, required) + + #self.graph.add((URIRef(prop_uri), RDFS.domain, OWL.Thing)) + if prop_uri.split('/')[-1] in self.properties_file: + # property rdfs:domain + if len(self.properties_file[prop_uri.split('/')[-1]]['usedIn'][self.version]) > 1: + property_domain = BNode() + self.graph.add((URIRef(prop_uri), RDFS.domain, property_domain)) + union_list_node = BNode() + Collection(self.graph, union_list_node, [URIRef(domain_type) for domain_type in self.properties_file[prop_uri.split('/')[-1]]['usedIn'][self.version]]) + self.graph.add((property_domain, RDF.type, OWL.Class)) + self.graph.add((property_domain, OWL.unionOf, union_list_node)) + else: + self.graph.add((URIRef(prop_uri), RDFS.domain, URIRef(self.properties_file[prop_uri.split('/')[-1]]['usedIn'][self.version][0]))) + # property rdfs:range + if 'asEdge' in self.properties_file[prop_uri.split('/')[-1]] and len(self.properties_file[prop_uri.split('/')[-1]]['asEdge']['canPointTo'].get(self.version, [])) > 1: + property_range = BNode() + self.graph.add((URIRef(prop_uri), RDFS.range, property_range)) + union_list_node = BNode() + Collection(self.graph, union_list_node, [URIRef(range_type) for range_type in self.properties_file[prop_uri.split('/')[-1]]['asEdge']['canPointTo'][self.version]]) + self.graph.add((property_range, RDF.type, OWL.Class)) + self.graph.add((property_range, OWL.unionOf, union_list_node)) + elif 'asEdge' in self.properties_file[prop_uri.split('/')[-1]] and self.properties_file[prop_uri.split('/')[-1]]['asEdge']['canPointTo'].get(self.version): + self.graph.add((URIRef(prop_uri), RDFS.range, URIRef(self.properties_file[prop_uri.split('/')[-1]]['asEdge']['canPointTo'][self.version][0]))) + return + + def build(self): + target_file = os.path.join("target", "schemas", "Turtle", f"{self._target_file_without_extension()}.ttl") + os.makedirs(os.path.dirname(target_file), exist_ok=True) + self.translate() + self.graph.serialize(destination=target_file, format="ttl") + target_file = os.path.join("target", "schemas", "RDF-XML", f"{self._target_file_without_extension()}.xml") + os.makedirs(os.path.dirname(target_file), exist_ok=True) + self.graph.serialize(destination=target_file, format="xml") + target_file = os.path.join("target", "schemas", "JSON-LD", f"{self._target_file_without_extension()}.jsonld") + os.makedirs(os.path.dirname(target_file), exist_ok=True) + self.graph.serialize(destination=target_file, format="json-ld") diff --git a/pipeline/utils.py b/pipeline/utils.py new file mode 100644 index 00000000..741aa7d7 --- /dev/null +++ b/pipeline/utils.py @@ -0,0 +1,29 @@ +import glob +import os +import json +import shutil +from typing import List + +from git import Repo, GitCommandError + +def clone_sources(): + if os.path.exists("sources"): + shutil.rmtree("sources") + Repo.clone_from("https://github.com/openMetadataInitiative/openMINDS.git", to_path="sources", depth=1) + +def load_json(path): + with open(path) as f: + json_file = json.load(f) + return json_file + +class SchemaLoader(object): + + def __init__(self): + self._root_directory = os.path.realpath(".") + self.schemas_sources = os.path.join(self._root_directory, "sources", "schemas") + + def get_schema_versions(self) -> List[str]: + return os.listdir(self.schemas_sources) + + def find_schemas(self, version:str) -> List[str]: + return glob.glob(os.path.join(self.schemas_sources, version, f'**/*.schema.omi.json'), recursive=True) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..72bc6020 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +gitpython +rdflib From 079aecd97bb28d777ce84528fd8a6bb11766cac4 Mon Sep 17 00:00:00 2001 From: raphaelgazzotti Date: Mon, 16 Jun 2025 14:29:02 +0200 Subject: [PATCH 2/5] Update build.yml with uv. --- .github/workflows/build.yml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index cff4c6f2..4b92f40e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -12,16 +12,19 @@ jobs: steps: - name: Checkout Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 - - name: Set up Python 3.11 - uses: actions/setup-python@v2 + - name: Set up Python + uses: actions/setup-python@v5 with: - python-version: 3.11 + python-version: 3.13 - name: Run build run: | - mu install -r requirements.txt + uv pip install --system -r openMINDS_actions/requirements.txt python build.py - name: Checkout main branch From c3032c183ed98af6ff4cc917f2b66e37b5fb5874 Mon Sep 17 00:00:00 2001 From: raphaelgazzotti Date: Mon, 16 Jun 2025 14:31:34 +0200 Subject: [PATCH 3/5] Update build.yml with uv. --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4b92f40e..9d3adc41 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -24,7 +24,7 @@ jobs: - name: Run build run: | - uv pip install --system -r openMINDS_actions/requirements.txt + uv pip install --system -r requirements.txt python build.py - name: Checkout main branch From a8ab351b8b714cbca69f63a8b23f65f8c9db9cd0 Mon Sep 17 00:00:00 2001 From: Raphael-Gazzotti <125291580+Raphael-Gazzotti@users.noreply.github.com> Date: Tue, 17 Jun 2025 17:57:17 +0200 Subject: [PATCH 4/5] Modify extension for turtle files. --- pipeline/translator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/translator.py b/pipeline/translator.py index cb0d60bc..587c12a7 100644 --- a/pipeline/translator.py +++ b/pipeline/translator.py @@ -129,7 +129,7 @@ def translate(self): return def build(self): - target_file = os.path.join("target", "schemas", "Turtle", f"{self._target_file_without_extension()}.ttl") + target_file = os.path.join("target", "schemas", "Turtle", f"{self._target_file_without_extension()}.owl") os.makedirs(os.path.dirname(target_file), exist_ok=True) self.translate() self.graph.serialize(destination=target_file, format="ttl") From 11a18c05290fafccfc16b192f0dd88565e1ccb87 Mon Sep 17 00:00:00 2001 From: raphaelgazzotti Date: Tue, 19 May 2026 10:50:37 +0200 Subject: [PATCH 5/5] Improve build logs and add rsync --- .github/workflows/build.yml | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9d3adc41..711047b2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -5,6 +5,15 @@ on: branches: - pipeline workflow_dispatch: # This triggers the workflow when a webhook is received + inputs: + branch: + description: 'The branch of the submodule the workflow was triggered for' + required: true + type: string + repository: + description: 'The repository of the submodule the workflow was triggered for' + required: true + type: string jobs: build: @@ -36,13 +45,17 @@ jobs: - name: Push to main run: | - cp -R target/* main + rsync -a --delete --exclude '.git/' --exclude 'img/' --exclude 'README.md' --exclude 'LICENSE' target/ main/ cd main git config --global user.email "support@openmetadatainitiative.org" git config --global user.name "openMINDS pipeline" - if [[ $(git add . --dry-run | wc -l) -gt 0 ]]; then - git add . - git commit -m "build triggered by ${{ github.event_name }}" + if [[ $(git add -A --dry-run | wc -l) -gt 0 ]]; then + git add -A + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + git commit -m "build triggered by submodule ${{ inputs.repository }} version ${{ inputs.branch }}" + else + git commit -m "build triggered by ${{ github.event_name }}" + fi git push -f else echo "Nothing to commit"