From aac58c90466156cdef509666a22da821bb9accc6 Mon Sep 17 00:00:00 2001 From: Carsten Milling Date: Tue, 24 Mar 2026 21:31:25 +0100 Subject: [PATCH 1/2] Add support for dracorCorpus dracorCorpus is replacing teiCorpus in our corpus.xml files, but we still need to support it until the next major API version. see dracor-org/dracor-schema#167 --- api.yaml | 13 +-- jobs/process-webhook-delivery.xq | 6 +- modules/api.xqm | 135 +++++++--------------- modules/dts.xqm | 17 ++- modules/load.xqm | 8 +- modules/metrics.xqm | 3 +- modules/util.xqm | 191 ++++++++++++++++++++++--------- modules/webhook.xqm | 7 +- 8 files changed, 214 insertions(+), 166 deletions(-) diff --git a/api.yaml b/api.yaml index 08d5d560..72a66423 100644 --- a/api.yaml +++ b/api.yaml @@ -140,13 +140,10 @@ paths: The meta data for the new corpus can be provided in either JSON or XML format. The JSON structure is a straightforward object providing corpus name, title and (optionally) a repository URL. The XML format - needs to be a TEI document with `teiCorpus` as its root element. The + needs to be a TEI document with `dracorCorpus` as its root element. The corpus title needs to be provided in the `titleStmt` while the name and repo URL are encoded in particular `idno` elements in the `publicationStmt` (see example). - - NB: Contrary to the TEI schema our teiCorpus document must not contain - the `TEI` elements for individual plays. content: application/json: schema: @@ -183,7 +180,7 @@ paths: type: string example: | - + @@ -191,12 +188,12 @@ paths: DraCor - test - https://github.com/dracor-org/testdracor + test + - + responses: '200': description: Returns corpus metadata diff --git a/jobs/process-webhook-delivery.xq b/jobs/process-webhook-delivery.xq index a16db7f9..781a7e92 100644 --- a/jobs/process-webhook-delivery.xq +++ b/jobs/process-webhook-delivery.xq @@ -150,8 +150,10 @@ declare function local:process-delivery () { /delivery[@id = $local:delivery and not(@processed)] let $repo := $delivery/@repo/string() let $after := $delivery/@after/string() - let $corpus := collection($config:corpora-root)//tei:teiCorpus[ - tei:teiHeader//tei:publicationStmt/tei:idno[@type="repo" and . = $repo] + (: DEPRECATED: remove teiCorpus support in v2 :) + let $corpus := collection($config:corpora-root)/(tei:dracorCorpus|tei:teiCorpus)[ + tei:teiHeader//tei:publicationStmt/tei:ref[@type="repo" and @target = $repo] + or tei:teiHeader//tei:publicationStmt/tei:idno[@type="repo" and . = $repo] ] let $info := dutil:get-corpus-info($corpus) diff --git a/modules/api.xqm b/modules/api.xqm index 845286f9..c5501d5c 100644 --- a/modules/api.xqm +++ b/modules/api.xqm @@ -238,7 +238,8 @@ declare %output:method("json") function api:corpora($include) { array { - for $corpus in collection($config:corpora-root)//tei:teiCorpus + (: DEPRECATED: remove teiCorpus support in v2 :) + for $corpus in collection($config:corpora-root)/(tei:dracorCorpus|tei:teiCorpus) let $info := dutil:get-corpus-info($corpus) let $name := $info?name order by $name @@ -256,11 +257,8 @@ function api:corpora($include) { (:~ : Add new corpus : - : @param $data corpus.xml containing teiCorpus element. - : @result XML document - : - : FIXME: create utility function that can be used both here and in - : api:corpora-post-json() below. + : @param $data corpus.xml containing dracorCorpus element. + : @result JSON :) declare %rest:POST("{$data}") @@ -269,69 +267,40 @@ declare %rest:consumes("application/xml", "text/xml") %rest:produces("application/json") %output:method("json") -function api:corpora-post-tei($data, $auth) { +function api:corpora-post-tei($data as document-node(), $auth) { if (not($auth)) then ( - - - , - map { - "message": "authorization required" - } + , + map { "message": "authorization required" } ) - else - - let $header := if ($data) then $data//tei:teiCorpus/tei:teiHeader else () - let $name := $header//tei:publicationStmt/tei:idno[ - @type = "URI" and @xml:base = "https://dracor.org/" - ]/text() - - let $title := $header//tei:titleStmt/tei:title[1]/text() - - return if (not($header)) then + else try { + dutil:create-corpus-from-xml($data/*) + } catch dutil:invalid-corpus-document { ( - - - , - map { - "error": "invalid document, expecting " - } + , + map { "error": "Invalid corpus document. " || $err:description } ) - else if (not($name) or not($title)) then + } catch dutil:invalid-corpus-name { ( - - - , - map { - "error": "missing name or title" - } + , + map { "error": $err:description } ) - else if (not(matches($name, '^[-a-z0-1]+$'))) then + } catch dutil:corpus-exists { ( - - - , - map { - "error": "invalid name", - "message": "Only lower case ASCII letters and digits are accepted." - } + , + map { "error": $err:description } ) - else - let $corpus := dutil:get-corpus($name) - return if ($corpus) then ( - - - , - map { - "error": "corpus already exists" - } - ) else ( - dutil:create-corpus($name, $data/tei:teiCorpus), + } catch * { + ( + , map { - "name": $name, - "title": $title + "error": $err:description, + "module": $err:module, + "line": $err:line-number, + "code": $err:code } ) + } }; (:~ @@ -339,9 +308,6 @@ function api:corpora-post-tei($data, $auth) { : : @param $data JSON object describing corpus meta data : @result JSON object - : - : FIXME: create utility function that can be used both here and in - : api:corpora-post-tei() above. :) declare %rest:POST("{$data}") @@ -351,43 +317,30 @@ declare %output:media-type("application/json") %output:method("json") function api:corpora-post-json($data) { - let $json := parse-json(util:base64-decode($data)) - let $name := $json?name - let $description := $json?description - let $corpus := dutil:get-corpus($name) - - return if ($corpus) then + if (not($auth)) then ( - - - , - map { - "error": "corpus already exists" - } + , + map { "message": "authorization required" } ) - else if (not($name) or not($json?title)) then + else try { + let $json := parse-json(util:base64-decode($data)) + return dutil:create-corpus($json) + } catch dutil:invalid-corpus-name { ( - - - , - map { - "error": "missing name or title" - } + , + map { "error": $err:description } ) - else if (not(matches($name, '^[-a-z0-1]+$'))) then + } catch dutil:corpus-exists { ( - - - , - map { - "error": "invalid name", - "message": "Only lower case ASCII letters and digits are accepted." - } + , + map { "error": $err:description } ) - else ( - dutil:create-corpus($json), - $json - ) + } catch * { + ( + , + map { "error": $err:description } + ) + } }; (:~ diff --git a/modules/dts.xqm b/modules/dts.xqm index 0ac9de6f..c80e2744 100644 --- a/modules/dts.xqm +++ b/modules/dts.xqm @@ -219,7 +219,8 @@ as item()+ { "Paging is not possible on a single resource. Try without parameter 'page'!" ) - else if ($corpus/name() eq "teiCorpus") then + (: DEPRECATED: remove teiCorpus support in v2 :) + else if ($corpus/name() = ("dracorCorpus", "teiCorpus")) then if ( $nav eq 'parents') then local:corpus-to-collection-with-parent-as-member($id) else @@ -238,7 +239,8 @@ as item()+ { let $corpusname := local:uri-to-id($id) let $corpus := dutil:get-corpus($corpusname) return - if ($corpus/name() eq "teiCorpus") then + (: DEPRECATED: remove teiCorpus support in v2 :) + if ($corpus/name() = ("dracorCorpus", "teiCorpus")) then if ( $page ) then (: paging is currently not supported :) (: test: http://localhost:8088/api/v1/dts/collection?id=http://localhost:8088/id/rus&page=1 :) @@ -368,10 +370,13 @@ as item()+ { :) declare function local:root-collection() as map() { - (: Get the corpora, get info needed for the member-array :) - let $corpora := collection($config:corpora-root)//tei:teiCorpus - (: get all the ids – these has to evaluate the teiCorpus files, unfortunately :) - let $corpus-ids := $corpora//tei:idno[@type eq "URI"][@xml:base eq "https://dracor.org/"]/string() + (: Get the corpora, get info needed for the member-array :) + (: DEPRECATED: remove teiCorpus support in v2 :) + let $corpora := collection($config:corpora-root)//(tei:dracorCorpus|tei:teiCorpus) + (: get all the ids – these has to evaluate the dracorCorpus files, unfortunately :) + let $corpus-ids := $corpora//tei:publicationStmt/tei:idno[ + not(@type) or (@type eq "URI" and @xml:base eq "https://dracor.org/") + ]/string() let $members := array { for $corpus-id in $corpus-ids return local:collection-member-by-id($corpus-id) diff --git a/modules/load.xqm b/modules/load.xqm index e2214d7b..b343c08b 100644 --- a/modules/load.xqm +++ b/modules/load.xqm @@ -72,11 +72,13 @@ declare function local:record-corpus-sha($name) { (:~ : Load corpus from ZIP archive : - : @param $corpus The element providing corpus name and archive URL + : @param $corpus The element providing corpus name and archive URL : @return List of created collections and files + : + : NB: until we remove support for it in v2 $corpus can also be a + : element :) -declare function load:load-corpus($corpus as element(tei:teiCorpus)) -as xs:string* { +declare function load:load-corpus($corpus as element()) as xs:string* { let $info := dutil:get-corpus-info($corpus) let $name := $info?name diff --git a/modules/metrics.xqm b/modules/metrics.xqm index 4e92a5a1..bbacaad0 100644 --- a/modules/metrics.xqm +++ b/modules/metrics.xqm @@ -77,7 +77,8 @@ declare function metrics:collect-sitelinks($corpus as xs:string) { : sitelinks collection :) declare function metrics:collect-sitelinks() { - for $corpus in collection($config:corpora-root)//tei:teiCorpus + (: DEPRECATED: remove teiCorpus support in v2 :) + for $corpus in collection($config:corpora-root)/(tei:dracorCorpus|tei:teiCorpus) let $info := dutil:get-corpus-info($corpus) return metrics:collect-sitelinks($info?name) }; diff --git a/modules/util.xqm b/modules/util.xqm index cde4f4f2..793cb17a 100644 --- a/modules/util.xqm +++ b/modules/util.xqm @@ -473,19 +473,20 @@ declare function dutil:count-sitelinks( }; (:~ - : Get teiCorpus element for corpus identified by $corpusname. + : Get dracorCorpus element for corpus identified by $corpusname. : : @param $corpusname - : @return teiCorpus element + : @return dracorCorpus element :) declare function dutil:get-corpus( $corpusname as xs:string ) as element()* { - collection($config:corpora-root)//tei:teiCorpus[ + (: DEPRECATED: remove teiCorpus support in v2 :) + collection($config:corpora-root)/(tei:dracorCorpus|tei:teiCorpus)[ tei:teiHeader//tei:publicationStmt/tei:idno[ - @type="URI" and - @xml:base="https://dracor.org/" and - . = $corpusname + (not(@type) or + (@type="URI" and @xml:base="https://dracor.org/") + ) and . = $corpusname ] ] }; @@ -514,15 +515,17 @@ declare function local:markdown($input as element()) as item()* { : @return map :) declare function dutil:get-corpus-info( - $corpus as element(tei:teiCorpus)* + $corpus as element()* ) as map(*)* { - let $header := $corpus/tei:teiHeader + let $header := $corpus[1]/tei:teiHeader let $name := $header//tei:publicationStmt/tei:idno[ - @type="URI" and @xml:base="https://dracor.org/" - ][1]/string() + not(@type) or (@type="URI" and @xml:base="https://dracor.org/") + ][1]/text() let $title := $header/tei:fileDesc/tei:titleStmt/tei:title[1]/string() let $acronym := $header/tei:fileDesc/tei:titleStmt/tei:title[@type="acronym"][1]/string() - let $repo := $header//tei:publicationStmt/tei:idno[@type="repo"][1]/string() + let $repo := $header//tei:publicationStmt/( + tei:idno[@type="repo"][1]|tei:ref[@type="repo"]/@target + )/string() let $projectDesc := $header/tei:encodingDesc/tei:projectDesc let $licence := $header//tei:publicationStmt/tei:availability/tei:licence let $description := if ($projectDesc) then ( @@ -1376,61 +1379,143 @@ declare function dutil:id-to-url ( else () }; +declare function local:create-corpus( + $name as xs:string, + $xml as element(tei:dracorCorpus) +) { + util:log-system-out("creating corpus"), + util:log-system-out($xml), + xmldb:store( + xmldb:create-collection($config:corpora-root, $name), + "corpus.xml", + $xml + ) +}; + (:~ - : Create new corpus collection + : Create new corpus collection from JSON : : @param $corpus Map with corpus description :) declare function dutil:create-corpus($corpus as map()) { - let $xml := - - - - - {$corpus?title} - - - {$corpus?name} - { - if ($corpus?repository) - then {$corpus?repository} - else () - } - - - {if ($corpus?description) then ( - - + let $name := $corpus?name + + return if (not($name) or not($corpus?title)) then + error ( + xs:QName('dutil:invalid-corpus-document'), + "Missing corpus name or title" + ) + else if (not(matches($name, '^[a-z]+$'))) then + error ( + xs:QName('dutil:invalid-corpus-name'), + "Invalid name '" || $name + || "'. Only lower case ASCII letters are accepted" + ) + else ( + let $exists := dutil:get-corpus($name) + return if ($exists) then + error ( + xs:QName('dutil:corpus-exists'), + "Corpus with name '" || $name || "' already exists" + ) + else + let $xml := + + + + + {$corpus?title} + + + {$corpus?name} { - for $p in tokenize($corpus?description, " ") - return

{$p}

+ if ($corpus?repository) + then + else () } -
-
- ) else ()} -
-
- - return dutil:create-corpus($corpus?name, $xml) + + + {if ($corpus?description) then ( + + + { + for $p in tokenize($corpus?description, " ") + return

{$p}

+ } +
+
+ ) else ()} + +
+ return ( + local:create-corpus($name, $xml), + $corpus + ) + ) }; (:~ - : Create new corpus collection + : Create new corpus collection from dracorCorpus document : - : @param $name Corpus name - : @param $xml Corpus description + : This function does not rely on the corpus document to be valid against the + : DraCor schema. In fact, it works with legacy teiCorpus documents from which + : it only extracts the teiHeader element and wraps it into dracorCorpus. + : + : @param $xml dracorCorpus document :) -declare function dutil:create-corpus( - $name as xs:string, - $xml as element(tei:teiCorpus) -) { - util:log-system-out("creating corpus"), - util:log-system-out($xml), - xmldb:store( - xmldb:create-collection($config:corpora-root, $name), - "corpus.xml", - $xml +declare function dutil:create-corpus-from-xml($xml as element()) { + let $corpus := element {QName('http://www.tei-c.org/ns/1.0', 'dracorCorpus')} { + $xml/tei:teiHeader + } + + let $header := $corpus/tei:teiHeader[1] + (: A document might (erroneously) have multiple idnos from which we extract + the distinct values. Below we will throw an error if there is more than + one unique name. :) + let $name := distinct-values( + $header//tei:publicationStmt/tei:idno[ + not(@type) + or (: DEPRECATED: drop support for in v2 :) + @type = "URI" and @xml:base = "https://dracor.org/" + ] ) + let $title := $header//tei:titleStmt/tei:title[1]/text() + + return if (not($header)) then + error ( + xs:QName('dutil:invalid-corpus-document'), + "Missing teiHeader element" + ) + else if (count($name) > 1) then + error ( + xs:QName('dutil:invalid-corpus-document'), + "Multiple corpus names found" + ) + else if (not($name) or not($title)) then + error ( + xs:QName('dutil:invalid-corpus-document'), + "Missing corpus name or title" + ) + else if (not(matches($name, '^[a-z]+$'))) then + error ( + xs:QName('dutil:invalid-corpus-name'), + "Invalid name '" || $name + || "'. Only lower case ASCII letters are accepted" + ) + else + let $exists := dutil:get-corpus($name) + return if ($exists) then ( + error ( + xs:QName('dutil:corpus-exists'), + "Corpus with name '" || $name || "' already exists" + ) + ) else ( + local:create-corpus($name, $corpus), + map { + "name": $name, + "title": $title + } + ) }; (:~ diff --git a/modules/webhook.xqm b/modules/webhook.xqm index 5aac7422..d9e1729c 100644 --- a/modules/webhook.xqm +++ b/modules/webhook.xqm @@ -29,8 +29,11 @@ declare function local:check-signature ( }; declare function local:get-corpus ($repo-url as xs:string) as element()? { - collection($config:corpora-root)//tei:teiCorpus[ - tei:teiHeader//tei:publicationStmt/tei:idno[@type="repo" and . = $repo-url] + collection($config:corpora-root)/(tei:dracorCorpus|tei:teiCorpus)[ + tei:teiHeader//tei:publicationStmt/( + tei:ref[@type="repo" and @target = $repo-url] | + tei:idno[@type="repo" and . = $repo-url] + ) ] }; From 1e8f1296296709d597697f6d568532742418ebf0 Mon Sep 17 00:00:00 2001 From: Carsten Milling Date: Wed, 25 Mar 2026 10:33:12 +0100 Subject: [PATCH 2/2] Fix problem of vanishing corpus name in load-corpus() loading stopped with "recreating null" --- modules/util.xqm | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/modules/util.xqm b/modules/util.xqm index 793cb17a..1571eece 100644 --- a/modules/util.xqm +++ b/modules/util.xqm @@ -514,13 +514,15 @@ declare function local:markdown($input as element()) as item()* { : @param $corpusname : @return map :) -declare function dutil:get-corpus-info( - $corpus as element()* -) as map(*)* { +declare function dutil:get-corpus-info($corpus as element()*) as map(*)* { + (: IMPORTANT: do not assign any text() nodes to variables emitted with the + returned map. These will vanish when the corpus.xml is deleted and thus + break load:load-corpus(). (the "recreating null" problem) + :) let $header := $corpus[1]/tei:teiHeader let $name := $header//tei:publicationStmt/tei:idno[ not(@type) or (@type="URI" and @xml:base="https://dracor.org/") - ][1]/text() + ][1]/string() let $title := $header/tei:fileDesc/tei:titleStmt/tei:title[1]/string() let $acronym := $header/tei:fileDesc/tei:titleStmt/tei:title[@type="acronym"][1]/string() let $repo := $header//tei:publicationStmt/( @@ -533,7 +535,7 @@ declare function dutil:get-corpus-info( return string-join($paras, " ") ) else () let $git-file := $config:corpora-root || "/" || $name || "/git.xml" - let $sha := doc($git-file)/git/sha/text() + let $sha := doc($git-file)/git/sha/string() return if ($header) then ( map:merge((