From 1ab38bb559b2aaa366cffe881e630610f8cf3510 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 15 Aug 2025 13:16:10 -0400 Subject: [PATCH 01/10] base managed executor --- .../iq/dataverse/api/MakeDataCountApi.java | 178 +++++++++++------- 1 file changed, 107 insertions(+), 71 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java b/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java index 562fd7fcb81..72788f1d8e7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java @@ -27,7 +27,10 @@ import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; + +import jakarta.annotation.Resource; import jakarta.ejb.EJB; +import jakarta.enterprise.concurrent.ManagedExecutorService; import jakarta.json.Json; import jakarta.json.JsonArray; import jakarta.json.JsonArrayBuilder; @@ -62,6 +65,10 @@ public class MakeDataCountApi extends AbstractApiBean { @EJB SystemConfig systemConfig; + // Inject the managed executor service provided by the container + @Resource(name = "concurrent/CitationUpdateExecutor") + private ManagedExecutorService executorService; + /** * TODO: For each dataset, send the following: * @@ -141,89 +148,118 @@ public Response addUsageMetricsFromSushiReportAll(@QueryParam("reportOnDisk") St @POST @Path("{id}/updateCitationsForDataset") - public Response updateCitationsForDataset(@PathParam("id") String id) throws IOException { + public Response updateCitationsForDataset(@PathParam("id") String id) { try { - Dataset dataset = findDatasetOrDie(id); - GlobalId pid = dataset.getGlobalId(); - PidProvider pidProvider = PidUtil.getPidProvider(pid.getProviderId()); + // First validate that the dataset exists and has a valid DOI + final Dataset dataset = findDatasetOrDie(id); + final GlobalId pid = dataset.getGlobalId(); + final PidProvider pidProvider = PidUtil.getPidProvider(pid.getProviderId()); + // 
Only supported for DOIs and for DataCite DOI providers if(!DataCiteDOIProvider.TYPE.equals(pidProvider.getProviderType())) { return error(Status.BAD_REQUEST, "Only DataCite DOI providers are supported"); } - String persistentId = pid.toString(); - - // DataCite wants "doi=", not "doi:". - String authorityPlusIdentifier = persistentId.replaceFirst("doi:", ""); - // Request max page size and then loop to handle multiple pages - URL url = null; - try { - url = new URI(JvmSettings.DATACITE_REST_API_URL.lookup(pidProvider.getId()) + - "/events?doi=" + - authorityPlusIdentifier + - "&source=crossref&page[size]=1000&page[cursor]=1").toURL(); - } catch (URISyntaxException e) { - //Nominally this means a config error/ bad DATACITE_REST_API_URL for this provider - logger.warning("Unable to create URL for " + persistentId + ", pidProvider " + pidProvider.getId()); - return error(Status.INTERNAL_SERVER_ERROR, "Unable to create DataCite URL to retrieve citations."); - } - logger.fine("Retrieving Citations from " + url.toString()); - boolean nextPage = true; - JsonArrayBuilder dataBuilder = Json.createArrayBuilder(); - do { - HttpURLConnection connection = (HttpURLConnection) url.openConnection(); - connection.setRequestMethod("GET"); - int status = connection.getResponseCode(); - if (status != 200) { - logger.warning("Failed to get citations from " + url.toString()); - connection.disconnect(); - return error(Status.fromStatusCode(status), "Failed to get citations from " + url.toString()); - } - JsonObject report; - try (InputStream inStream = connection.getInputStream()) { - report = JsonUtil.getJsonObject(inStream); - } finally { - connection.disconnect(); - } - JsonObject links = report.getJsonObject("links"); - JsonArray data = report.getJsonArray("data"); - Iterator iter = data.iterator(); - while (iter.hasNext()) { - dataBuilder.add(iter.next()); + + // Submit the task to the managed executor service + Future future = executorService.submit(() -> { + try { + 
processCitationUpdate(dataset, pid, pidProvider); + } catch (Exception e) { + logger.log(Level.SEVERE, "Error processing citation update for dataset " + id, e); } - if (links.containsKey("next")) { - try { - url = new URI(links.getString("next")).toURL(); - } catch (URISyntaxException e) { - logger.warning("Unable to create URL from DataCite response: " + links.getString("next")); - return error(Status.INTERNAL_SERVER_ERROR, "Unable to retrieve all results from DataCite"); - } - } else { - nextPage = false; - } - logger.fine("body of citation response: " + report.toString()); - } while (nextPage == true); - JsonArray allData = dataBuilder.build(); - List datasetExternalCitations = datasetExternalCitationsService.parseCitations(allData); - /* - * ToDo: If this is the only source of citations, we should remove all the existing ones for the dataset and repopulate them. - * As is, this call doesn't remove old citations if there are now none (legacy issue if we decide to stop counting certain types of citation - * as we've done for 'hasPart'). - * If there are some, this call individually checks each one and if a matching item exists, it removes it and adds it back. Faster and better to delete all and - * add the new ones. 
- */ - if (!datasetExternalCitations.isEmpty()) { - for (DatasetExternalCitations dm : datasetExternalCitations) { - datasetExternalCitationsService.save(dm); - } - } - + }); + JsonObjectBuilder output = Json.createObjectBuilder(); - output.add("citationCount", datasetExternalCitations.size()); + output.add("status", "queued"); + output.add("message", "Citation update for dataset " + id + " has been queued for processing"); return ok(output); } catch (WrappedResponse wr) { return wr.getResponse(); } } + + /** + * Process the citation update for a dataset + * This method contains the logic that was previously in updateCitationsForDataset + */ + private void processCitationUpdate(Dataset dataset, GlobalId pid, PidProvider pidProvider) throws IOException { + String persistentId = pid.asRawIdentifier(); + + // Request max page size and then loop to handle multiple pages + URL url = null; + try { + url = new URI(JvmSettings.DATACITE_REST_API_URL.lookup(pidProvider.getId()) + + "/events?doi=" + + persistentId + + "&source=crossref&page[size]=1000&page[cursor]=1").toURL(); + } catch (URISyntaxException e) { + //Nominally this means a config error/ bad DATACITE_REST_API_URL for this provider + logger.warning("Unable to create URL for " + persistentId + ", pidProvider " + pidProvider.getId()); + return; + } + + logger.fine("Retrieving Citations from " + url.toString()); + boolean nextPage = true; + JsonArrayBuilder dataBuilder = Json.createArrayBuilder(); + + do { + HttpURLConnection connection = (HttpURLConnection) url.openConnection(); + connection.setRequestMethod("GET"); + int status = connection.getResponseCode(); + if (status != 200) { + logger.warning("Failed to get citations from " + url.toString()); + connection.disconnect(); + return; + } + + JsonObject report; + try (InputStream inStream = connection.getInputStream()) { + report = JsonUtil.getJsonObject(inStream); + } finally { + connection.disconnect(); + } + + JsonObject links = report.getJsonObject("links"); + 
JsonArray data = report.getJsonArray("data"); + Iterator iter = data.iterator(); + while (iter.hasNext()) { + dataBuilder.add(iter.next()); + } + + if (links.containsKey("next")) { + try { + url = new URI(links.getString("next")).toURL(); + } catch (URISyntaxException e) { + logger.warning("Unable to create URL from DataCite response: " + links.getString("next")); + return; + } + } else { + nextPage = false; + } + + logger.fine("body of citation response: " + report.toString()); + } while (nextPage == true); + + JsonArray allData = dataBuilder.build(); + List datasetExternalCitations = datasetExternalCitationsService.parseCitations(allData); + + /* + * ToDo: If this is the only source of citations, we should remove all the existing ones for the dataset and repopulate them. + * As is, this call doesn't remove old citations if there are now none (legacy issue if we decide to stop counting certain types of citation + * as we've done for 'hasPart'). + * If there are some, this call individually checks each one and if a matching item exists, it removes it and adds it back. Faster and better to delete all and + * add the new ones. 
+ */ + if (!datasetExternalCitations.isEmpty()) { + for (DatasetExternalCitations dm : datasetExternalCitations) { + datasetExternalCitationsService.save(dm); + } + } + + logger.info("Citation update completed for dataset " + dataset.getId() + + " with " + datasetExternalCitations.size() + " citations"); + } + @GET @Path("{yearMonth}/processingState") public Response getProcessingState(@PathParam("yearMonth") String yearMonth) { From 3981933314e8c695f58315455c80f4f83506f01b Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 15 Aug 2025 13:38:02 -0400 Subject: [PATCH 02/10] add API_MDC_UPDATE_MIN_DELAY_MS --- .../iq/dataverse/api/MakeDataCountApi.java | 37 +++++++++++++++++++ .../iq/dataverse/settings/JvmSettings.java | 4 ++ 2 files changed, 41 insertions(+) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java b/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java index 72788f1d8e7..3bed917c789 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java @@ -25,6 +25,8 @@ import java.net.URL; import java.util.Iterator; import java.util.List; +import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicLong; import java.util.logging.Level; import java.util.logging.Logger; @@ -69,6 +71,9 @@ public class MakeDataCountApi extends AbstractApiBean { @Resource(name = "concurrent/CitationUpdateExecutor") private ManagedExecutorService executorService; + // Track the last execution time to implement rate limiting during Citation updates + private static final AtomicLong lastExecutionTime = new AtomicLong(0); + /** * TODO: For each dataset, send the following: * @@ -163,7 +168,14 @@ public Response updateCitationsForDataset(@PathParam("id") String id) { // Submit the task to the managed executor service Future future = executorService.submit(() -> { try { + // Apply rate limiting if enabled + applyRateLimit(); + + // Process the 
citation update processCitationUpdate(dataset, pid, pidProvider); + + // Update the last execution time after processing + lastExecutionTime.set(System.currentTimeMillis()); } catch (Exception e) { logger.log(Level.SEVERE, "Error processing citation update for dataset " + id, e); } @@ -178,6 +190,31 @@ public Response updateCitationsForDataset(@PathParam("id") String id) { } } + /** + * Apply rate limiting by waiting if necessary + */ + private void applyRateLimit() { + // Check if rate limiting is enabled + long minDelay = JvmSettings.API_MDC_UPDATE_MIN_DELAY_MS.lookupOptional(Long.class).orElse(0l); + + // Calculate how long to wait + long lastExecution = lastExecutionTime.get(); + long currentTime = System.currentTimeMillis(); + long elapsedTime = currentTime - lastExecution; + + // If not enough time has passed since the last execution, wait + if (lastExecution > 0 && elapsedTime < minDelay) { + long waitTime = minDelay - elapsedTime; + logger.fine("Rate limiting: waiting " + waitTime + " ms before processing next citation update"); + try { + Thread.sleep(waitTime); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + logger.warning("Rate limiting sleep interrupted: " + e.getMessage()); + } + } + } + /** * Process the citation update for a dataset * This method contains the logic that was previously in updateCitationsForDataset diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index 53dff244ae1..87123801a3e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -93,6 +93,10 @@ public enum JvmSettings { API_BLOCKED_ENDPOINTS(SCOPE_API_BLOCKED, "endpoints"), API_BLOCKED_POLICY(SCOPE_API_BLOCKED, "policy"), API_BLOCKED_KEY(SCOPE_API_BLOCKED, "key"), + // API: MDC Citation updates + SCOPE_API_MDC(SCOPE_API, "mdc"), + 
API_MDC_UPDATE_MIN_DELAY_MS(SCOPE_API_MDC, "min-delay-ms"), + // SIGNPOSTING SETTINGS SCOPE_SIGNPOSTING(PREFIX, "signposting"), From 973ab872e203085aee6be078eba85706ac0ac601 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 15 Aug 2025 13:47:43 -0400 Subject: [PATCH 03/10] filter out hasPart etc earlier --- .../iq/dataverse/api/MakeDataCountApi.java | 22 +++++++++++++++++-- .../DatasetExternalCitationsServiceBean.java | 4 ++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java b/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java index 3bed917c789..64309886dd0 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java @@ -196,7 +196,9 @@ public Response updateCitationsForDataset(@PathParam("id") String id) { private void applyRateLimit() { // Check if rate limiting is enabled long minDelay = JvmSettings.API_MDC_UPDATE_MIN_DELAY_MS.lookupOptional(Long.class).orElse(0l); - + if(minDelay ==0) { + return; + } // Calculate how long to wait long lastExecution = lastExecutionTime.get(); long currentTime = System.currentTimeMillis(); @@ -260,12 +262,28 @@ private void processCitationUpdate(Dataset dataset, GlobalId pid, PidProvider pi JsonArray data = report.getJsonArray("data"); Iterator iter = data.iterator(); while (iter.hasNext()) { - dataBuilder.add(iter.next()); + JsonValue citationValue = iter.next(); + JsonObject citation = (JsonObject) citationValue; + + // Filter out relations we don't use (e.g. hasPart) to lower memory req. 
with many files + if (citation.containsKey("attributes")) { + JsonObject attributes = citation.getJsonObject("attributes"); + if (attributes.containsKey("relation-type-id")) { + String relationshipType = attributes.getString("relation-type-id"); + + // Only add citations with relationship types we care about + if (DatasetExternalCitationsServiceBean.inboundRelationships.contains(relationshipType) || + DatasetExternalCitationsServiceBean.outboundRelationships.contains(relationshipType)) { + dataBuilder.add(citationValue); + } + } + } } if (links.containsKey("next")) { try { url = new URI(links.getString("next")).toURL(); + applyRateLimit(); } catch (URISyntaxException e) { logger.warning("Unable to create URL from DataCite response: " + links.getString("next")); return; diff --git a/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java index fa56432cc3c..fa87926210f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java @@ -39,13 +39,13 @@ public class DatasetExternalCitationsServiceBean implements java.io.Serializable DatasetServiceBean datasetService; //Array of relationship types that are considered to be citations - static ArrayList inboundRelationships = new ArrayList( + public static ArrayList inboundRelationships = new ArrayList( Arrays.asList( "cites", "references", "supplements", "is-supplement-to")); - static ArrayList outboundRelationships = new ArrayList( + public static ArrayList outboundRelationships = new ArrayList( Arrays.asList( "is-cited-by", "is-referenced-by", From 5f6b076ab248e69f414b4df0a64508f9598c0417 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 15 Aug 2025 15:06:54 -0400 Subject: [PATCH 04/10] cleanup logging/exception handling --- 
.../iq/dataverse/api/MakeDataCountApi.java | 147 ++++++++++-------- 1 file changed, 80 insertions(+), 67 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java b/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java index 64309886dd0..6de0e86a254 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java @@ -172,10 +172,16 @@ public Response updateCitationsForDataset(@PathParam("id") String id) { applyRateLimit(); // Process the citation update - processCitationUpdate(dataset, pid, pidProvider); + boolean success = processCitationUpdate(dataset, pid, pidProvider); // Update the last execution time after processing lastExecutionTime.set(System.currentTimeMillis()); + + if (success) { + logger.fine("Successfully processed citation update for dataset " + id); + } else { + logger.warning("Failed to process citation update for dataset " + id); + } } catch (Exception e) { logger.log(Level.SEVERE, "Error processing citation update for dataset " + id, e); } @@ -220,8 +226,9 @@ private void applyRateLimit() { /** * Process the citation update for a dataset * This method contains the logic that was previously in updateCitationsForDataset + * @return true if processing was successful, false otherwise */ - private void processCitationUpdate(Dataset dataset, GlobalId pid, PidProvider pidProvider) throws IOException { + private boolean processCitationUpdate(Dataset dataset, GlobalId pid, PidProvider pidProvider) { String persistentId = pid.asRawIdentifier(); // Request max page size and then loop to handle multiple pages @@ -231,88 +238,94 @@ private void processCitationUpdate(Dataset dataset, GlobalId pid, PidProvider pi "/events?doi=" + persistentId + "&source=crossref&page[size]=1000&page[cursor]=1").toURL(); - } catch (URISyntaxException e) { + } catch (URISyntaxException | MalformedURLException e) { //Nominally this means a config error/ 
bad DATACITE_REST_API_URL for this provider logger.warning("Unable to create URL for " + persistentId + ", pidProvider " + pidProvider.getId()); - return; + return false; } logger.fine("Retrieving Citations from " + url.toString()); boolean nextPage = true; JsonArrayBuilder dataBuilder = Json.createArrayBuilder(); - do { - HttpURLConnection connection = (HttpURLConnection) url.openConnection(); - connection.setRequestMethod("GET"); - int status = connection.getResponseCode(); - if (status != 200) { - logger.warning("Failed to get citations from " + url.toString()); - connection.disconnect(); - return; - } - - JsonObject report; - try (InputStream inStream = connection.getInputStream()) { - report = JsonUtil.getJsonObject(inStream); - } finally { - connection.disconnect(); - } - - JsonObject links = report.getJsonObject("links"); - JsonArray data = report.getJsonArray("data"); - Iterator iter = data.iterator(); - while (iter.hasNext()) { - JsonValue citationValue = iter.next(); - JsonObject citation = (JsonObject) citationValue; + try { + do { + HttpURLConnection connection = (HttpURLConnection) url.openConnection(); + connection.setRequestMethod("GET"); + int status = connection.getResponseCode(); + if (status != 200) { + logger.warning("Failed to get citations from " + url.toString()); + connection.disconnect(); + return false; + } - // Filter out relations we don't use (e.g. hasPart) to lower memory req. 
with many files - if (citation.containsKey("attributes")) { - JsonObject attributes = citation.getJsonObject("attributes"); - if (attributes.containsKey("relation-type-id")) { - String relationshipType = attributes.getString("relation-type-id"); - - // Only add citations with relationship types we care about - if (DatasetExternalCitationsServiceBean.inboundRelationships.contains(relationshipType) || - DatasetExternalCitationsServiceBean.outboundRelationships.contains(relationshipType)) { - dataBuilder.add(citationValue); + JsonObject report; + try (InputStream inStream = connection.getInputStream()) { + report = JsonUtil.getJsonObject(inStream); + } finally { + connection.disconnect(); + } + + JsonObject links = report.getJsonObject("links"); + JsonArray data = report.getJsonArray("data"); + Iterator iter = data.iterator(); + while (iter.hasNext()) { + JsonValue citationValue = iter.next(); + JsonObject citation = (JsonObject) citationValue; + + // Filter out relations we don't use (e.g. hasPart) to lower memory req. 
with many files + if (citation.containsKey("attributes")) { + JsonObject attributes = citation.getJsonObject("attributes"); + if (attributes.containsKey("relation-type-id")) { + String relationshipType = attributes.getString("relation-type-id"); + + // Only add citations with relationship types we care about + if (DatasetExternalCitationsServiceBean.inboundRelationships.contains(relationshipType) || + DatasetExternalCitationsServiceBean.outboundRelationships.contains(relationshipType)) { + dataBuilder.add(citationValue); + } } } } - } + + if (links.containsKey("next")) { + try { + url = new URI(links.getString("next")).toURL(); + applyRateLimit(); + } catch (URISyntaxException e) { + logger.warning("Unable to create URL from DataCite response: " + links.getString("next")); + return false; + } + } else { + nextPage = false; + } + + logger.fine("body of citation response: " + report.toString()); + } while (nextPage == true); - if (links.containsKey("next")) { - try { - url = new URI(links.getString("next")).toURL(); - applyRateLimit(); - } catch (URISyntaxException e) { - logger.warning("Unable to create URL from DataCite response: " + links.getString("next")); - return; + JsonArray allData = dataBuilder.build(); + List datasetExternalCitations = datasetExternalCitationsService.parseCitations(allData); + + /* + * ToDo: If this is the only source of citations, we should remove all the existing ones for the dataset and repopulate them. + * As is, this call doesn't remove old citations if there are now none (legacy issue if we decide to stop counting certain types of citation + * as we've done for 'hasPart'). + * If there are some, this call individually checks each one and if a matching item exists, it removes it and adds it back. Faster and better to delete all and + * add the new ones. 
+ */ + if (!datasetExternalCitations.isEmpty()) { + for (DatasetExternalCitations dm : datasetExternalCitations) { + datasetExternalCitationsService.save(dm); } - } else { - nextPage = false; } - logger.fine("body of citation response: " + report.toString()); - } while (nextPage == true); - - JsonArray allData = dataBuilder.build(); - List datasetExternalCitations = datasetExternalCitationsService.parseCitations(allData); - - /* - * ToDo: If this is the only source of citations, we should remove all the existing ones for the dataset and repopulate them. - * As is, this call doesn't remove old citations if there are now none (legacy issue if we decide to stop counting certain types of citation - * as we've done for 'hasPart'). - * If there are some, this call individually checks each one and if a matching item exists, it removes it and adds it back. Faster and better to delete all and - * add the new ones. - */ - if (!datasetExternalCitations.isEmpty()) { - for (DatasetExternalCitations dm : datasetExternalCitations) { - datasetExternalCitationsService.save(dm); - } + logger.fine("Citation update completed for dataset " + dataset.getId() + + " with " + datasetExternalCitations.size() + " citations"); + return true; + } catch (IOException e) { + logger.log(Level.WARNING, "Error processing citation update for dataset " + dataset.getId(), e); + return false; } - - logger.info("Citation update completed for dataset " + dataset.getId() + - " with " + datasetExternalCitations.size() + " citations"); } @GET From a96dffb0678485601e50324d7cb701e08836d350 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 15 Aug 2025 16:05:33 -0400 Subject: [PATCH 05/10] handle queue full error --- .../iq/dataverse/api/MakeDataCountApi.java | 62 +++++++++++-------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java b/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java index 6de0e86a254..ca4f55da822 
100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java @@ -26,6 +26,7 @@ import java.util.Iterator; import java.util.List; import java.util.concurrent.Future; +import java.util.concurrent.RejectedExecutionException; import java.util.concurrent.atomic.AtomicLong; import java.util.logging.Level; import java.util.logging.Logger; @@ -159,38 +160,45 @@ public Response updateCitationsForDataset(@PathParam("id") String id) { final Dataset dataset = findDatasetOrDie(id); final GlobalId pid = dataset.getGlobalId(); final PidProvider pidProvider = PidUtil.getPidProvider(pid.getProviderId()); - + // Only supported for DOIs and for DataCite DOI providers - if(!DataCiteDOIProvider.TYPE.equals(pidProvider.getProviderType())) { + if (!DataCiteDOIProvider.TYPE.equals(pidProvider.getProviderType())) { return error(Status.BAD_REQUEST, "Only DataCite DOI providers are supported"); } - + // Submit the task to the managed executor service - Future future = executorService.submit(() -> { - try { - // Apply rate limiting if enabled - applyRateLimit(); - - // Process the citation update - boolean success = processCitationUpdate(dataset, pid, pidProvider); - - // Update the last execution time after processing - lastExecutionTime.set(System.currentTimeMillis()); - - if (success) { - logger.fine("Successfully processed citation update for dataset " + id); - } else { - logger.warning("Failed to process citation update for dataset " + id); + Future future; + try { + future = executorService.submit(() -> { + try { + // Apply rate limiting if enabled + applyRateLimit(); + + // Process the citation update + boolean success = processCitationUpdate(dataset, pid, pidProvider); + + // Update the last execution time after processing + lastExecutionTime.set(System.currentTimeMillis()); + + if (success) { + logger.fine("Successfully processed citation update for dataset " + id); + } else { + 
logger.warning("Failed to process citation update for dataset " + id); + } + } catch (Exception e) { + logger.log(Level.SEVERE, "Error processing citation update for dataset " + id, e); } - } catch (Exception e) { - logger.log(Level.SEVERE, "Error processing citation update for dataset " + id, e); - } - }); - - JsonObjectBuilder output = Json.createObjectBuilder(); - output.add("status", "queued"); - output.add("message", "Citation update for dataset " + id + " has been queued for processing"); - return ok(output); + }); + + JsonObjectBuilder output = Json.createObjectBuilder(); + output.add("status", "queued"); + output.add("message", "Citation update for dataset " + id + " has been queued for processing"); + return ok(output); + } catch (RejectedExecutionException ree) { + logger.warning("Citation update for dataset " + id + " was rejected: Queue is full"); + return error(Status.SERVICE_UNAVAILABLE, + "Citation update service is currently at capacity. Please try again later."); + } } catch (WrappedResponse wr) { return wr.getResponse(); } From 5d8095b190e8be29c9bdea4d4797d29123de9d59 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 15 Aug 2025 16:05:48 -0400 Subject: [PATCH 06/10] update script for asynch api call --- conf/mdc/counter_weekly.sh | 92 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 conf/mdc/counter_weekly.sh diff --git a/conf/mdc/counter_weekly.sh b/conf/mdc/counter_weekly.sh new file mode 100644 index 00000000000..67cb5df2af2 --- /dev/null +++ b/conf/mdc/counter_weekly.sh @@ -0,0 +1,92 @@ +#!/bin/sh +#counter_weekly.sh + +# This script iterates through all published Datasets in all Dataverses and calls the Make Data Count API to update their citations from DataCite +# Note: Requires curl and jq for parsing JSON responses from curl + +# A recursive method to process each Dataverse +processDV () { +echo "Processing Dataverse ID#: $1" + +#Call the Dataverse API to get the contents of the Dataverse 
(without credentials, this will only list published datasets and dataverses) +DVCONTENTS=$(curl -s http://localhost:8080/api/dataverses/$1/contents) + +# Iterate over all datasets, pulling the value of their DOIs (as part of the persistentUrl) from the json returned +for subds in $(echo "${DVCONTENTS}" | jq -r '.data[] | select(.type == "dataset") | .persistentUrl'); do + +#The authority/identifier are preceded by a protocol/host, i.e. https://doi.org/ +DOI=`expr "$subds" : '.*:\/\/\doi\.org\/\(.*\)'` + +# Call the Dataverse API for this dataset and capture both the response and HTTP status code +HTTP_RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "http://localhost:8080/api/admin/makeDataCount/:persistentId/updateCitationsForDataset?persistentId=doi:$DOI") + +# Extract the HTTP status code from the last line +HTTP_STATUS=$(echo "$HTTP_RESPONSE" | tail -n1) +# Extract the response body (everything except the last line) +RESPONSE_BODY=$(echo "$HTTP_RESPONSE" | sed '$d') + +# Check the HTTP status code and report accordingly +case $HTTP_STATUS in + 200) + # Successfully queued + # Extract status from the nested data object + STATUS=$(echo "$RESPONSE_BODY" | jq -r '.data.status') + + # Extract message from the nested data object + if echo "$RESPONSE_BODY" | jq -e '.data.message' > /dev/null 2>&1 && [ "$(echo "$RESPONSE_BODY" | jq -r '.data.message')" != "null" ]; then + MESSAGE=$(echo "$RESPONSE_BODY" | jq -r '.data.message') + echo "[SUCCESS] doi:$DOI - $STATUS: $MESSAGE" + else + # If message is missing or null, just show the status + echo "[SUCCESS] doi:$DOI - $STATUS: Citation update queued" + fi + ;; + 400) + # Bad request + if echo "$RESPONSE_BODY" | jq -e '.message' > /dev/null 2>&1; then + ERROR=$(echo "$RESPONSE_BODY" | jq -r '.message') + echo "[ERROR 400] doi:$DOI - Bad request: $ERROR" + else + echo "[ERROR 400] doi:$DOI - Bad request" + fi + ;; + 404) + # Not found + if echo "$RESPONSE_BODY" | jq -e '.message' > /dev/null 2>&1; then + ERROR=$(echo 
"$RESPONSE_BODY" | jq -r '.message') + echo "[ERROR 404] doi:$DOI - Not found: $ERROR" + else + echo "[ERROR 404] doi:$DOI - Not found" + fi + ;; + 503) + # Service unavailable (queue full) + if echo "$RESPONSE_BODY" | jq -e '.message' > /dev/null 2>&1; then + ERROR=$(echo "$RESPONSE_BODY" | jq -r '.message') + echo "[ERROR 503] doi:$DOI - Service unavailable: $ERROR" + elif echo "$RESPONSE_BODY" | jq -e '.data.message' > /dev/null 2>&1; then + ERROR=$(echo "$RESPONSE_BODY" | jq -r '.data.message') + echo "[ERROR 503] doi:$DOI - Service unavailable: $ERROR" + else + echo "[ERROR 503] doi:$DOI - Service unavailable: Queue is full" + fi + ;; + *) + # Other error + echo "[ERROR $HTTP_STATUS] doi:$DOI - Unexpected error" + echo "Response: $RESPONSE_BODY" + ;; +esac + +done + +# Now iterate over any child Dataverses and recursively process them +for subdv in $(echo "${DVCONTENTS}" | jq -r '.data[] | select(.type == "dataverse") | .id'); do +echo $subdv +processDV $subdv +done + +} + +# Call the function on the root dataverse to start processing +processDV 1 \ No newline at end of file From ba61c09459fdc9e994d30359b81031f001fe8324 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 29 Aug 2025 14:35:50 -0400 Subject: [PATCH 07/10] release note --- doc/release-notes/11777-MDC-citation-api-improvement.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 doc/release-notes/11777-MDC-citation-api-improvement.md diff --git a/doc/release-notes/11777-MDC-citation-api-improvement.md b/doc/release-notes/11777-MDC-citation-api-improvement.md new file mode 100644 index 00000000000..9441e9e0f44 --- /dev/null +++ b/doc/release-notes/11777-MDC-citation-api-improvement.md @@ -0,0 +1,7 @@ +The /api/admin/makeDataCount/{id}/updateCitationsForDataset endpoint, which allows citations for a dataset to be retrieved from DataCite, is often called periodically for all datasets. 
However, allowing calls for many datasets to be processed in parallel can cause performance problems in Dataverse and/or cause calls to DataCite to fail due to rate limiting. The existing implementation was also inefficient w.r.t. memory use when used on datasets with many (>~1K) files. This release configures Dataverse to queue calls to this API and process them serially, adds optional throttling to avoid hitting DataCite rate limits, and improves memory use. + +New optional MicroProfile Config setting: + +dataverse.api.mdc.min-delay-ms - number of milliseconds to wait between calls to DataCite. A value of ~100 should conservatively address DataCite's current limit of 3000 requests per 5 minutes. A value of 250 may be required for their test service. + +Backward compatibility: This API call is now asynchronous and will return an OK response when the call is queued or a 503 if the queue is full. \ No newline at end of file From 71321b995f7667f53c3eb2c8a4aec1e94d3cecd2 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 29 Aug 2025 14:36:09 -0400 Subject: [PATCH 08/10] switch to app executor service --- src/main/webapp/WEB-INF/glassfish-resources.xml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/main/webapp/WEB-INF/glassfish-resources.xml b/src/main/webapp/WEB-INF/glassfish-resources.xml index 3fbbf4c3586..74af3be42ce 100644 --- a/src/main/webapp/WEB-INF/glassfish-resources.xml +++ b/src/main/webapp/WEB-INF/glassfish-resources.xml @@ -11,4 +11,14 @@ + + + + + + + + \ No newline at end of file From 867af5e08398f88a0fd211a9e1e1eb3e145ee634 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 29 Aug 2025 14:46:05 -0400 Subject: [PATCH 09/10] docs --- .../source/admin/make-data-count.rst | 2 ++ doc/sphinx-guides/source/api/changelog.rst | 2 +- doc/sphinx-guides/source/installation/config.rst | 16 ++++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/doc/sphinx-guides/source/admin/make-data-count.rst b/doc/sphinx-guides/source/admin/make-data-count.rst
index 0103a6f9e38..f8ffa7bb084 100644 --- a/doc/sphinx-guides/source/admin/make-data-count.rst +++ b/doc/sphinx-guides/source/admin/make-data-count.rst @@ -166,6 +166,8 @@ The example :download:`counter_weekly.sh <../_static/util/counter_weekly.sh>` wi Citations will be retrieved for each published dataset and recorded in the your Dataverse installation's database. +Note that the :ref:`dataverse.api.mdc.min-delay-ms` setting can be used to avoid getting rate-limit errors from DataCite. + For how to get the citations out of your Dataverse installation, see "Retrieving Citations for a Dataset" under :ref:`Dataset Metrics ` in the :doc:`/api/native-api` section of the API Guide. Please note that while the Dataverse Software has a metadata field for "Related Dataset" this information is not currently sent as a citation to Crossref. diff --git a/doc/sphinx-guides/source/api/changelog.rst b/doc/sphinx-guides/source/api/changelog.rst index 16157459220..08e8620ba13 100644 --- a/doc/sphinx-guides/source/api/changelog.rst +++ b/doc/sphinx-guides/source/api/changelog.rst @@ -13,7 +13,7 @@ v6.8 - For POST /api/files/{id}/metadata passing an empty string ("description":"") or array ("categories":[]) will no longer be ignored. Empty fields will now clear out the values in the file's metadata. To ignore the fields simply do not include them in the JSON string. - For PUT /api/datasets/{id}/editMetadata the query parameter "sourceInternalVersionNumber" has been removed and replaced with "sourceLastUpdateTime" to verify that the data being edited hasn't been modified and isn't stale. - For GET /api/dataverses/$dataverse-alias/links the Json response has changed breaking the backward compatibility of the API. - +- The POST /api/admin/makeDataCount/{id}/updateCitationsForDataset processing is now asynchronous and the response no longer includes the number of citations. The response can be OK if the request is queued or 503 if the queue is full (default queue size is 1000). 
v6.7 ---- diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index d2eff275392..a8e6129c501 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -3729,6 +3729,22 @@ Example: Can also be set via any `supported MicroProfile Config API source`_, e.g. the environment variable ``DATAVERSE_CORS_HEADERS_EXPOSE``. + +.. _dataverse.api.mdc.min-delay-ms: + +dataverse.api.mdc.min-delay-ms +++++++++++++++++++++++++++++++ + +Minimum delay in milliseconds between the calls to DataCite that are made when processing requests to the /api/admin/makeDataCount/{id}/updateCitationsForDataset API. +This setting helps avoid rate-limit errors from DataCite by enforcing a minimum time interval between consecutive calls. +If the next call would be made before this interval has elapsed since the previous call, it is delayed until the interval has elapsed. + +Default: ``0`` (no delay enforced) + +Example: ``dataverse.api.mdc.min-delay-ms=100`` (enforces a minimum 100ms delay between calls to DataCite) + +Can also be set via any `supported MicroProfile Config API source`_, e.g. the environment variable ``DATAVERSE_API_MDC_MIN_DELAY_MS``. + .. _feature-flags: Feature Flags From 85fcbac7a05d652360fd9fac3ac5ccd8a06627f5 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 29 Aug 2025 15:10:54 -0400 Subject: [PATCH 10/10] missing blank line --- doc/sphinx-guides/source/api/changelog.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/sphinx-guides/source/api/changelog.rst b/doc/sphinx-guides/source/api/changelog.rst index 08e8620ba13..4c91a63f86d 100644 --- a/doc/sphinx-guides/source/api/changelog.rst +++ b/doc/sphinx-guides/source/api/changelog.rst @@ -14,6 +14,7 @@ v6.8 - For PUT /api/datasets/{id}/editMetadata the query parameter "sourceInternalVersionNumber" has been removed and replaced with "sourceLastUpdateTime" to verify that the data being edited hasn't been modified and isn't stale.
- For GET /api/dataverses/$dataverse-alias/links the Json response has changed breaking the backward compatibility of the API. - The POST /api/admin/makeDataCount/{id}/updateCitationsForDataset processing is now asynchronous and the response no longer includes the number of citations. The response can be OK if the request is queued or 503 if the queue is full (default queue size is 1000). + v6.7 ----