From 618ee4fd72bfa246bedc7e4e13f1d17e103ec09a Mon Sep 17 00:00:00 2001 From: Eryk Kullikowski Date: Thu, 14 Aug 2025 15:09:21 +0200 Subject: [PATCH 01/29] 11744: CORS: echo request Origin and add Vary: Origin; sanitize CSV lists; prefer comma-separated origins; rely on JVM options/MicroProfile only; add tests and release notes --- .../11744-cors-echo-origin-vary.md | 62 +++++++ .../iq/dataverse/filter/CorsFilter.java | 115 +++++++++--- .../iq/dataverse/filter/CorsFilterTest.java | 170 ++++++++++++++++++ 3 files changed, 320 insertions(+), 27 deletions(-) create mode 100644 doc/release-notes/11744-cors-echo-origin-vary.md create mode 100644 src/test/java/edu/harvard/iq/dataverse/filter/CorsFilterTest.java diff --git a/doc/release-notes/11744-cors-echo-origin-vary.md b/doc/release-notes/11744-cors-echo-origin-vary.md new file mode 100644 index 00000000000..d4a39812388 --- /dev/null +++ b/doc/release-notes/11744-cors-echo-origin-vary.md @@ -0,0 +1,62 @@ +# 11744: CORS header handling fixes (echo single Origin, add Vary: Origin, multi-origin allow, sanitization) + +This branch adjusts the CORS filter so browser clients work correctly when multiple origins are allowed. + +## What changed +- Access-Control-Allow-Origin (ACAO) now echoes the single request `Origin` when it matches an allowlist from `dataverse.cors.origin`. +- `Vary: Origin` is added when echoing a specific origin to keep caches correct across different origins. +- Comma-separated origin lists are supported; surrounding quotes in CSV configs are stripped. +- Sanitization is applied to CORS header lists (methods/allow/expose) to avoid quoted values that can break preflight checks. +- Deprecated DB fallback for enabling CORS is removed; CORS is considered enabled only when `dataverse.cors.origin` is set as a JVM options/Microprofile setting. 
+ +## Upgrade / run notes (non-SQL) +To keep CORS working after pulling this branch: + +1) Configure origins as JVM options/Microprofile settings (no quotes): +- Single origin: + - `dataverse.cors.origin=https://example.org` +- Multiple origins (comma-separated): + - `dataverse.cors.origin=https://libis.github.io,https://gdcc.github.io` +- Wildcard: + - `dataverse.cors.origin=*` + - Note: Browsers reject `*` when credentialed requests are used (cookies/Authorization headers). Prefer explicit origins for those cases. + +2) Optional headers/methods lists (unquoted, comma-separated CSV): +- `dataverse.cors.methods` +- `dataverse.cors.headers.allow` +- `dataverse.cors.headers.expose` + +Avoid surrounding values with quotes (e.g., do not use `"Accept, Content-Type"`). Quotes will be stripped but may cause confusion. + +3) If you previously relied on the database setting to enable CORS (deprecated `AllowCors`), set `dataverse.cors.origin` instead. The DB fallback is no longer used. + +4) Reverse proxies/caches: `Vary: Origin` is now emitted. Ensure your proxy does not drop this header. + +## Verify +Preflight (replace DV_URL with your base URL): + +```bash +curl -i -X OPTIONS \ + -H "Origin: https://libis.github.io" \ + -H "Access-Control-Request-Method: GET" \ + "${DV_URL}/api/info/version" +``` + +Expected: +- `Access-Control-Allow-Origin: https://libis.github.io` +- `Vary: Origin` present + +Actual request: + +```bash +curl -i \ + -H "Origin: https://libis.github.io" \ + "${DV_URL}/api/info/version" +``` + +Expected: +- Same ACAO echo as above + +## Backward compatibility +- Instances relying on the deprecated DB-based CORS enablement must set `dataverse.cors.origin` to keep CORS enabled. +- Quoted CORS configuration values may behave differently; remove quotes going forward. 
diff --git a/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java b/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java index 7d99d9ee4d2..f564ffb18e1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java +++ b/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java @@ -1,56 +1,71 @@ package edu.harvard.iq.dataverse.filter; -import jakarta.inject.Inject; import jakarta.servlet.*; import jakarta.servlet.annotation.WebFilter; +import jakarta.servlet.http.HttpServletRequest; import jakarta.servlet.http.HttpServletResponse; import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; import edu.harvard.iq.dataverse.settings.JvmSettings; -import edu.harvard.iq.dataverse.settings.SettingsServiceBean; /** * CorsFilter is a servlet filter that handles Cross-Origin Resource Sharing (CORS) for the Dataverse application. * It configures and applies CORS headers to HTTP responses based on application settings. * * This filter: - * 1. Reads CORS configuration from JVM settings or (deprecated) the SettingsServiceBean. See the Dataverse Configuration Guide for more details. + * 1. Reads CORS configuration from JVM options/Microprofile settings (e.g. dataverse.cors.*). * 2. Determines whether CORS should be allowed based on these settings. - * 3. If CORS is allowed, it adds the appropriate CORS headers to all HTTP responses. The JVMSettings allow customization of the header contents if desired. + * 3. If CORS is allowed, it adds the appropriate CORS headers to all HTTP responses. The JvmSettings allow customization of the header contents if desired. * * The filter is applied to all paths ("/*") in the application. 
*/ @WebFilter("/*") public class CorsFilter implements Filter { - - @Inject - private SettingsServiceBean settingsSvc; - private boolean allowCors; - private String origin; + private String origin; // raw configured origin value private String methods; private String allowHeaders; private String exposeHeaders; + private Set allowedOrigins = Collections.emptySet(); + private boolean allowAllOrigins = false; @Override public void init(FilterConfig filterConfig) throws ServletException { - origin = JvmSettings.CORS_ORIGIN.lookupOptional().orElse(null); - boolean corsSetting = settingsSvc.isTrueForKey(SettingsServiceBean.Key.AllowCors, true); - - if (origin == null && !corsSetting) { - allowCors = false; - } else { - allowCors = true; - origin = (origin != null) ? origin : "*"; - } + origin = sanitize(JvmSettings.CORS_ORIGIN.lookupOptional().orElse(null)); + allowCors = origin != null && !origin.trim().isEmpty(); if (allowCors) { - methods = JvmSettings.CORS_METHODS.lookupOptional().orElse("PUT, GET, POST, DELETE, OPTIONS"); - allowHeaders = JvmSettings.CORS_ALLOW_HEADERS.lookupOptional() - .orElse("Accept, Content-Type, X-Dataverse-key, Range"); - exposeHeaders = JvmSettings.CORS_EXPOSE_HEADERS.lookupOptional() - .orElse("Accept-Ranges, Content-Range, Content-Encoding"); + methods = sanitizeCsv(JvmSettings.CORS_METHODS.lookupOptional().orElse("GET, POST, OPTIONS, PUT, DELETE")); + allowHeaders = sanitizeCsv(JvmSettings.CORS_ALLOW_HEADERS.lookupOptional() + .orElse("Accept, Content-Type, X-Dataverse-key, Range")); + exposeHeaders = sanitizeCsv(JvmSettings.CORS_EXPOSE_HEADERS.lookupOptional() + .orElse("Accept-Ranges, Content-Range, Content-Encoding")); + + // Initialize allowed origins (documented as comma-separated list) + String configured = origin != null ? 
origin.trim() : null; + if (configured == null || configured.isEmpty() || "*".equals(configured)) { + allowAllOrigins = true; + allowedOrigins = Collections.emptySet(); + } else { + // Parse configured origins; code is tolerant of whitespace but + // docs recommend a comma-separated list (no quotes) + allowedOrigins = Arrays.stream(configured.split(",")) + .flatMap(s -> Arrays.stream(s.split("[\n\r\t ]+"))) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toCollection(HashSet::new)); + // handle a single '"*"' that might slip through incorrectly + if (allowedOrigins.size() == 1 && allowedOrigins.contains("*")) { + allowAllOrigins = true; + allowedOrigins = Collections.emptySet(); + } + } } } @@ -58,12 +73,58 @@ public void init(FilterConfig filterConfig) throws ServletException { public void doFilter(ServletRequest servletRequest, ServletResponse servletResponse, FilterChain chain) throws IOException, ServletException { if (allowCors) { + HttpServletRequest request = (HttpServletRequest) servletRequest; HttpServletResponse response = (HttpServletResponse) servletResponse; - response.addHeader("Access-Control-Allow-Origin", origin); - response.addHeader("Access-Control-Allow-Methods", methods); - response.addHeader("Access-Control-Allow-Headers", allowHeaders); - response.addHeader("Access-Control-Expose-Headers", exposeHeaders); + + String requestOrigin = sanitize(request.getHeader("Origin")); + + // Decide ACAO value + if (allowAllOrigins) { + // Note: Browsers will reject '*' for credentialed requests; this is by design. 
+ response.setHeader("Access-Control-Allow-Origin", "*"); + } else if (requestOrigin != null && allowedOrigins.contains(requestOrigin)) { + response.setHeader("Access-Control-Allow-Origin", requestOrigin); + // Help caches vary based on Origin + response.setHeader("Vary", appendVary(response.getHeader("Vary"), "Origin")); + } + + response.setHeader("Access-Control-Allow-Methods", methods); + response.setHeader("Access-Control-Allow-Headers", allowHeaders); + response.setHeader("Access-Control-Expose-Headers", exposeHeaders); } chain.doFilter(servletRequest, servletResponse); } + + /** Remove surrounding quotes and collapse internal quotes. */ + private String sanitize(String value) { + if (value == null) return null; + String v = value.trim(); + if (v.length() >= 2 && v.startsWith("\"") && v.endsWith("\"")) { + v = v.substring(1, v.length() - 1); + } + return v.replace("\"", "").trim(); + } + + /** Remove quotes from CSV-like header value and trim spaces around commas. */ + private String sanitizeCsv(String value) { + String v = sanitize(value); + // Normalize separators and trim tokens + return Arrays.stream(v.split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.joining(", ")); + } + + private String appendVary(String existing, String token) { + if (existing == null || existing.isEmpty()) { + return token; + } + // Avoid duplicate tokens in Vary + Set tokens = Arrays.stream(existing.split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toCollection(HashSet::new)); + tokens.add(token); + return String.join(", ", tokens); + } } diff --git a/src/test/java/edu/harvard/iq/dataverse/filter/CorsFilterTest.java b/src/test/java/edu/harvard/iq/dataverse/filter/CorsFilterTest.java new file mode 100644 index 00000000000..2bbcb995362 --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/filter/CorsFilterTest.java @@ -0,0 +1,170 @@ +package edu.harvard.iq.dataverse.filter; + +import jakarta.servlet.FilterChain; 
+import jakarta.servlet.ServletRequest; +import jakarta.servlet.ServletResponse; +import jakarta.servlet.http.HttpServletRequest; +import jakarta.servlet.http.HttpServletResponse; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.ArgumentCaptor; + +import java.util.HashMap; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.*; + +class CorsFilterTest { + + private final Map sysPropsBackup = new HashMap<>(); + + @BeforeEach + void setUp() { + // backup potentially touched props + backupAndClear("dataverse.cors.origin"); + backupAndClear("dataverse.cors.methods"); + backupAndClear("dataverse.cors.headers.allow"); + backupAndClear("dataverse.cors.headers.expose"); + } + + @AfterEach + void tearDown() { + restore("dataverse.cors.origin"); + restore("dataverse.cors.methods"); + restore("dataverse.cors.headers.allow"); + restore("dataverse.cors.headers.expose"); + } + + @Test + void wildcardOrigin_allowsAny_noVary() throws Exception { + System.setProperty("dataverse.cors.origin", "*"); + + CorsFilter sut = new CorsFilter(); + injectSettingsAllowCors(sut, true); + sut.init(null); + + HttpServletRequest req = mock(HttpServletRequest.class); + when(req.getHeader("Origin")).thenReturn("https://a.example"); + HttpServletResponse res = mock(HttpServletResponse.class); + FilterChain chain = mock(FilterChain.class); + + sut.doFilter(req, res, chain); + + verify(res).setHeader("Access-Control-Allow-Origin", "*"); + // By design, Vary not required for wildcard + verify(res, never()).setHeader(eq("Vary"), anyString()); + verify(chain).doFilter(any(ServletRequest.class), any(ServletResponse.class)); + } + + @Test + void singleOrigin_echoesAndAddsVary() throws Exception { + System.setProperty("dataverse.cors.origin", "https://libis.github.io"); + + CorsFilter sut = new CorsFilter(); + injectSettingsAllowCors(sut, true); + sut.init(null); 
+ + HttpServletRequest req = mock(HttpServletRequest.class); + when(req.getHeader("Origin")).thenReturn("https://libis.github.io"); + HttpServletResponse res = mock(HttpServletResponse.class); + when(res.getHeader("Vary")).thenReturn(null); + FilterChain chain = mock(FilterChain.class); + + sut.doFilter(req, res, chain); + + verify(res).setHeader("Access-Control-Allow-Origin", "https://libis.github.io"); + + ArgumentCaptor varyVal = ArgumentCaptor.forClass(String.class); + verify(res).setHeader(eq("Vary"), varyVal.capture()); + assertTrue(varyVal.getValue().contains("Origin")); + verify(chain).doFilter(any(ServletRequest.class), any(ServletResponse.class)); + } + + @Test + void multipleOrigins_echoesMatch_onlyWhenAllowed() throws Exception { + // Comma-separated list as set via JVM options/Microprofile + System.setProperty("dataverse.cors.origin", "https://a.example, https://b.example"); + + CorsFilter sut = new CorsFilter(); + injectSettingsAllowCors(sut, true); + sut.init(null); + + // allowed origin + HttpServletRequest reqAllowed = mock(HttpServletRequest.class); + when(reqAllowed.getHeader("Origin")).thenReturn("https://b.example"); + HttpServletResponse resAllowed = mock(HttpServletResponse.class); + FilterChain chain = mock(FilterChain.class); + + sut.doFilter(reqAllowed, resAllowed, chain); + verify(resAllowed).setHeader("Access-Control-Allow-Origin", "https://b.example"); + verify(resAllowed).setHeader(eq("Vary"), contains("Origin")); + + // not allowed origin -> no ACAO header set + HttpServletRequest reqDenied = mock(HttpServletRequest.class); + when(reqDenied.getHeader("Origin")).thenReturn("https://c.example"); + HttpServletResponse resDenied = mock(HttpServletResponse.class); + + sut.doFilter(reqDenied, resDenied, chain); + verify(resDenied, never()).setHeader(eq("Access-Control-Allow-Origin"), anyString()); + } + + @Test + void sanitizesQuotedHeaderLists() throws Exception { + System.setProperty("dataverse.cors.origin", "https://x.example"); + 
System.setProperty("dataverse.cors.headers.allow", "\"Accept, X-Dataverse-key\""); + System.setProperty("dataverse.cors.headers.expose", "\"Accept-Ranges, Content-Range\""); + System.setProperty("dataverse.cors.methods", "GET, POST, OPTIONS"); + + CorsFilter sut = new CorsFilter(); + injectSettingsAllowCors(sut, true); + sut.init(null); + + HttpServletRequest req = mock(HttpServletRequest.class); + when(req.getHeader("Origin")).thenReturn("https://x.example"); + HttpServletResponse res = mock(HttpServletResponse.class); + + sut.doFilter(req, res, mock(FilterChain.class)); + + verify(res).setHeader("Access-Control-Allow-Headers", "Accept, X-Dataverse-key"); + verify(res).setHeader("Access-Control-Expose-Headers", "Accept-Ranges, Content-Range"); + verify(res).setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS"); + } + + @Test + void disabledCors_skipsHeaders() throws Exception { + // no origin set -> CORS disabled + CorsFilter sut = new CorsFilter(); + sut.init(null); + + HttpServletRequest req = mock(HttpServletRequest.class); + when(req.getHeader("Origin")).thenReturn("https://any.example"); + HttpServletResponse res = mock(HttpServletResponse.class); + + sut.doFilter(req, res, mock(FilterChain.class)); + + verify(res, never()).setHeader(eq("Access-Control-Allow-Origin"), anyString()); + verify(res, never()).setHeader(eq("Access-Control-Allow-Methods"), anyString()); + verify(res, never()).setHeader(eq("Access-Control-Allow-Headers"), anyString()); + verify(res, never()).setHeader(eq("Access-Control-Expose-Headers"), anyString()); + } + + // No-op since filter no longer depends on SettingsServiceBean + private void injectSettingsAllowCors(CorsFilter sut, boolean allowCors) { /* legacy path removed */ } + + private void backupAndClear(String key) { + String old = System.getProperty(key); + if (old != null) { + sysPropsBackup.put(key, old); + } + System.clearProperty(key); + } + + private void restore(String key) { + System.clearProperty(key); + if 
(sysPropsBackup.containsKey(key)) { + System.setProperty(key, sysPropsBackup.get(key)); + } + } +} From 75066c52e5f65ad855efd836c3a5ff2aeaab2f09 Mon Sep 17 00:00:00 2001 From: Eryk Kulikowski Date: Mon, 29 Sep 2025 12:44:33 +0200 Subject: [PATCH 02/29] Centralize CSV parsing (CsvUtil) + CORS origin echo & Vary header improvements --- .../11744-cors-echo-origin-vary.md | 79 ++++++------------ .../source/api/external-tools.rst | 3 + .../source/developers/big-data-support.rst | 9 +++ .../source/installation/config.rst | 25 +++--- .../source/user/dataset-management.rst | 3 + .../iq/dataverse/DatasetFieldServiceBean.java | 9 ++- .../harvard/iq/dataverse/FileMetadata.java | 3 +- .../harvard/iq/dataverse/SettingsWrapper.java | 11 +-- .../edu/harvard/iq/dataverse/api/Admin.java | 3 +- .../harvard/iq/dataverse/api/Datasets.java | 6 +- .../importer/filesystem/FileRecordReader.java | 8 +- .../dataaccess/GlobusAccessibleStore.java | 11 ++- .../dataaccess/GlobusOverlayAccessIO.java | 7 +- .../dataaccess/RemoteOverlayAccessIO.java | 3 +- .../DataCaptureModuleUtil.java | 4 +- .../iq/dataverse/dataset/DatasetUtil.java | 3 +- .../command/impl/CreateDataverseCommand.java | 4 +- .../iq/dataverse/filter/CorsFilter.java | 81 +++++++------------ .../pidproviders/AbstractPidProvider.java | 5 +- .../pidproviders/PidProviderFactoryBean.java | 11 +-- .../iq/dataverse/settings/JvmSettings.java | 18 +++++ .../harvard/iq/dataverse/util/CSLUtil.java | 5 +- .../harvard/iq/dataverse/util/CsvUtil.java | 74 +++++++++++++++++ .../iq/dataverse/util/SystemConfig.java | 6 +- .../LDNAnnounceDatasetVersionStep.java | 4 +- .../export/ddi/DdiExportUtilTest.java | 4 +- .../iq/dataverse/filter/CorsFilterTest.java | 63 ++++++++++++++- .../dataverse/pidproviders/PidUtilTest.java | 2 +- .../iq/dataverse/util/CsvUtilTest.java | 48 +++++++++++ 29 files changed, 349 insertions(+), 163 deletions(-) create mode 100644 src/main/java/edu/harvard/iq/dataverse/util/CsvUtil.java create mode 100644 
src/test/java/edu/harvard/iq/dataverse/util/CsvUtilTest.java diff --git a/doc/release-notes/11744-cors-echo-origin-vary.md b/doc/release-notes/11744-cors-echo-origin-vary.md index d4a39812388..6dad65efb2e 100644 --- a/doc/release-notes/11744-cors-echo-origin-vary.md +++ b/doc/release-notes/11744-cors-echo-origin-vary.md @@ -1,62 +1,33 @@ -# 11744: CORS header handling fixes (echo single Origin, add Vary: Origin, multi-origin allow, sanitization) +# 11744: CORS handling improvements -This branch adjusts the CORS filter so browser clients work correctly when multiple origins are allowed. +Modernizes CORS so browser integrations (previewers, external tools, JS clients) work correctly with multiple origins and proper caching. -## What changed -- Access-Control-Allow-Origin (ACAO) now echoes the single request `Origin` when it matches an allowlist from `dataverse.cors.origin`. -- `Vary: Origin` is added when echoing a specific origin to keep caches correct across different origins. -- Comma-separated origin lists are supported; surrounding quotes in CSV configs are stripped. -- Sanitization is applied to CORS header lists (methods/allow/expose) to avoid quoted values that can break preflight checks. -- Deprecated DB fallback for enabling CORS is removed; CORS is considered enabled only when `dataverse.cors.origin` is set as a JVM options/Microprofile setting. +## Highlights +* Echoes the request origin (`Access-Control-Allow-Origin`) when it matches `dataverse.cors.origin`. +* Adds `Vary: Origin` for per-origin responses (not for wildcard). +* Supports comma‑separated origin list; any `*` in the list = wildcard mode. +* CORS now only enabled when `dataverse.cors.origin` is set (deprecated `:AllowCors` no longer enables it). +* Sanitizes CORS CSV settings (`dataverse.cors.methods`, `dataverse.cors.headers.allow`, `dataverse.cors.headers.expose`). +* Docs updated (Installation, Big Data Support, External Tools, File Previews); new tests cover edge cases. 
-## Upgrade / run notes (non-SQL) -To keep CORS working after pulling this branch: +## Admin Action +Set `dataverse.cors.origin` explicitly (required). Use explicit origins (not `*`) for credentialed requests. Ensure proxies keep `Vary: Origin`. -1) Configure origins as JVM options/Microprofile settings (no quotes): -- Single origin: - - `dataverse.cors.origin=https://example.org` -- Multiple origins (comma-separated): - - `dataverse.cors.origin=https://libis.github.io,https://gdcc.github.io` -- Wildcard: - - `dataverse.cors.origin=*` - - Note: Browsers reject `*` when credentialed requests are used (cookies/Authorization headers). Prefer explicit origins for those cases. - -2) Optional headers/methods lists (unquoted, comma-separated CSV): -- `dataverse.cors.methods` -- `dataverse.cors.headers.allow` -- `dataverse.cors.headers.expose` - -Avoid surrounding values with quotes (e.g., do not use `"Accept, Content-Type"`). Quotes will be stripped but may cause confusion. - -3) If you previously relied on the database setting to enable CORS (deprecated `AllowCors`), set `dataverse.cors.origin` instead. The DB fallback is no longer used. - -4) Reverse proxies/caches: `Vary: Origin` is now emitted. Ensure your proxy does not drop this header. 
- -## Verify -Preflight (replace DV_URL with your base URL): - -```bash -curl -i -X OPTIONS \ - -H "Origin: https://libis.github.io" \ - -H "Access-Control-Request-Method: GET" \ - "${DV_URL}/api/info/version" +Examples: ``` - -Expected: -- `Access-Control-Allow-Origin: https://libis.github.io` -- `Vary: Origin` present - -Actual request: - -```bash -curl -i \ - -H "Origin: https://libis.github.io" \ - "${DV_URL}/api/info/version" +dataverse.cors.origin=https://example.org +dataverse.cors.origin=https://libis.github.io,https://gdcc.github.io +dataverse.cors.origin=* ``` +Optional (unquoted): +``` +dataverse.cors.methods=GET, POST, OPTIONS, PUT, DELETE +``` + +## Compatibility +* Must configure `dataverse.cors.origin`; `:AllowCors` no longer sufficient. +* Any `*` triggers wildcard (no per-origin echo / no Vary header). -Expected: -- Same ACAO echo as above +## Docs +See updated `dataverse.cors.origin` section and related notes in Big Data Support (S3), External Tools, and File Previews. -## Backward compatibility -- Instances relying on the deprecated DB-based CORS enablement must set `dataverse.cors.origin` to keep CORS enabled. -- Quoted CORS configuration values may behave differently; remove quotes going forward. diff --git a/doc/sphinx-guides/source/api/external-tools.rst b/doc/sphinx-guides/source/api/external-tools.rst index ae0e44b36aa..699db13671e 100644 --- a/doc/sphinx-guides/source/api/external-tools.rst +++ b/doc/sphinx-guides/source/api/external-tools.rst @@ -11,6 +11,9 @@ Introduction External tools are additional applications the user can access or open from your Dataverse installation to preview, explore, and manipulate data files and datasets. The term "external" is used to indicate that the tool is not part of the main Dataverse Software. +.. note:: + Browser-based preview or explore tools that make XHR/fetch calls back to the Dataverse API must have CORS explicitly enabled on the Dataverse installation via :ref:`dataverse.cors.origin <dataverse.cors.origin>`.
The legacy ``:AllowCors`` database setting is deprecated and no longer enables CORS by itself. Be sure the origins hosting your tool (or ``*`` when appropriate) are included in ``dataverse.cors.origin``; otherwise requests from your tool will be blocked by the browser even if the tool itself loads correctly. + Once you have created the external tool itself (which is most of the work!), you need to teach a Dataverse installation how to construct URLs that your tool needs to operate. For example, if you've deployed your tool to fabulousfiletool.com your tool might want the ID of a file and the siteUrl of the Dataverse installation like this: https://fabulousfiletool.com?fileId=42&siteUrl=https://demo.dataverse.org In short, you will be creating a manifest in JSON format that describes not only how to construct URLs for your tool, but also what types of files your tool operates on, where it should appear in the Dataverse installation web interfaces, etc. diff --git a/doc/sphinx-guides/source/developers/big-data-support.rst b/doc/sphinx-guides/source/developers/big-data-support.rst index 75a50e2513d..989f511685e 100644 --- a/doc/sphinx-guides/source/developers/big-data-support.rst +++ b/doc/sphinx-guides/source/developers/big-data-support.rst @@ -57,6 +57,15 @@ Allow CORS for S3 Buckets **IMPORTANT:** One additional step that is required to enable direct uploads via a Dataverse installation and for direct download to work with previewers and direct upload to work with dvwebloader (:ref:`folder-upload`) is to allow cross site (CORS) requests on your S3 store. The example below shows how to enable CORS rules (to support upload and download) on a bucket using the AWS CLI command line tool. Note that you may want to limit the AllowedOrigins and/or AllowedHeaders further. https://github.com/gdcc/dataverse-previewers/wiki/Using-Previewers-with-download-redirects-from-S3 has some additional information about doing this. 
+Dataverse itself will only emit the necessary ``Access-Control-*`` headers to browsers when CORS has been explicitly enabled via the JVM/MicroProfile setting :ref:`dataverse.cors.origin <dataverse.cors.origin>`. The legacy database setting ``:AllowCors`` no longer turns CORS on. You must both: + +* Configure an appropriate ``dataverse.cors.origin`` value (single origin, comma-separated list, or ``*``) on the Dataverse application server; and +* Configure a matching/compatible CORS policy on each S3 bucket (and any CDN/proxy in front of it) that will be used for direct upload or for redirect (download-redirect) operations consumed by previewers. + +If you specify multiple origins in ``dataverse.cors.origin``, Dataverse will echo back the requesting origin (when it matches) and will include ``Vary: Origin`` so that shared caches do not serve one origin's response to another. If you configure ``*``, Dataverse will respond with ``Access-Control-Allow-Origin: *`` (note that browsers will not allow credentialed requests with a wildcard). + +Make sure the bucket CORS configuration ``AllowedOrigins`` is at least as permissive as the origins you configure in ``dataverse.cors.origin``. If the bucket allows ``*`` but the Dataverse application only allows a subset, the browser will still enforce the more restrictive application response. + If you'd like to check the CORS configuration on your bucket before making changes: ``aws s3api get-bucket-cors --bucket `` diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 67621e5eb8c..e0835f4b3b7 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -3667,10 +3667,9 @@ The following settings control Cross-Origin Resource Sharing (CORS) for your Dat dataverse.cors.origin +++++++++++++++++++++ -Allowed origins for CORS requests. The default with no value set is to not include CORS headers.
However, if the deprecated :AllowCors setting is explicitly set to true the default is "\*" (all origins). -When the :AllowsCors setting is not used, you must set this setting to "\*" or a list of origins to enable CORS headers. +Allowed origins for CORS requests. If this setting is not defined, CORS headers are not added. Set to ``*`` to allow all origins (note that browsers will not allow credentialed requests with ``*``) or provide a comma-separated list of explicit origins. -Multiple origins can be specified as a comma-separated list. +Multiple origins can be specified as a comma-separated list (whitespace is ignored): Example: @@ -3678,6 +3677,12 @@ Example: Can also be set via any `supported MicroProfile Config API source`_, e.g. the environment variable ``DATAVERSE_CORS_ORIGIN``. +Behavior: + +* When a list of origins is configured, Dataverse echoes the single matching request ``Origin`` value in ``Access-Control-Allow-Origin`` and adds ``Vary: Origin`` to support correct proxy/CDN caching. +* When ``*`` is configured, ``Access-Control-Allow-Origin: *`` is sent and ``Vary`` is not modified. +* The legacy database setting ``:AllowCors`` is deprecated and no longer enables CORS automatically; you must configure ``dataverse.cors.origin``. + .. _dataverse.cors.methods: dataverse.cors.methods @@ -4917,19 +4922,17 @@ This can be helpful in situations where multiple organizations are sharing one D or ``curl -X PUT -d '*' http://localhost:8080/api/admin/settings/:InheritParentRoleAssignments`` -:AllowCors (Deprecated) -+++++++++++++++++++++++ +:AllowCors (Deprecated – no longer used once dataverse.cors.* settings exist) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ .. note:: - This setting is deprecated. Please use the JVM settings above instead. - This legacy setting will only be used if the newer JVM settings are not set. + This legacy database setting has been superseded by the ``dataverse.cors.*`` JVM/MicroProfile settings. 
In current versions CORS is only enabled when ``dataverse.cors.origin`` is explicitly set. Existing values of ``:AllowCors`` are ignored if ``dataverse.cors.origin`` is unset. -Enable or disable support for Cross-Origin Resource Sharing (CORS) by setting ``:AllowCors`` to ``true`` or ``false``. +Historical behavior (prior versions) allowed setting ``:AllowCors`` to ``true``/``false``. Administrators should migrate to the JVM/MicroProfile setting: -``curl -X PUT -d true http://localhost:8080/api/admin/settings/:AllowCors`` +``./asadmin create-jvm-options '-Ddataverse.cors.origin=*'`` -.. note:: - New values for this setting will only be used after a server restart. +or a comma-separated list of allowed origins. :ChronologicalDateFacets ++++++++++++++++++++++++ diff --git a/doc/sphinx-guides/source/user/dataset-management.rst b/doc/sphinx-guides/source/user/dataset-management.rst index d73459969ce..2961242ca77 100755 --- a/doc/sphinx-guides/source/user/dataset-management.rst +++ b/doc/sphinx-guides/source/user/dataset-management.rst @@ -175,6 +175,9 @@ File Previews Dataverse installations can add previewers for common file types uploaded by their research communities. The previews appear on the file page. If a preview tool for a specific file type is available, the preview will be created and will display automatically, after terms have been agreed to or a guestbook entry has been made, if necessary. File previews are not available for restricted files unless they are being accessed using a Preview URL. See also :ref:`previewUrl`. When the dataset license is not the default license, users will be prompted to accept the license/data use agreement before the preview is shown. See also :ref:`license-terms`. +.. note:: + Some previewers run purely in the browser and make direct (JavaScript) requests back to the Dataverse API endpoints to retrieve file contents, metadata, or signed URLs. 
For these previewers to function when hosted on a different origin (e.g., a CDN or a separate previewer service), the Dataverse installation must have CORS enabled via :ref:`dataverse.cors.origin <dataverse.cors.origin>`. Administrators should configure the list of allowed origins to include the host serving the previewers. The deprecated ``:AllowCors`` database setting no longer enables CORS. + Previewers are available for the following file types: - Text diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java index dce7a98fd75..59543c5476a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java @@ -52,6 +52,7 @@ import org.apache.http.protocol.HttpContext; import org.apache.http.util.EntityUtils; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; +import edu.harvard.iq.dataverse.util.CsvUtil; /** * @@ -853,12 +854,12 @@ public String getFieldLanguage(String languages, String localeCode) { // If the fields list of supported languages contains the current locale (e.g. // the lang of the UI, or the current metadata input/display lang (tbd)), use // that.
Otherwise, return the first in the list - String[] langStrings = languages.split("\\s*,\\s*"); - if (langStrings.length > 0) { - if (Arrays.asList(langStrings).contains(localeCode)) { + final List langStrings = CsvUtil.split(languages); + if (!langStrings.isEmpty()) { + if (langStrings.contains(localeCode)) { return localeCode; } else { - return langStrings[0]; + return langStrings.get(0); } } return null; diff --git a/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java b/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java index ca3e2d67263..bbc48feef45 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java @@ -48,6 +48,7 @@ import edu.harvard.iq.dataverse.datavariable.DataVariable; import edu.harvard.iq.dataverse.datavariable.VarGroup; import edu.harvard.iq.dataverse.datavariable.VariableMetadata; +import edu.harvard.iq.dataverse.util.CsvUtil; import edu.harvard.iq.dataverse.util.DateUtil; import edu.harvard.iq.dataverse.util.StringUtil; import java.util.HashSet; @@ -609,7 +610,7 @@ public int compare(FileMetadata o1, FileMetadata o2) { public static void setCategorySortOrder(String categories) { categoryMap=new HashMap(); long i=1; - for(String cat: categories.split(",\\s*")) { + for(String cat: CsvUtil.split(categories)) { categoryMap.put(cat.toUpperCase(), i); i++; } diff --git a/src/main/java/edu/harvard/iq/dataverse/SettingsWrapper.java b/src/main/java/edu/harvard/iq/dataverse/SettingsWrapper.java index 3ff27699379..df021c402a4 100644 --- a/src/main/java/edu/harvard/iq/dataverse/SettingsWrapper.java +++ b/src/main/java/edu/harvard/iq/dataverse/SettingsWrapper.java @@ -14,7 +14,7 @@ import edu.harvard.iq.dataverse.settings.SettingsServiceBean; import edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key; import edu.harvard.iq.dataverse.util.BundleUtil; -import edu.harvard.iq.dataverse.util.MailUtil; +import edu.harvard.iq.dataverse.util.CsvUtil; import 
edu.harvard.iq.dataverse.util.StringUtil; import edu.harvard.iq.dataverse.util.SystemConfig; import edu.harvard.iq.dataverse.UserNotification.Type; @@ -396,7 +396,7 @@ public boolean isRsyncOnly() { if (uploadMethods==null){ rsyncOnly = false; } else { - rsyncOnly = Arrays.asList(uploadMethods.toLowerCase().split("\\s*,\\s*")).size() == 1 && uploadMethods.toLowerCase().equals(SystemConfig.FileUploadMethods.RSYNC.toString()); + rsyncOnly = CsvUtil.split(uploadMethods).size() == 1 && uploadMethods.toLowerCase().equals(SystemConfig.FileUploadMethods.RSYNC.toString()); } } } @@ -428,7 +428,7 @@ public Integer getUploadMethodsCount() { if (uploadMethods==null){ uploadMethodsCount = 0; } else { - uploadMethodsCount = Arrays.asList(uploadMethods.toLowerCase().split("\\s*,\\s*")).size(); + uploadMethodsCount = CsvUtil.split(uploadMethods).size(); } } return uploadMethodsCount; @@ -502,7 +502,8 @@ public boolean shouldBeAnonymized(DatasetField df) { if (anonymizedFieldTypes == null) { anonymizedFieldTypes = new ArrayList(); String names = get(SettingsServiceBean.Key.AnonymizedFieldTypeNames.toString(), ""); - anonymizedFieldTypes.addAll(Arrays.asList(names.split(",\\s"))); + // Use CsvUtil for consistent CSV parsing instead of raw regex split + anonymizedFieldTypes.addAll(CsvUtil.split(names)); } return anonymizedFieldTypes.contains(df.getDatasetFieldType().getName()); } @@ -830,7 +831,7 @@ private Boolean getUploadMethodAvailable(String method){ if (uploadMethods==null){ return false; } else { - return Arrays.asList(uploadMethods.toLowerCase().split("\\s*,\\s*")).contains(method); + return CsvUtil.splitToLowerCaseSet(uploadMethods).contains(method); } } diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Admin.java b/src/main/java/edu/harvard/iq/dataverse/api/Admin.java index ac52b5d9fbf..20fa7edbb25 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Admin.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Admin.java @@ -111,6 +111,7 @@ import 
edu.harvard.iq.dataverse.userdata.UserListResult; import edu.harvard.iq.dataverse.util.ArchiverUtil; import edu.harvard.iq.dataverse.util.BundleUtil; +import edu.harvard.iq.dataverse.util.CsvUtil; import edu.harvard.iq.dataverse.util.FileUtil; import edu.harvard.iq.dataverse.util.SystemConfig; import edu.harvard.iq.dataverse.util.URLTokenUtil; @@ -2167,7 +2168,7 @@ public Response addRoleAssignementsToChildren(@Context ContainerRequestContext c boolean inheritAllRoles = false; String rolesString = settingsSvc.getValueForKey(SettingsServiceBean.Key.InheritParentRoleAssignments, ""); if (rolesString.length() > 0) { - ArrayList rolesToInherit = new ArrayList(Arrays.asList(rolesString.split("\\s*,\\s*"))); + ArrayList rolesToInherit = new ArrayList<>(CsvUtil.split(rolesString)); if (!rolesToInherit.isEmpty()) { if (rolesToInherit.contains("*")) { inheritAllRoles = true; diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index 729174dedfc..614c869823e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ -5222,7 +5222,8 @@ public Response getPrivateUrlDatasetVersion(@PathParam("privateUrlToken") String } JsonObjectBuilder responseJson; if (isAnonymizedAccess) { - List anonymizedFieldTypeNamesList = new ArrayList<>(Arrays.asList(anonymizedFieldTypeNames.split(",\\s"))); + // Use CsvUtil for consistent CSV parsing + List anonymizedFieldTypeNamesList = new ArrayList<>(CsvUtil.split(anonymizedFieldTypeNames)); responseJson = json(dsv, anonymizedFieldTypeNamesList, true, returnOwners); } else { responseJson = json(dsv, null, true, returnOwners); @@ -5248,7 +5249,8 @@ public Response getPreviewUrlDatasetVersion(@PathParam("previewUrlToken") String } JsonObjectBuilder responseJson; if (isAnonymizedAccess) { - List anonymizedFieldTypeNamesList = new ArrayList<>(Arrays.asList(anonymizedFieldTypeNames.split(",\\s"))); + // 
Use CsvUtil for consistent CSV parsing + List anonymizedFieldTypeNamesList = new ArrayList<>(CsvUtil.split(anonymizedFieldTypeNames)); responseJson = json(dsv, anonymizedFieldTypeNamesList, true, returnOwners); } else { responseJson = json(dsv, null, true, returnOwners); diff --git a/src/main/java/edu/harvard/iq/dataverse/batch/jobs/importer/filesystem/FileRecordReader.java b/src/main/java/edu/harvard/iq/dataverse/batch/jobs/importer/filesystem/FileRecordReader.java index 9ce30683a87..c6744922a6d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/batch/jobs/importer/filesystem/FileRecordReader.java +++ b/src/main/java/edu/harvard/iq/dataverse/batch/jobs/importer/filesystem/FileRecordReader.java @@ -25,6 +25,8 @@ import edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser; import edu.harvard.iq.dataverse.batch.jobs.importer.ImportMode; import edu.harvard.iq.dataverse.settings.JvmSettings; +import edu.harvard.iq.dataverse.util.CsvUtil; + import org.apache.commons.io.filefilter.NotFileFilter; import org.apache.commons.io.filefilter.WildcardFileFilter; @@ -43,7 +45,6 @@ import java.io.FileFilter; import java.io.Serializable; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -153,7 +154,10 @@ public File readItem() { */ private List getFiles(final File directory) { // create filter from job xml excludes property - FileFilter excludeFilter = new NotFileFilter(new WildcardFileFilter(Arrays.asList(excludes.split("\\s*,\\s*")))); + // Convert list to array to use non-deprecated constructor + FileFilter excludeFilter = new NotFileFilter(new WildcardFileFilter( + CsvUtil.split(excludes).toArray(new String[0]) + )); List files = new ArrayList<>(); File[] filesList = directory.listFiles(excludeFilter); if (filesList != null) { diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/GlobusAccessibleStore.java 
b/src/main/java/edu/harvard/iq/dataverse/dataaccess/GlobusAccessibleStore.java index 8bed60d8302..0bf9f85a934 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/GlobusAccessibleStore.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/GlobusAccessibleStore.java @@ -1,5 +1,8 @@ package edu.harvard.iq.dataverse.dataaccess; +import java.util.List; + +import edu.harvard.iq.dataverse.util.CsvUtil; import jakarta.json.Json; import jakarta.json.JsonArray; import jakarta.json.JsonArrayBuilder; @@ -38,10 +41,12 @@ public static String getTransferPath(String driverId) { } public static JsonArray getReferenceEndpointsWithPaths(String driverId) { - String[] endpoints = StorageIO.getConfigParamForDriver(driverId, AbstractRemoteOverlayAccessIO.REFERENCE_ENDPOINTS_WITH_BASEPATHS).split("\\s*,\\s*"); + List endpoints = CsvUtil.split( + StorageIO.getConfigParamForDriver(driverId, AbstractRemoteOverlayAccessIO.REFERENCE_ENDPOINTS_WITH_BASEPATHS) + ); JsonArrayBuilder builder = Json.createArrayBuilder(); - for(int i=0;i rolesToInherit = new ArrayList(Arrays.asList(rolesString.split("\\s*,\\s*"))); + ArrayList rolesToInherit = new ArrayList<>(CsvUtil.split(rolesString)); if (rolesString.length() > 0) { if (!rolesToInherit.isEmpty()) { if (rolesToInherit.contains("*")) { diff --git a/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java b/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java index f564ffb18e1..437abc5913a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java +++ b/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java @@ -5,30 +5,37 @@ import jakarta.servlet.http.HttpServletRequest; import jakarta.servlet.http.HttpServletResponse; import java.io.IOException; -import java.util.Arrays; import java.util.Collections; -import java.util.HashSet; import java.util.Set; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; import java.util.stream.Collectors; import 
edu.harvard.iq.dataverse.settings.JvmSettings; +import edu.harvard.iq.dataverse.util.CsvUtil; /** - * CorsFilter is a servlet filter that handles Cross-Origin Resource Sharing (CORS) for the Dataverse application. - * It configures and applies CORS headers to HTTP responses based on application settings. + * CorsFilter is a servlet filter that handles Cross-Origin Resource Sharing + * (CORS) for the Dataverse application. + * It configures and applies CORS headers to HTTP responses based on application + * settings. * * This filter: - * 1. Reads CORS configuration from JVM options/Microprofile settings (e.g. dataverse.cors.*). + * 1. Reads CORS configuration from JVM options/Microprofile settings (e.g. + * dataverse.cors.*). * 2. Determines whether CORS should be allowed based on these settings. - * 3. If CORS is allowed, it adds the appropriate CORS headers to all HTTP responses. The JvmSettings allow customization of the header contents if desired. + * 3. If CORS is allowed, it adds the appropriate CORS headers to all HTTP + * responses. The JvmSettings allow customization of the header contents if + * desired. * * The filter is applied to all paths ("/*") in the application. 
*/ @WebFilter("/*") public class CorsFilter implements Filter { + private boolean allowCors; - private String origin; // raw configured origin value private String methods; private String allowHeaders; private String exposeHeaders; @@ -37,35 +44,29 @@ public class CorsFilter implements Filter { @Override public void init(FilterConfig filterConfig) throws ServletException { - origin = sanitize(JvmSettings.CORS_ORIGIN.lookupOptional().orElse(null)); - allowCors = origin != null && !origin.trim().isEmpty(); + // Parse allowed origins list (optional) + List originTokens = JvmSettings.CORS_ORIGIN.lookupCsvList(); + allowCors = !originTokens.isEmpty(); if (allowCors) { - methods = sanitizeCsv(JvmSettings.CORS_METHODS.lookupOptional().orElse("GET, POST, OPTIONS, PUT, DELETE")); - allowHeaders = sanitizeCsv(JvmSettings.CORS_ALLOW_HEADERS.lookupOptional() - .orElse("Accept, Content-Type, X-Dataverse-key, Range")); - exposeHeaders = sanitizeCsv(JvmSettings.CORS_EXPOSE_HEADERS.lookupOptional() - .orElse("Accept-Ranges, Content-Range, Content-Encoding")); - - // Initialize allowed origins (documented as comma-separated list) - String configured = origin != null ? 
origin.trim() : null; - if (configured == null || configured.isEmpty() || "*".equals(configured)) { + // '*' anywhere means all origins + if (originTokens.contains("*")) { allowAllOrigins = true; allowedOrigins = Collections.emptySet(); } else { - // Parse configured origins; code is tolerant of whitespace but - // docs recommend a comma-separated list (no quotes) - allowedOrigins = Arrays.stream(configured.split(",")) - .flatMap(s -> Arrays.stream(s.split("[\n\r\t ]+"))) - .map(String::trim) - .filter(s -> !s.isEmpty()) + allowedOrigins = originTokens.stream().map(CsvUtil::sanitize) .collect(Collectors.toCollection(HashSet::new)); - // handle a single '"*"' that might slip through incorrectly - if (allowedOrigins.size() == 1 && allowedOrigins.contains("*")) { - allowAllOrigins = true; - allowedOrigins = Collections.emptySet(); - } } + + methods = JvmSettings.CORS_METHODS.lookupCsvListOptional() + .map(l -> String.join(", ", l)) + .orElse("GET, POST, OPTIONS, PUT, DELETE"); + allowHeaders = JvmSettings.CORS_ALLOW_HEADERS.lookupCsvListOptional() + .map(l -> String.join(", ", l)) + .orElse("Accept, Content-Type, X-Dataverse-key, Range"); + exposeHeaders = JvmSettings.CORS_EXPOSE_HEADERS.lookupCsvListOptional() + .map(l -> String.join(", ", l)) + .orElse("Accept-Ranges, Content-Range, Content-Encoding"); } } @@ -76,7 +77,7 @@ public void doFilter(ServletRequest servletRequest, ServletResponse servletRespo HttpServletRequest request = (HttpServletRequest) servletRequest; HttpServletResponse response = (HttpServletResponse) servletResponse; - String requestOrigin = sanitize(request.getHeader("Origin")); + String requestOrigin = CsvUtil.sanitize(request.getHeader("Origin")); // Decide ACAO value if (allowAllOrigins) { @@ -95,26 +96,6 @@ public void doFilter(ServletRequest servletRequest, ServletResponse servletRespo chain.doFilter(servletRequest, servletResponse); } - /** Remove surrounding quotes and collapse internal quotes. 
*/ - private String sanitize(String value) { - if (value == null) return null; - String v = value.trim(); - if (v.length() >= 2 && v.startsWith("\"") && v.endsWith("\"")) { - v = v.substring(1, v.length() - 1); - } - return v.replace("\"", "").trim(); - } - - /** Remove quotes from CSV-like header value and trim spaces around commas. */ - private String sanitizeCsv(String value) { - String v = sanitize(value); - // Normalize separators and trim tokens - return Arrays.stream(v.split(",")) - .map(String::trim) - .filter(s -> !s.isEmpty()) - .collect(Collectors.joining(", ")); - } - private String appendVary(String existing, String token) { if (existing == null || existing.isEmpty()) { return token; diff --git a/src/main/java/edu/harvard/iq/dataverse/pidproviders/AbstractPidProvider.java b/src/main/java/edu/harvard/iq/dataverse/pidproviders/AbstractPidProvider.java index 469b3505165..720cde40275 100644 --- a/src/main/java/edu/harvard/iq/dataverse/pidproviders/AbstractPidProvider.java +++ b/src/main/java/edu/harvard/iq/dataverse/pidproviders/AbstractPidProvider.java @@ -7,6 +7,7 @@ import edu.harvard.iq.dataverse.DatasetVersion; import edu.harvard.iq.dataverse.DvObject; import edu.harvard.iq.dataverse.GlobalId; +import edu.harvard.iq.dataverse.util.CsvUtil; import edu.harvard.iq.dataverse.util.SystemConfig; import jakarta.json.Json; import jakarta.json.JsonObject; @@ -61,10 +62,10 @@ protected AbstractPidProvider(String id, String label, String protocol, String a this.identifierGenerationStyle = identifierGenerationStyle; this.datafilePidFormat = datafilePidFormat; if(!managedList.isEmpty()) { - this.managedSet.addAll(Arrays.asList(managedList.split(",\\s"))); + this.managedSet.addAll(CsvUtil.split(managedList)); } if(!excludedList.isEmpty()) { - this.excludedSet.addAll(Arrays.asList(excludedList.split(",\\s"))); + this.excludedSet.addAll(CsvUtil.split(excludedList)); } if (logger.isLoggable(Level.FINE)) { Iterator iter = managedSet.iterator(); diff --git 
a/src/main/java/edu/harvard/iq/dataverse/pidproviders/PidProviderFactoryBean.java b/src/main/java/edu/harvard/iq/dataverse/pidproviders/PidProviderFactoryBean.java index 1bd49bc7f6e..66f936a7b21 100644 --- a/src/main/java/edu/harvard/iq/dataverse/pidproviders/PidProviderFactoryBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/pidproviders/PidProviderFactoryBean.java @@ -12,7 +12,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.NoSuchElementException; import java.util.Optional; import java.util.ServiceLoader; import java.util.logging.Level; @@ -23,11 +22,9 @@ import jakarta.ejb.Singleton; import jakarta.ejb.Startup; import jakarta.inject.Inject; -import jakarta.json.JsonObject; import edu.harvard.iq.dataverse.settings.JvmSettings; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; import edu.harvard.iq.dataverse.util.SystemConfig; -import edu.harvard.iq.dataverse.DatasetFieldServiceBean; import edu.harvard.iq.dataverse.DataverseServiceBean; import edu.harvard.iq.dataverse.DvObjectServiceBean; import edu.harvard.iq.dataverse.GlobalId; @@ -121,14 +118,12 @@ private void loadProviderFactories() { } private void loadProviders() { - Optional providers = JvmSettings.PID_PROVIDERS.lookupOptional(String[].class); - if (!providers.isPresent()) { + var providersOpt = JvmSettings.PID_PROVIDERS.lookupCsvListOptional(); + if (!providersOpt.isPresent() || providersOpt.get().isEmpty()) { logger.warning( "No PidProviders configured via dataverse.pid.providers. 
Please consider updating as older PIDProvider configuration mechanisms will be removed in a future version of Dataverse."); } else { - for (String id : providers.get()) { - //Allows spaces in PID_PROVIDERS setting - id=id.trim(); + for (String id : providersOpt.get()) { Optional type = JvmSettings.PID_PROVIDER_TYPE.lookupOptional(id); if (!type.isPresent()) { logger.warning("PidProvider " + id diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index 4cdb67955a0..eabc0d57212 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -2,6 +2,8 @@ import org.eclipse.microprofile.config.ConfigProvider; +import edu.harvard.iq.dataverse.util.CsvUtil; + import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -599,4 +601,20 @@ public String insert(String... arguments) { return String.format(this.getScopedKey(), (Object[]) arguments); } + /** Lookup optional CSV value and return immutable List of tokens. */ + public java.util.Optional> lookupCsvListOptional() { + return lookupOptional().map(CsvUtil::split); + } + /** Lookup required CSV value and return immutable List of tokens (throws if missing). */ + public java.util.List lookupCsvList() { + return CsvUtil.split(lookup()); + } + /** Lookup optional CSV value and return lowercased Set (deduplicated, insertion order). */ + public java.util.Optional> lookupCsvLowercaseSetOptional() { + return lookupOptional().map(CsvUtil::splitToLowerCaseSet); + } + /** Lookup required CSV value and return lowercased Set (deduplicated, insertion order). 
*/ + public java.util.Set lookupCsvLowercaseSet() { + return CsvUtil.splitToLowerCaseSet(lookup()); + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/CSLUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/CSLUtil.java index fe9e00bd837..cf26676a403 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/CSLUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/CSLUtil.java @@ -119,8 +119,9 @@ public static String getCitationFormat(String styleName) throws IOException { private static String[] getCommonStyles() { if (commonStyles == null) { - commonStyles = JvmSettings.CSL_COMMON_STYLES.lookupOptional().orElse("chicago-author-date, ieee") - .split("\\s*,\\s*"); + commonStyles = CsvUtil.split( + JvmSettings.CSL_COMMON_STYLES.lookupOptional().orElse("chicago-author-date, ieee") + ).toArray(new String[0]); } return commonStyles; } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/CsvUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/CsvUtil.java new file mode 100644 index 00000000000..3a1f85dda5f --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/util/CsvUtil.java @@ -0,0 +1,74 @@ +package edu.harvard.iq.dataverse.util; + +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +/** + * Lightweight helpers for simple comma separated configuration values (NOT full + * RFC 4180 parsing). + * Intended for admin-entered settings where tokens are simple identifiers + * (origins, methods, headers, etc.). + */ +public final class CsvUtil { + /** Shared split regex allowing arbitrary surrounding whitespace. */ + public static final String COMMA_BETWEEN_OPTIONAL_WHITE_SPACE = "\\s*,\\s*"; + private static final Pattern COMMA_WS = Pattern.compile(COMMA_BETWEEN_OPTIONAL_WHITE_SPACE); + + private CsvUtil() { + } + + /** Strip outer quotes, remove remaining quotes, trim. 
*/ + public static String sanitize(String raw) { + if (raw == null) + return null; + String v = raw.trim(); + if (v.length() >= 2 && v.startsWith("\"") && v.endsWith("\"")) { + v = v.substring(1, v.length() - 1); + } + return v.replace("\"", "").trim(); + } + + /** Split into an ordered immutable list of sanitized non-empty tokens. */ + public static List split(String rawCsv) { + String sanitized = sanitize(rawCsv); + if (sanitized == null || sanitized.isEmpty()) { + return Collections.emptyList(); + } + return Arrays.stream(COMMA_WS.split(sanitized)) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .map(CsvUtil::sanitize) + .collect(Collectors.toUnmodifiableList()); + } + + /** Split into a de-duplicated (in insertion order) set of sanitized tokens. */ + public static Set splitToSet(String rawCsv) { + List list = split(rawCsv); + if (list.isEmpty()) + return Collections.emptySet(); + return list.stream().collect(Collectors.toCollection(LinkedHashSet::new)); + } + + /** Canonical normalized CSV string (tokens joined by ', '). */ + public static String normalize(String rawCsv) { + List list = split(rawCsv); + if (list.isEmpty()) + return ""; + return String.join(", ", list); + } + + /** Convenience: split into a lowercase insertion-ordered set (dedup, case-fold). 
*/ + public static Set splitToLowerCaseSet(String rawCsv) { + if (rawCsv == null || rawCsv.trim().isEmpty()) { + return Collections.emptySet(); + } + return split(rawCsv).stream() + .map(String::toLowerCase) + .collect(Collectors.toCollection(LinkedHashSet::new)); + } +} diff --git a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java index c80e206ec69..8e926c85652 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java @@ -937,7 +937,7 @@ public boolean isRsyncOnly(){ if (uploadMethods==null){ return false; } else { - return Arrays.asList(uploadMethods.toLowerCase().split("\\s*,\\s*")).size() == 1 && uploadMethods.toLowerCase().equals(SystemConfig.FileUploadMethods.RSYNC.toString()); + return CsvUtil.split(uploadMethods).size() == 1 && uploadMethods.toLowerCase().equals(SystemConfig.FileUploadMethods.RSYNC.toString()); } } @@ -969,7 +969,7 @@ private Boolean getMethodAvailable(String method, boolean upload) { if (methods == null) { return false; } else { - return Arrays.asList(methods.toLowerCase().split("\\s*,\\s*")).contains(method); + return CsvUtil.split(methods.toLowerCase()).contains(method); } } @@ -978,7 +978,7 @@ public Integer getUploadMethodCount(){ if (uploadMethods==null){ return 0; } else { - return Arrays.asList(uploadMethods.toLowerCase().split("\\s*,\\s*")).size(); + return CsvUtil.split(uploadMethods).size(); } } diff --git a/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/LDNAnnounceDatasetVersionStep.java b/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/LDNAnnounceDatasetVersionStep.java index 124eea801d9..666bee01612 100644 --- a/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/LDNAnnounceDatasetVersionStep.java +++ b/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/LDNAnnounceDatasetVersionStep.java @@ -5,6 +5,7 @@ import 
edu.harvard.iq.dataverse.DatasetFieldType; import edu.harvard.iq.dataverse.DatasetVersion; import edu.harvard.iq.dataverse.branding.BrandingUtil; +import edu.harvard.iq.dataverse.util.CsvUtil; import edu.harvard.iq.dataverse.util.SystemConfig; import edu.harvard.iq.dataverse.util.bagit.OREMap; import edu.harvard.iq.dataverse.util.json.JsonLDTerm; @@ -124,8 +125,7 @@ HttpPost buildAnnouncement(boolean qb, WorkflowContext ctxt, JsonObject target) DatasetVersion dv = ctxt.getDataset().getReleasedVersion(); List dvf = dv.getDatasetFields(); Map fields = new HashMap(); - String[] requiredFields = ((String) ctxt.getSettings().getOrDefault(REQUIRED_FIELDS, "")).split(",\\s*"); - for (String field : requiredFields) { + for (String field : CsvUtil.split((String) ctxt.getSettings().getOrDefault(REQUIRED_FIELDS, ""))) { fields.put(field, null); } Set reqFields = fields.keySet(); diff --git a/src/test/java/edu/harvard/iq/dataverse/export/ddi/DdiExportUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/export/ddi/DdiExportUtilTest.java index 360e9dfbafe..040f2e20a1d 100644 --- a/src/test/java/edu/harvard/iq/dataverse/export/ddi/DdiExportUtilTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/export/ddi/DdiExportUtilTest.java @@ -19,8 +19,6 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -93,7 +91,7 @@ public static void setUpClass() throws Exception { PidUtil.clearPidProviders(); //Read list of providers to add - List providers = Arrays.asList(JvmSettings.PID_PROVIDERS.lookup().split(",\\s")); + List providers = JvmSettings.PID_PROVIDERS.lookupCsvList(); //Iterate through the list of providers and add them using the PidProviderFactory of the appropriate type for (String providerId : providers) { System.out.println("Loading provider: " + providerId); diff --git 
a/src/test/java/edu/harvard/iq/dataverse/filter/CorsFilterTest.java b/src/test/java/edu/harvard/iq/dataverse/filter/CorsFilterTest.java index 2bbcb995362..bb1a7eac781 100644 --- a/src/test/java/edu/harvard/iq/dataverse/filter/CorsFilterTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/filter/CorsFilterTest.java @@ -14,6 +14,11 @@ import java.util.Map; import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.argThat; +import static org.mockito.ArgumentMatchers.contains; +import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.*; class CorsFilterTest { @@ -110,6 +115,61 @@ void multipleOrigins_echoesMatch_onlyWhenAllowed() throws Exception { verify(resDenied, never()).setHeader(eq("Access-Control-Allow-Origin"), anyString()); } + @Test + void whitespaceAndMixedCasingParsing() throws Exception { + System.setProperty("dataverse.cors.origin", + " https://one.example ,\n\t https://two.example , https://three.example "); + + CorsFilter sut = new CorsFilter(); + sut.init(null); + + HttpServletRequest req = mock(HttpServletRequest.class); + when(req.getHeader("Origin")).thenReturn("https://two.example"); + HttpServletResponse res = mock(HttpServletResponse.class); + when(res.getHeader("Vary")).thenReturn("Accept-Encoding"); + + sut.doFilter(req, res, mock(FilterChain.class)); + + verify(res).setHeader("Access-Control-Allow-Origin", "https://two.example"); + // ensure existing Vary preserved and Origin added + verify(res).setHeader(eq("Vary"), argThat(v -> v.contains("Origin") && v.contains("Accept-Encoding"))); + } + + @Test + void wildcardAmongOthersTreatsAsWildcard() throws Exception { + System.setProperty("dataverse.cors.origin", "https://a.example,*,https://b.example"); + + CorsFilter sut = new CorsFilter(); + sut.init(null); + + HttpServletRequest req = mock(HttpServletRequest.class); + 
when(req.getHeader("Origin")).thenReturn("https://random.example"); + HttpServletResponse res = mock(HttpServletResponse.class); + + sut.doFilter(req, res, mock(FilterChain.class)); + + verify(res).setHeader("Access-Control-Allow-Origin", "*"); + verify(res, never()).setHeader(eq("Vary"), anyString()); + } + + @Test + void existingVaryMergedWithoutDuplication() throws Exception { + System.setProperty("dataverse.cors.origin", "https://merge.example"); + + CorsFilter sut = new CorsFilter(); + sut.init(null); + + HttpServletRequest req = mock(HttpServletRequest.class); + when(req.getHeader("Origin")).thenReturn("https://merge.example"); + HttpServletResponse res = mock(HttpServletResponse.class); + when(res.getHeader("Vary")).thenReturn("Accept-Encoding, Origin"); + + sut.doFilter(req, res, mock(FilterChain.class)); + + // Origin should not be duplicated + verify(res).setHeader(eq("Vary"), argThat(v -> v.indexOf("Origin") == v.lastIndexOf("Origin"))); + } + @Test void sanitizesQuotedHeaderLists() throws Exception { System.setProperty("dataverse.cors.origin", "https://x.example"); @@ -151,7 +211,8 @@ void disabledCors_skipsHeaders() throws Exception { } // No-op since filter no longer depends on SettingsServiceBean - private void injectSettingsAllowCors(CorsFilter sut, boolean allowCors) { /* legacy path removed */ } + private void injectSettingsAllowCors(CorsFilter sut, boolean allowCors) { + /* legacy path removed */ } private void backupAndClear(String key) { String old = System.getProperty(key); diff --git a/src/test/java/edu/harvard/iq/dataverse/pidproviders/PidUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/pidproviders/PidUtilTest.java index 2058de1d6c2..29c9052d631 100644 --- a/src/test/java/edu/harvard/iq/dataverse/pidproviders/PidUtilTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/pidproviders/PidUtilTest.java @@ -153,7 +153,7 @@ public static void setUpClass() throws Exception { PidUtil.clearPidProviders(); //Read list of providers to add - 
List providers = Arrays.asList(JvmSettings.PID_PROVIDERS.lookup().split(",\\s")); + List providers = JvmSettings.PID_PROVIDERS.lookupCsvList(); //Iterate through the list of providers and add them using the PidProviderFactory of the appropriate type for (String providerId : providers) { System.out.println("Loading provider: " + providerId); diff --git a/src/test/java/edu/harvard/iq/dataverse/util/CsvUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/util/CsvUtilTest.java new file mode 100644 index 00000000000..9c1264bf28f --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/util/CsvUtilTest.java @@ -0,0 +1,48 @@ +package edu.harvard.iq.dataverse.util; + +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.*; + +class CsvUtilTest { + + @Test + @DisplayName("split handles whitespace, empty tokens and quotes") + void testSplitBasic() { + List tokens = CsvUtil.split(" a , b, \"c\" , , d "); + assertEquals(List.of("a", "b", "c", "d"), tokens); + } + + @Test + @DisplayName("normalize produces canonical comma+space joined list") + void testNormalize() { + assertEquals("a, b, c", CsvUtil.normalize("a,b, c")); + assertEquals("", CsvUtil.normalize(null)); + assertEquals("", CsvUtil.normalize(" ")); + } + + @Test + @DisplayName("splitToSet deduplicates while preserving first-seen order") + void testSplitToSet() { + Set set = CsvUtil.splitToSet("b, a, b, a, c"); + assertEquals(3, set.size()); + assertTrue(set.containsAll(List.of("b", "a", "c"))); + } + + @Test + @DisplayName("splitToLowerCaseSet lowercases, de-dups case-insensitively and preserves first occurrence order") + void testSplitToLowerCaseSet() { + assertTrue(CsvUtil.splitToLowerCaseSet(null).isEmpty(), "null should yield empty set"); + assertTrue(CsvUtil.splitToLowerCaseSet(" ").isEmpty(), "blank should yield empty set"); + + Set set = CsvUtil.splitToLowerCaseSet("B, a, b, A, C"); + 
assertEquals(List.of("b", "a", "c"), List.copyOf(set)); + + Set quoted = CsvUtil.splitToLowerCaseSet("\"A\" , \"b\" , \"A\""); + assertEquals(List.of("a", "b"), List.copyOf(quoted)); + } +} From ab19665f1fa5d4baec6f9a82f15a75d53d1f1200 Mon Sep 17 00:00:00 2001 From: Eryk Kulikowski Date: Mon, 29 Sep 2025 12:57:15 +0200 Subject: [PATCH 03/29] Make CORS origin list optional in CorsFilter initialization --- src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java b/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java index 437abc5913a..dc207502dbd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java +++ b/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java @@ -45,7 +45,8 @@ public class CorsFilter implements Filter { @Override public void init(FilterConfig filterConfig) throws ServletException { // Parse allowed origins list (optional) - List originTokens = JvmSettings.CORS_ORIGIN.lookupCsvList(); + // Treat CORS origin list as optional: when absent, CORS is disabled (see CorsFilterTest.disabledCors_skipsHeaders) + List originTokens = JvmSettings.CORS_ORIGIN.lookupCsvListOptional().orElse(List.of()); allowCors = !originTokens.isEmpty(); if (allowCors) { From 6fbcdc4a3af98875c0506720f17dcb4079aebbc1 Mon Sep 17 00:00:00 2001 From: Eryk Kullikowski Date: Wed, 1 Oct 2025 11:02:21 +0200 Subject: [PATCH 04/29] Refactor GlobusOverlayAccessIO and CsvUtil for improved endpoint handling and CSV parsing --- .../dataaccess/GlobusOverlayAccessIO.java | 24 ++++--- .../iq/dataverse/filter/CorsFilter.java | 38 ++++++----- .../harvard/iq/dataverse/util/CsvUtil.java | 66 ++++--------------- .../iq/dataverse/util/CsvUtilTest.java | 25 ++----- 4 files changed, 51 insertions(+), 102 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/GlobusOverlayAccessIO.java 
b/src/main/java/edu/harvard/iq/dataverse/dataaccess/GlobusOverlayAccessIO.java index e9d98c2ff12..f616e44f203 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/GlobusOverlayAccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/GlobusOverlayAccessIO.java @@ -393,26 +393,24 @@ protected void configureGlobusEndpoints() throws IOException { } private static String[] getAllowedEndpoints(String driverId) throws IOException { - String[] allowedEndpoints = null; if (GlobusAccessibleStore.isDataverseManaged(driverId)) { - allowedEndpoints = new String[1]; - allowedEndpoints[0] = getConfigParamForDriver(driverId, TRANSFER_ENDPOINT_WITH_BASEPATH); - if (allowedEndpoints[0] == null) { - throw new IOException( - "dataverse.files." + driverId + "." + TRANSFER_ENDPOINT_WITH_BASEPATH + " is required"); + final String value = getConfigParamForDriver(driverId, TRANSFER_ENDPOINT_WITH_BASEPATH); + if (value == null) { + throw new IOException("dataverse.files." + driverId + "." + TRANSFER_ENDPOINT_WITH_BASEPATH + " is required"); } + return new String[] { value }; } else { - String rawEndpoints = getConfigParamForDriver(driverId, REFERENCE_ENDPOINTS_WITH_BASEPATHS); - if (rawEndpoints != null) { - allowedEndpoints = CsvUtil.split( - getConfigParamForDriver(driverId, REFERENCE_ENDPOINTS_WITH_BASEPATHS) - ).toArray(new String[0]); + final String raw = getConfigParamForDriver(driverId, REFERENCE_ENDPOINTS_WITH_BASEPATHS); + if (raw == null) { + throw new IOException("dataverse.files." + driverId + ".base-url is required"); } - if (rawEndpoints == null || allowedEndpoints == null || allowedEndpoints.length == 0) { + // CsvUtil.split never returns null; may return empty list if raw is blank. + final List list = CsvUtil.split(raw); + if (list.isEmpty()) { throw new IOException("dataverse.files." 
+ driverId + ".base-url is required"); } + return list.toArray(new String[0]); } - return allowedEndpoints; } diff --git a/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java b/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java index dc207502dbd..f739bab85b7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java +++ b/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java @@ -1,19 +1,23 @@ package edu.harvard.iq.dataverse.filter; -import jakarta.servlet.*; -import jakarta.servlet.annotation.WebFilter; -import jakarta.servlet.http.HttpServletRequest; -import jakarta.servlet.http.HttpServletResponse; import java.io.IOException; -import java.util.Collections; -import java.util.Set; import java.util.Arrays; +import java.util.Collections; import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.stream.Collectors; import edu.harvard.iq.dataverse.settings.JvmSettings; -import edu.harvard.iq.dataverse.util.CsvUtil; +import jakarta.servlet.Filter; +import jakarta.servlet.FilterChain; +import jakarta.servlet.FilterConfig; +import jakarta.servlet.ServletException; +import jakarta.servlet.ServletRequest; +import jakarta.servlet.ServletResponse; +import jakarta.servlet.annotation.WebFilter; +import jakarta.servlet.http.HttpServletRequest; +import jakarta.servlet.http.HttpServletResponse; /** * CorsFilter is a servlet filter that handles Cross-Origin Resource Sharing @@ -43,10 +47,10 @@ public class CorsFilter implements Filter { private boolean allowAllOrigins = false; @Override - public void init(FilterConfig filterConfig) throws ServletException { + public void init(final FilterConfig filterConfig) throws ServletException { // Parse allowed origins list (optional) // Treat CORS origin list as optional: when absent, CORS is disabled (see CorsFilterTest.disabledCors_skipsHeaders) - List originTokens = JvmSettings.CORS_ORIGIN.lookupCsvListOptional().orElse(List.of()); + final List originTokens = 
JvmSettings.CORS_ORIGIN.lookupCsvListOptional().orElse(List.of()); allowCors = !originTokens.isEmpty(); if (allowCors) { @@ -55,8 +59,7 @@ public void init(FilterConfig filterConfig) throws ServletException { allowAllOrigins = true; allowedOrigins = Collections.emptySet(); } else { - allowedOrigins = originTokens.stream().map(CsvUtil::sanitize) - .collect(Collectors.toCollection(HashSet::new)); + allowedOrigins = Set.copyOf(originTokens); } methods = JvmSettings.CORS_METHODS.lookupCsvListOptional() @@ -72,13 +75,14 @@ public void init(FilterConfig filterConfig) throws ServletException { } @Override - public void doFilter(ServletRequest servletRequest, ServletResponse servletResponse, FilterChain chain) + public void doFilter(final ServletRequest servletRequest, final ServletResponse servletResponse, final FilterChain chain) throws IOException, ServletException { if (allowCors) { - HttpServletRequest request = (HttpServletRequest) servletRequest; - HttpServletResponse response = (HttpServletResponse) servletResponse; + final HttpServletRequest request = (HttpServletRequest) servletRequest; + final HttpServletResponse response = (HttpServletResponse) servletResponse; - String requestOrigin = CsvUtil.sanitize(request.getHeader("Origin")); + final String originHeader = request.getHeader("Origin"); + final String requestOrigin = originHeader == null ? 
null : originHeader.trim(); // Decide ACAO value if (allowAllOrigins) { @@ -97,12 +101,12 @@ public void doFilter(ServletRequest servletRequest, ServletResponse servletRespo chain.doFilter(servletRequest, servletResponse); } - private String appendVary(String existing, String token) { + private String appendVary(final String existing, final String token) { if (existing == null || existing.isEmpty()) { return token; } // Avoid duplicate tokens in Vary - Set tokens = Arrays.stream(existing.split(",")) + final Set tokens = Arrays.stream(existing.split(",")) .map(String::trim) .filter(s -> !s.isEmpty()) .collect(Collectors.toCollection(HashSet::new)); diff --git a/src/main/java/edu/harvard/iq/dataverse/util/CsvUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/CsvUtil.java index 3a1f85dda5f..4c662365ac4 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/CsvUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/CsvUtil.java @@ -2,73 +2,35 @@ import java.util.Arrays; import java.util.Collections; -import java.util.LinkedHashSet; import java.util.List; import java.util.Set; import java.util.regex.Pattern; -import java.util.stream.Collectors; /** - * Lightweight helpers for simple comma separated configuration values (NOT full - * RFC 4180 parsing). - * Intended for admin-entered settings where tokens are simple identifiers - * (origins, methods, headers, etc.). + * Minimal helpers for admin-entered comma separated lists of simple tokens. + * Not a general CSV parser: no support for embedded commas, escapes, or newlines. */ public final class CsvUtil { - /** Shared split regex allowing arbitrary surrounding whitespace. */ - public static final String COMMA_BETWEEN_OPTIONAL_WHITE_SPACE = "\\s*,\\s*"; - private static final Pattern COMMA_WS = Pattern.compile(COMMA_BETWEEN_OPTIONAL_WHITE_SPACE); + /** Split on commas, trimming any adjacent to comma whitespace. 
*/ + private static final Pattern SPLIT = Pattern.compile("\\s*,\\s*"); - private CsvUtil() { - } - - /** Strip outer quotes, remove remaining quotes, trim. */ - public static String sanitize(String raw) { - if (raw == null) - return null; - String v = raw.trim(); - if (v.length() >= 2 && v.startsWith("\"") && v.endsWith("\"")) { - v = v.substring(1, v.length() - 1); + /** Split list of trimmed tokens. */ + public static List split(final String rawCsv) { + if (rawCsv == null) { + return Collections.emptyList(); } - return v.replace("\"", "").trim(); - } - - /** Split into an ordered immutable list of sanitized non-empty tokens. */ - public static List split(String rawCsv) { - String sanitized = sanitize(rawCsv); - if (sanitized == null || sanitized.isEmpty()) { + final String trimmedCsv = rawCsv.trim(); + if (trimmedCsv.isEmpty()) { return Collections.emptyList(); } - return Arrays.stream(COMMA_WS.split(sanitized)) - .map(String::trim) - .filter(s -> !s.isEmpty()) - .map(CsvUtil::sanitize) - .collect(Collectors.toUnmodifiableList()); - } - - /** Split into a de-duplicated (in insertion order) set of sanitized tokens. */ - public static Set splitToSet(String rawCsv) { - List list = split(rawCsv); - if (list.isEmpty()) - return Collections.emptySet(); - return list.stream().collect(Collectors.toCollection(LinkedHashSet::new)); - } - - /** Canonical normalized CSV string (tokens joined by ', '). */ - public static String normalize(String rawCsv) { - List list = split(rawCsv); - if (list.isEmpty()) - return ""; - return String.join(", ", list); + return Arrays.asList(SPLIT.split(trimmedCsv)); } - /** Convenience: split into a lowercase insertion-ordered set (dedup, case-fold). */ - public static Set splitToLowerCaseSet(String rawCsv) { + /** Convenience: split into a lowercase set. 
*/ + public static Set splitToLowerCaseSet(final String rawCsv) { if (rawCsv == null || rawCsv.trim().isEmpty()) { return Collections.emptySet(); } - return split(rawCsv).stream() - .map(String::toLowerCase) - .collect(Collectors.toCollection(LinkedHashSet::new)); + return Set.copyOf(split(rawCsv.toLowerCase())); } } diff --git a/src/test/java/edu/harvard/iq/dataverse/util/CsvUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/util/CsvUtilTest.java index 9c1264bf28f..72bb8635fbe 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/CsvUtilTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/CsvUtilTest.java @@ -11,30 +11,14 @@ class CsvUtilTest { @Test - @DisplayName("split handles whitespace, empty tokens and quotes") + @DisplayName("split handles whitespace and empty tokens; does not alter quotes") void testSplitBasic() { List tokens = CsvUtil.split(" a , b, \"c\" , , d "); - assertEquals(List.of("a", "b", "c", "d"), tokens); + assertEquals(List.of("a", "b", "\"c\"", "d"), tokens); } @Test - @DisplayName("normalize produces canonical comma+space joined list") - void testNormalize() { - assertEquals("a, b, c", CsvUtil.normalize("a,b, c")); - assertEquals("", CsvUtil.normalize(null)); - assertEquals("", CsvUtil.normalize(" ")); - } - - @Test - @DisplayName("splitToSet deduplicates while preserving first-seen order") - void testSplitToSet() { - Set set = CsvUtil.splitToSet("b, a, b, a, c"); - assertEquals(3, set.size()); - assertTrue(set.containsAll(List.of("b", "a", "c"))); - } - - @Test - @DisplayName("splitToLowerCaseSet lowercases, de-dups case-insensitively and preserves first occurrence order") + @DisplayName("splitToLowerCaseSet lowercases, de-dups and preserves first occurrence order (quotes preserved)") void testSplitToLowerCaseSet() { assertTrue(CsvUtil.splitToLowerCaseSet(null).isEmpty(), "null should yield empty set"); assertTrue(CsvUtil.splitToLowerCaseSet(" ").isEmpty(), "blank should yield empty set"); @@ -43,6 +27,7 @@ void 
testSplitToLowerCaseSet() { assertEquals(List.of("b", "a", "c"), List.copyOf(set)); Set quoted = CsvUtil.splitToLowerCaseSet("\"A\" , \"b\" , \"A\""); - assertEquals(List.of("a", "b"), List.copyOf(quoted)); + // Quotes are preserved then lowercased; duplicates removed based on full token. + assertEquals(List.of("\"a\"", "\"b\""), List.copyOf(quoted)); } } From 012a09d95ac8319dff5983700011917ac8998cd8 Mon Sep 17 00:00:00 2001 From: Eryk Kullikowski Date: Wed, 1 Oct 2025 11:12:20 +0200 Subject: [PATCH 05/29] updated release note and comments --- doc/release-notes/11744-cors-echo-origin-vary.md | 16 ++++++++-------- .../harvard/iq/dataverse/filter/CorsFilter.java | 1 + .../edu/harvard/iq/dataverse/util/CsvUtil.java | 15 ++++++++++++--- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/doc/release-notes/11744-cors-echo-origin-vary.md b/doc/release-notes/11744-cors-echo-origin-vary.md index 6dad65efb2e..82bf6089469 100644 --- a/doc/release-notes/11744-cors-echo-origin-vary.md +++ b/doc/release-notes/11744-cors-echo-origin-vary.md @@ -3,12 +3,12 @@ Modernizes CORS so browser integrations (previewers, external tools, JS clients) work correctly with multiple origins and proper caching. ## Highlights -* Echoes the request origin (`Access-Control-Allow-Origin`) when it matches `dataverse.cors.origin`. -* Adds `Vary: Origin` for per-origin responses (not for wildcard). -* Supports comma‑separated origin list; any `*` in the list = wildcard mode. -* CORS now only enabled when `dataverse.cors.origin` is set (deprecated `:AllowCors` no longer enables it). -* Sanitizes CORS CSV settings (`dataverse.cors.methods`, `dataverse.cors.headers.allow`, `dataverse.cors.headers.expose`). -* Docs updated (Installation, Big Data Support, External Tools, File Previews); new tests cover edge cases. +- Echoes the request origin (`Access-Control-Allow-Origin`) when it matches `dataverse.cors.origin`. +- Adds `Vary: Origin` for per-origin responses (not for wildcard). 
+- Supports comma‑separated origin list; any `*` in the list = wildcard mode. +- CORS now only enabled when `dataverse.cors.origin` is set (deprecated `:AllowCors` no longer enables it). +- Allows readable spacing in CORS list settings (`dataverse.cors.methods`, `dataverse.cors.headers.allow`, `dataverse.cors.headers.expose`): spaces around commas are ignored; tokens are otherwise unchanged (no quote parsing). +- Docs updated (Installation, Big Data Support, External Tools, File Previews); new tests cover edge cases. ## Admin Action Set `dataverse.cors.origin` explicitly (required). Use explicit origins (not `*`) for credentialed requests. Ensure proxies keep `Vary: Origin`. @@ -25,8 +25,8 @@ dataverse.cors.methods=GET, POST, OPTIONS, PUT, DELETE ``` ## Compatibility -* Must configure `dataverse.cors.origin`; `:AllowCors` no longer sufficient. -* Any `*` triggers wildcard (no per-origin echo / no Vary header). +- Must configure `dataverse.cors.origin`; `:AllowCors` no longer sufficient. +- Any `*` triggers wildcard (no per-origin echo / no Vary header). ## Docs See updated `dataverse.cors.origin` section and related notes in Big Data Support (S3), External Tools, and File Previews. diff --git a/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java b/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java index f739bab85b7..82a411f3bea 100644 --- a/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java +++ b/src/main/java/edu/harvard/iq/dataverse/filter/CorsFilter.java @@ -59,6 +59,7 @@ public void init(final FilterConfig filterConfig) throws ServletException { allowAllOrigins = true; allowedOrigins = Collections.emptySet(); } else { + // Origin tokens already had surrounding comma-whitespace removed by CsvUtil; we only trim here when reading the header. 
allowedOrigins = Set.copyOf(originTokens); } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/CsvUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/CsvUtil.java index 4c662365ac4..201ea64cc81 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/CsvUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/CsvUtil.java @@ -7,14 +7,23 @@ import java.util.regex.Pattern; /** - * Minimal helpers for admin-entered comma separated lists of simple tokens. - * Not a general CSV parser: no support for embedded commas, escapes, or newlines. + * Helpers for simple admin settings that accept comma-separated lists (origins, methods, headers, etc.). + *

+ * Behavior: + * - Leading/trailing whitespace of the whole input is ignored. + * - Whitespace immediately around commas is ignored ("GET, POST" == "GET,POST"). + * - Tokens are otherwise preserved exactly as typed (no quote stripping, no escape processing). + * Not a full CSV parser: embedded commas, quoted fields with separators, and newlines inside tokens are NOT supported. */ public final class CsvUtil { /** Split on commas, trimming any adjacent to comma whitespace. */ private static final Pattern SPLIT = Pattern.compile("\\s*,\\s*"); - /** Split list of trimmed tokens. */ + /** + * Split a comma-separated string into tokens preserving user input (beyond removing cosmetic + * whitespace around commas and overall leading/trailing whitespace). Returns an empty list for + * null or blank input. + */ public static List split(final String rawCsv) { if (rawCsv == null) { return Collections.emptyList(); From 48bbd5368313362efd4651aa074cacf687bbf9c4 Mon Sep 17 00:00:00 2001 From: Eryk Kullikowski Date: Wed, 1 Oct 2025 11:24:34 +0200 Subject: [PATCH 06/29] test fixes --- .../harvard/iq/dataverse/filter/CorsFilterTest.java | 9 +++++---- .../edu/harvard/iq/dataverse/util/CsvUtilTest.java | 12 +++++------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/filter/CorsFilterTest.java b/src/test/java/edu/harvard/iq/dataverse/filter/CorsFilterTest.java index bb1a7eac781..9b8ac75241e 100644 --- a/src/test/java/edu/harvard/iq/dataverse/filter/CorsFilterTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/filter/CorsFilterTest.java @@ -171,7 +171,7 @@ void existingVaryMergedWithoutDuplication() throws Exception { } @Test - void sanitizesQuotedHeaderLists() throws Exception { + void quotedHeaderListsPreserved() throws Exception { System.setProperty("dataverse.cors.origin", "https://x.example"); System.setProperty("dataverse.cors.headers.allow", "\"Accept, X-Dataverse-key\""); 
System.setProperty("dataverse.cors.headers.expose", "\"Accept-Ranges, Content-Range\""); @@ -186,9 +186,10 @@ void sanitizesQuotedHeaderLists() throws Exception { HttpServletResponse res = mock(HttpServletResponse.class); sut.doFilter(req, res, mock(FilterChain.class)); - - verify(res).setHeader("Access-Control-Allow-Headers", "Accept, X-Dataverse-key"); - verify(res).setHeader("Access-Control-Expose-Headers", "Accept-Ranges, Content-Range"); + + // With simplified CsvUtil we now preserve surrounding quotes provided by admin config. + verify(res).setHeader("Access-Control-Allow-Headers", "\"Accept, X-Dataverse-key\""); + verify(res).setHeader("Access-Control-Expose-Headers", "\"Accept-Ranges, Content-Range\""); verify(res).setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS"); } diff --git a/src/test/java/edu/harvard/iq/dataverse/util/CsvUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/util/CsvUtilTest.java index 72bb8635fbe..9264cbcbe6b 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/CsvUtilTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/CsvUtilTest.java @@ -11,23 +11,21 @@ class CsvUtilTest { @Test - @DisplayName("split handles whitespace and empty tokens; does not alter quotes") + @DisplayName("split preserves empty tokens and quotes") void testSplitBasic() { List tokens = CsvUtil.split(" a , b, \"c\" , , d "); - assertEquals(List.of("a", "b", "\"c\"", "d"), tokens); + assertEquals(List.of("a", "b", "\"c\"", "", "d"), tokens); } @Test - @DisplayName("splitToLowerCaseSet lowercases, de-dups and preserves first occurrence order (quotes preserved)") + @DisplayName("splitToLowerCaseSet lowercases and de-dups (order not asserted)") void testSplitToLowerCaseSet() { assertTrue(CsvUtil.splitToLowerCaseSet(null).isEmpty(), "null should yield empty set"); assertTrue(CsvUtil.splitToLowerCaseSet(" ").isEmpty(), "blank should yield empty set"); - Set set = CsvUtil.splitToLowerCaseSet("B, a, b, A, C"); - assertEquals(List.of("b", "a", 
"c"), List.copyOf(set)); + assertEquals(Set.of("b", "a", "c"), set); Set quoted = CsvUtil.splitToLowerCaseSet("\"A\" , \"b\" , \"A\""); - // Quotes are preserved then lowercased; duplicates removed based on full token. - assertEquals(List.of("\"a\"", "\"b\""), List.copyOf(quoted)); + assertEquals(Set.of("\"a\"", "\"b\""), quoted); } } From 16720f6e8d9556ad8b447808a4cc26e20498732b Mon Sep 17 00:00:00 2001 From: Eryk Kullikowski Date: Wed, 1 Oct 2025 11:32:06 +0200 Subject: [PATCH 07/29] Clarify CORS requirements for browser-based external tools in documentation --- doc/sphinx-guides/source/api/external-tools.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinx-guides/source/api/external-tools.rst b/doc/sphinx-guides/source/api/external-tools.rst index 699db13671e..63e2a806499 100644 --- a/doc/sphinx-guides/source/api/external-tools.rst +++ b/doc/sphinx-guides/source/api/external-tools.rst @@ -12,7 +12,7 @@ Introduction External tools are additional applications the user can access or open from your Dataverse installation to preview, explore, and manipulate data files and datasets. The term "external" is used to indicate that the tool is not part of the main Dataverse Software. .. note:: - Browser-based preview or explore tools that make XHR/fetch calls back to the Dataverse API must have CORS explicitly enabled on the Dataverse installation via :ref:`dataverse.cors.origin `. The legacy ``:AllowCors`` database setting is deprecated and no longer enables CORS by itself. Be sure the origins hosting your tool (or ``*`` when appropriate) are included in ``dataverse.cors.origin``; otherwise requests from your tool will be blocked by the browser even if the tool itself loads correctly. + Browser-based tools must have CORS explicitly enabled via :ref:`dataverse.cors.origin `. List every origin that will host your tool (or use ``*`` when a wildcard is acceptable). 
If an origin is not listed, the browser will block that tool's API requests even if the tool page itself loads. Once you have created the external tool itself (which is most of the work!), you need to teach a Dataverse installation how to construct URLs that your tool needs to operate. For example, if you've deployed your tool to fabulousfiletool.com your tool might want the ID of a file and the siteUrl of the Dataverse installation like this: https://fabulousfiletool.com?fileId=42&siteUrl=https://demo.dataverse.org From ec1bccb63604290e52f78b963522f58102a99dc2 Mon Sep 17 00:00:00 2001 From: Eryk Kullikowski Date: Wed, 1 Oct 2025 11:40:19 +0200 Subject: [PATCH 08/29] Update CORS documentation to clarify configuration requirements and deprecate legacy settings --- doc/sphinx-guides/source/developers/big-data-support.rst | 2 +- doc/sphinx-guides/source/installation/config.rst | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/sphinx-guides/source/developers/big-data-support.rst b/doc/sphinx-guides/source/developers/big-data-support.rst index 989f511685e..ef13143be02 100644 --- a/doc/sphinx-guides/source/developers/big-data-support.rst +++ b/doc/sphinx-guides/source/developers/big-data-support.rst @@ -57,7 +57,7 @@ Allow CORS for S3 Buckets **IMPORTANT:** One additional step that is required to enable direct uploads via a Dataverse installation and for direct download to work with previewers and direct upload to work with dvwebloader (:ref:`folder-upload`) is to allow cross site (CORS) requests on your S3 store. The example below shows how to enable CORS rules (to support upload and download) on a bucket using the AWS CLI command line tool. Note that you may want to limit the AllowedOrigins and/or AllowedHeaders further. https://github.com/gdcc/dataverse-previewers/wiki/Using-Previewers-with-download-redirects-from-S3 has some additional information about doing this. 
-Dataverse itself will only emit the necessary ``Access-Control-*`` headers to browsers when CORS has been explicitly enabled via the JVM/MicroProfile setting :ref:`dataverse.cors.origin <dataverse.cors.origin>`. The legacy database setting ``:AllowCors`` no longer turns CORS on. You must both: +Dataverse itself will only emit the necessary ``Access-Control-*`` headers to browsers when CORS has been explicitly enabled via the JVM/MicroProfile setting :ref:`dataverse.cors.origin <dataverse.cors.origin>`. You must both: * Configure an appropriate ``dataverse.cors.origin`` value (single origin, comma-separated list, or ``*``) on the Dataverse application server; and * Configure a matching/compatible CORS policy on each S3 bucket (and any CDN/proxy in front of it) that will be used for direct upload or for redirect (download-redirect) operations consumed by previewers. diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 256c91d26ac..3d08a5f8278 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -3692,7 +3692,6 @@ Behavior: * When a list of origins is configured, Dataverse echoes the single matching request ``Origin`` value in ``Access-Control-Allow-Origin`` and adds ``Vary: Origin`` to support correct proxy/CDN caching. * When ``*`` is configured, ``Access-Control-Allow-Origin: *`` is sent and ``Vary`` is not modified. -* The legacy database setting ``:AllowCors`` is deprecated and no longer enables CORS automatically; you must configure ``dataverse.cors.origin``. ..
_dataverse.cors.methods: From 53c610ea2f887b0c90e654c531f58346830bcc14 Mon Sep 17 00:00:00 2001 From: Eryk Kullikowski Date: Wed, 1 Oct 2025 11:44:38 +0200 Subject: [PATCH 09/29] Remove unused CSV lookup methods --- .../edu/harvard/iq/dataverse/settings/JvmSettings.java | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index 621b3a316a2..bfecdd75db8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -610,16 +610,9 @@ public String insert(String... arguments) { public java.util.Optional> lookupCsvListOptional() { return lookupOptional().map(CsvUtil::split); } + /** Lookup required CSV value and return immutable List of tokens (throws if missing). */ public java.util.List lookupCsvList() { return CsvUtil.split(lookup()); } - /** Lookup optional CSV value and return lowercased Set (deduplicated, insertion order). */ - public java.util.Optional> lookupCsvLowercaseSetOptional() { - return lookupOptional().map(CsvUtil::splitToLowerCaseSet); - } - /** Lookup required CSV value and return lowercased Set (deduplicated, insertion order). 
*/ - public java.util.Set lookupCsvLowercaseSet() { - return CsvUtil.splitToLowerCaseSet(lookup()); - } } From 01f73c2a76a7c8c80830e7aadf3c8a1ba8988891 Mon Sep 17 00:00:00 2001 From: Eryk Kullikowski Date: Wed, 1 Oct 2025 11:50:20 +0200 Subject: [PATCH 10/29] Update JvmSettings documentation to clarify CSV list return types --- .../java/edu/harvard/iq/dataverse/settings/JvmSettings.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index bfecdd75db8..cefbaad240e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -606,12 +606,12 @@ public String insert(String... arguments) { return String.format(this.getScopedKey(), (Object[]) arguments); } - /** Lookup optional CSV value and return immutable List of tokens. */ + /** Lookup optional CSV value and return a List of tokens. */ public java.util.Optional> lookupCsvListOptional() { return lookupOptional().map(CsvUtil::split); } - - /** Lookup required CSV value and return immutable List of tokens (throws if missing). */ + + /** Lookup required CSV value and return a List of tokens (throws if missing). 
*/ public java.util.List lookupCsvList() { return CsvUtil.split(lookup()); } From 8928d45e519ff9940603ab824cc85125eef83496 Mon Sep 17 00:00:00 2001 From: Eryk Kulikowski Date: Mon, 13 Oct 2025 16:43:41 +0200 Subject: [PATCH 11/29] Refactor doc structure for improved readability and maintainability --- .../11744-cors-echo-origin-vary.md | 12 +- .../source/installation/config.rst | 207 ++++++++++-------- 2 files changed, 124 insertions(+), 95 deletions(-) diff --git a/doc/release-notes/11744-cors-echo-origin-vary.md b/doc/release-notes/11744-cors-echo-origin-vary.md index 82bf6089469..5110bdee648 100644 --- a/doc/release-notes/11744-cors-echo-origin-vary.md +++ b/doc/release-notes/11744-cors-echo-origin-vary.md @@ -3,31 +3,39 @@ Modernizes CORS so browser integrations (previewers, external tools, JS clients) work correctly with multiple origins and proper caching. ## Highlights + - Echoes the request origin (`Access-Control-Allow-Origin`) when it matches `dataverse.cors.origin`. - Adds `Vary: Origin` for per-origin responses (not for wildcard). - Supports comma‑separated origin list; any `*` in the list = wildcard mode. -- CORS now only enabled when `dataverse.cors.origin` is set (deprecated `:AllowCors` no longer enables it). -- Allows readable spacing in CORS list settings (`dataverse.cors.methods`, `dataverse.cors.headers.allow`, `dataverse.cors.headers.expose`): spaces around commas are ignored; tokens are otherwise unchanged (no quote parsing). +- CORS is now enabled only when `dataverse.cors.origin` is set (the removed `:AllowCors` setting no longer enables it). +- All comma-separated configuration settings (database properties and MicroProfile config) now ignore spaces around commas; tokens remain unchanged (no quote parsing). Examples: `dataverse.cors.methods`, `dataverse.cors.headers.allow`, `dataverse.cors.headers.expose`. See "Comma-separated configuration values" in the Installation Guide.
- Docs updated (Installation, Big Data Support, External Tools, File Previews); new tests cover edge cases. ## Admin Action + Set `dataverse.cors.origin` explicitly (required). Use explicit origins (not `*`) for credentialed requests. Ensure proxies keep `Vary: Origin`. Examples: + ``` dataverse.cors.origin=https://example.org dataverse.cors.origin=https://libis.github.io,https://gdcc.github.io dataverse.cors.origin=* ``` + Optional (unquoted): + ``` dataverse.cors.methods=GET, POST, OPTIONS, PUT, DELETE ``` ## Compatibility + - Must configure `dataverse.cors.origin`; `:AllowCors` no longer sufficient. - Any `*` triggers wildcard (no per-origin echo / no Vary header). ## Docs + See updated `dataverse.cors.origin` section and related notes in Big Data Support (S3), External Tools, and File Previews. + diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 3d08a5f8278..f2a39f1e8fd 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -10,6 +10,27 @@ Once you have finished securing and configuring your Dataverse installation, you .. contents:: |toctitle| :local: +.. _comma-separated-config-values: + +Comma-separated configuration values +------------------------------------ + +Many configuration options (both MicroProfile/JVM settings and database settings) accept comma-separated lists. For all such settings, Dataverse applies consistent, lightweight parsing: + +- Whitespace immediately around commas is ignored (e.g., ``GET, POST`` is equivalent to ``GET,POST``). +- Tokens are otherwise preserved exactly as typed. There is no quote parsing and no escape processing. +- Embedded commas within a token are not supported. 
+ +Examples include (but are not limited to): + +- :ref:`dataverse.cors.origin <dataverse.cors.origin>` +- :ref:`dataverse.cors.methods <dataverse.cors.methods>` +- :ref:`dataverse.cors.headers.allow <dataverse.cors.headers.allow>` +- :ref:`dataverse.cors.headers.expose <dataverse.cors.headers.expose>` +- :ref:`:UploadMethods <:UploadMethods>` + +This behavior is implemented centrally and applies across all Dataverse settings that accept comma-separated values. + .. _securing-your-installation: Securing Your Installation @@ -54,7 +75,7 @@ Nginx Configuration Rule: deny all; return 403; } - + If you are using a load balancer or a reverse proxy, there are some additional considerations. If no additional configurations are made and the upstream is configured to redirect to localhost, the API will be accessible from the outside, as your installation will register as origin the localhost for any requests to the endpoints "admin" and "builtin-users". To prevent this, you have two options: - If your upstream is configured to redirect to localhost, you will need to set the :ref:`JVM option ` to one of the following values ``%client.name% %datetime% %request% %status% %response.length% %header.referer% %header.x-forwarded-for%`` and configure from the load balancer side the chosen header to populate with the client IP address. @@ -77,20 +98,20 @@ To avoid having your users send credentials in the clear, it's strongly recommen Recording User IP Addresses +++++++++++++++++++++++++++ -By default, the Dataverse installation captures the IP address from which requests originate. This is used for multiple purposes including controlling access to the admin API, IP-based user groups and Make Data Count reporting. When the Dataverse installation is configured behind a proxy such as a load balancer, this default setup may not capture the correct IP address. In this case all the incoming requests will be logged in the access logs, MDC logs etc., as if they are all coming from the IP address(es) of the load balancer itself.
Proxies usually save the original address in an added HTTP header, from which it can be extracted. For example, AWS LB records the "true" original address in the standard ``X-Forwarded-For`` header. If your Dataverse installation is running behind an IP-masking proxy, but you would like to use IP groups, or record the true geographical location of the incoming requests with Make Data Count, you may enable the IP address lookup from the proxy header using the JVM option ``dataverse.useripaddresssourceheader``, described further below. +By default, the Dataverse installation captures the IP address from which requests originate. This is used for multiple purposes including controlling access to the admin API, IP-based user groups and Make Data Count reporting. When the Dataverse installation is configured behind a proxy such as a load balancer, this default setup may not capture the correct IP address. In this case all the incoming requests will be logged in the access logs, MDC logs etc., as if they are all coming from the IP address(es) of the load balancer itself. Proxies usually save the original address in an added HTTP header, from which it can be extracted. For example, AWS LB records the "true" original address in the standard ``X-Forwarded-For`` header. If your Dataverse installation is running behind an IP-masking proxy, but you would like to use IP groups, or record the true geographical location of the incoming requests with Make Data Count, you may enable the IP address lookup from the proxy header using the JVM option ``dataverse.useripaddresssourceheader``, described further below. Before doing so however, you must absolutely **consider the security risks involved**! This option must be enabled **only** on a Dataverse installation that is in fact fully behind a proxy that properly, and consistently, adds the ``X-Forwarded-For`` (or a similar) header to every request it forwards. 
Consider the implications of activating this option on a Dataverse installation that is not running behind a proxy, *or running behind one, but still accessible from the insecure locations bypassing the proxy*: Anyone can now add the header above to an incoming request, supplying an arbitrary IP address that the Dataverse installation will trust as the true origin of the call. Thus giving an attacker an easy way to, for example, get in a privileged IP group. The implications could be even more severe if an attacker were able to pretend to be coming from ``localhost``, if a Dataverse installation is configured to trust localhost connections for unrestricted access to the admin API! We have addressed this by making it so that Dataverse installation should never accept ``localhost``, ``127.0.0.1``, ``0:0:0:0:0:0:0:1`` etc. when supplied in such a header. But if you have reasons to still find this risk unacceptable, you may want to consider turning open localhost access to the API off (See :ref:`Securing Your Installation ` for more information.) -This is how to verify that your proxy or load balancer, etc. is handling the originating address headers properly and securely: Make sure access logging is enabled in your application server (Payara) configuration. (```` in the ``domain.xml``). Add the address header to the access log format. For example, on a system behind AWS ELB, you may want to use something like ``%client.name% %datetime% %request% %status% %response.length% %header.referer% %header.x-forwarded-for%``. Once enabled, access the Dataverse installation from outside the LB. You should now see the real IP address of your remote client in the access log. For example, something like: -``"1.2.3.4" "01/Jun/2020:12:00:00 -0500" "GET /dataverse.xhtml HTTP/1.1" 200 81082 "NULL-REFERER" "128.64.32.16"`` +This is how to verify that your proxy or load balancer, etc. 
is handling the originating address headers properly and securely: Make sure access logging is enabled in your application server (Payara) configuration. (```` in the ``domain.xml``). Add the address header to the access log format. For example, on a system behind AWS ELB, you may want to use something like ``%client.name% %datetime% %request% %status% %response.length% %header.referer% %header.x-forwarded-for%``. Once enabled, access the Dataverse installation from outside the LB. You should now see the real IP address of your remote client in the access log. For example, something like: +``"1.2.3.4" "01/Jun/2020:12:00:00 -0500" "GET /dataverse.xhtml HTTP/1.1" 200 81082 "NULL-REFERER" "128.64.32.16"`` -In this example, ``128.64.32.16`` is your remote address (that you should verify), and ``1.2.3.4`` is the address of your LB. If you're not seeing your remote address in the log, do not activate the JVM option! Also, verify that all the entries in the log have this header populated. The only entries in the access log that you should be seeing without this header (logged as ``"NULL-HEADER-X-FORWARDED-FOR"``) are local requests, made from localhost, etc. In this case, since the request is not coming through the proxy, the local IP address should be logged as the primary one (as the first value in the log entry, ``%client.name%``). If you see any requests coming in from remote, insecure subnets without this header - do not use the JVM option! +In this example, ``128.64.32.16`` is your remote address (that you should verify), and ``1.2.3.4`` is the address of your LB. If you're not seeing your remote address in the log, do not activate the JVM option! Also, verify that all the entries in the log have this header populated. The only entries in the access log that you should be seeing without this header (logged as ``"NULL-HEADER-X-FORWARDED-FOR"``) are local requests, made from localhost, etc. 
In this case, since the request is not coming through the proxy, the local IP address should be logged as the primary one (as the first value in the log entry, ``%client.name%``). If you see any requests coming in from remote, insecure subnets without this header - do not use the JVM option! Once you are ready, enable the :ref:`JVM option `. Verify that the remote locations are properly tracked in your MDC metrics, and/or your IP groups are working. As a final test, if your Dataverse installation is allowing unrestricted localhost access to the admin API, imitate an attack in which a malicious request is pretending to be coming from ``127.0.0.1``. Try the following from a remote, insecure location: ``curl https://your.dataverse.edu/api/admin/settings --header "X-FORWARDED-FOR: 127.0.0.1"`` -First of all, confirm that access is denied! If you are in fact able to access the settings api from a location outside the proxy, **something is seriously wrong**, so please let us know, and stop using the JVM option. Otherwise check the access log entry for the header value. What you should see is something like ``"127.0.0.1, 128.64.32.16"``. Where the second address should be the real IP of your remote client. The fact that the "fake" ``127.0.0.1`` you sent over is present in the header is perfectly ok. This is the proper proxy behavior - it preserves any incoming values in the ``X-Forwarded-Header``, if supplied, and adds the detected incoming address to it, *on the right*. It is only this rightmost comma-separated value that Dataverse installation should ever be using. +First of all, confirm that access is denied! If you are in fact able to access the settings api from a location outside the proxy, **something is seriously wrong**, so please let us know, and stop using the JVM option. Otherwise check the access log entry for the header value. What you should see is something like ``"127.0.0.1, 128.64.32.16"``. 
Where the second address should be the real IP of your remote client. The fact that the "fake" ``127.0.0.1`` you sent over is present in the header is perfectly ok. This is the proper proxy behavior - it preserves any incoming values in the ``X-Forwarded-For`` header, if supplied, and adds the detected incoming address to it, *on the right*. It is only this rightmost comma-separated value that Dataverse installation should ever be using. Still feel like activating this option in your configuration? - Have fun and be safe! @@ -268,9 +289,9 @@ identifiers using any of several PID types. The most appropriate PIDs for public DataCite or EZID) and Handles. Dataverse also supports PermaLinks which could be useful for intranet or catalog use cases. A DOI provider called "FAKE" is recommended only for testing and development purposes. -Dataverse can be configured with one or more PID providers, each of which can mint and manage PIDs with a given protocol -(e.g., doi, handle, permalink) using a specific service provider/account (e.g. with DataCite, EZId, or HandleNet) -to manage an authority/shoulder combination, aka a "prefix" (PermaLinks also support custom separator characters as part of the prefix), +Dataverse can be configured with one or more PID providers, each of which can mint and manage PIDs with a given protocol +(e.g., doi, handle, permalink) using a specific service provider/account (e.g. with DataCite, EZId, or HandleNet) +to manage an authority/shoulder combination, aka a "prefix" (PermaLinks also support custom separator characters as part of the prefix), along with an optional list of individual PIDs (with different authority/shoulders) than can be managed with that account. Dataverse automatically manages assigning PIDs and making them findable when datasets are published.
There are also :ref:`API calls that @@ -278,25 +299,25 @@ allow updating the PID target URLs and metadata of already-published datasets ma moved to a new URL or when the software is updated to generate additional metadata or address schema changes at the PID service. Note that while some forms of PIDs (Handles, PermaLinks) are technically case sensitive, common practice is to avoid creating PIDs that differ only by case. -Dataverse treats PIDs of all types as case-insensitive (as DOIs are by definition). This means that Dataverse will find datasets (in search, to display dataset pages, etc.) +Dataverse treats PIDs of all types as case-insensitive (as DOIs are by definition). This means that Dataverse will find datasets (in search, to display dataset pages, etc.) when the PIDs entered do not match the case of the original but will have a problem if two PIDs that differ only by case exist in one instance. Testing PID Providers +++++++++++++++++++++ By default, the installer configures the Fake DOI provider as the registration provider. Unlike other DOI Providers, the Fake Provider does not involve any -external resolution service and is not appropriate for use beyond development and testing. You may wish instead to test with +external resolution service and is not appropriate for use beyond development and testing. You may wish instead to test with PermaLinks or with a DataCite test account (which uses DataCite's test infrastructure and will help assure your Dataverse instance can make network connections to DataCite. DataCite requires that you register for a test account, which will have a username, password and your own prefix (please contact support@datacite.org for a test account. You may wish to `contact the GDCC `_ instead - GDCC is able to provide DataCite accounts with a group discount and can also provide test accounts.). Once you receive the login name, password, and prefix for the account, -configure the credentials as described below. 
+configure the credentials as described below. -Alternately, you may wish to configure other providers for testing: +Alternately, you may wish to configure other providers for testing: - EZID is available to University of California scholars and researchers. Testing can be done using the authority 10.5072 and shoulder FK2 with the "apitest" account (contact EZID for credentials) or an institutional account. Configuration in Dataverse is then analogous to using DataCite. - + - The PermaLink provider, like the FAKE DOI provider, does not involve an external account. Unlike the Fake DOI provider, the PermaLink provider creates PIDs that begin with "perma:", making it clearer that they are not DOIs, and that do resolve to the local dataset/file page in Dataverse, making them useful for some production use cases. See :ref:`permalinks` and (for the FAKE DOI provider) the :doc:`/developers/dev-environment` section of the Developer Guide. @@ -306,7 +327,7 @@ Provider-specific configuration is described below. Once all is configured, you will be able to publish datasets and files, but **the persistent identifiers will not be citable** as they, with the exception of PermaLinks, will not redirect to your dataset page in Dataverse. -Note that any datasets or files created using a test configuration cannot be directly migrated to a production PID provider +Note that any datasets or files created using a test configuration cannot be directly migrated to a production PID provider and would need to be created again once a valid PID Provider(s) are configured. One you are done testing, to properly configure persistent identifiers for a production installation, an account and associated namespace (e.g. authority/shoulder) must be @@ -320,12 +341,12 @@ https://www.cdlib.org/cdlinfo/2017/08/04/ezid-doi-service-is-evolving/ . Once you have your DOI or Handle account credentials and a prefix, configure your Dataverse installation using the settings below. 
- + Configuring PID Providers +++++++++++++++++++++++++ There are two required global settings to configure PID providers - the list of ids of providers and which one of those should be the default. -Per-provider settings are also required - some that are common to all types and some type specific. All of these settings are defined +Per-provider settings are also required - some that are common to all types and some type specific. All of these settings are defined to be compatible with the MicroProfile specification which means that 1. Any of these settings can be set via system properties (see :ref:`jvm-options` for how to do this), environment variables, or other @@ -344,7 +365,7 @@ to be compatible with the MicroProfile specification which means that 3. Environment variables follow the key, replacing any dot, colon, dash, etc. into an underscore "_" and all uppercase letters. Example: ``dataverse.pid.default-provider`` -> ``DATAVERSE_PID_DEFAULT_PROVIDER`` - + Global Settings ^^^^^^^^^^^^^^^ @@ -356,7 +377,7 @@ dataverse.pid.providers ^^^^^^^^^^^^^^^^^^^^^^^ A comma-separated list of the ids of the PID providers to use. IDs should be simple unique text strings, e.g. datacite1, perma1, etc. -IDs are used to scope the provider-specific settings but are not directly visible to users. +IDs are used to scope the provider-specific settings but are not directly visible to users. .. _dataverse.pid.default-provider: @@ -384,7 +405,7 @@ Each Provider listed by id in the dataverse.pid.providers setting must be config dataverse.pid.*.type ^^^^^^^^^^^^^^^^^^^^ -The Provider type, currently one of ``datacite``, ``ezid``, ``FAKE``, ``hdl``, or ``perma``. The type defines which protocol a service supports (DOI, Handle, or PermaLink) and, for DOI Providers, which +The Provider type, currently one of ``datacite``, ``ezid``, ``FAKE``, ``hdl``, or ``perma``. 
The type defines which protocol a service supports (DOI, Handle, or PermaLink) and, for DOI Providers, which DOI service is used. .. _dataverse.pid.*.label: @@ -407,7 +428,7 @@ dataverse.pid.*.shoulder In general, PIDs are of the form ``:/*`` where ``*`` is the portion unique to an individual PID. PID Providers must define the authority and shoulder (with the protocol defined by the ``dataverse.pid.*.type`` setting) that defines the set of existing PIDs they can manage and the prefix they can use when minting new PIDs. (Often an account with a PID service provider will be limited to using a single authority/shoulder. If your PID service provider account allows more than one combination that you wish to use in Dataverse, configure multiple PID Provider, one for each combination.) - + .. _dataverse.pid.*.identifier-generation-style: dataverse.pid.*.identifier-generation-style @@ -601,7 +622,7 @@ dataverse.pid.*.ezid.password ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Note that use of `EZId `_ is limited primarily to University of California institutions. If you have an EZId account, -you will need to configure the ``api-url`` and your account ``username`` and ``password``. As above, you should use one of the more secure +you will need to configure the ``api-url`` and your account ``username`` and ``password``. As above, you should use one of the more secure options for setting the password. .. _dataverse.pid.*.permalink: @@ -651,7 +672,7 @@ dataverse.pid.*.handlenet.passphrase Note: If you are **minting your own handles** and plan to set up your own handle service, please refer to `Handle.Net documentation `_. Configure your Handle.net ``index`` to be used registering new persistent -identifiers. Defaults to ``300``. +identifiers. Defaults to ``300``. Indices are used to separate concerns within the Handle system. To add data to an index, authentication is mandatory. 
See also chapter 1.4 "Authentication" of @@ -659,7 +680,7 @@ the `Handle.Net Technical Documentation .profile="`` -Larger installations may want to increase the number of open S3 connections allowed (default is 256): For example, +Larger installations may want to increase the number of open S3 connections allowed (default is 256): For example, ``./asadmin create-jvm-options "-Ddataverse.files..connection-pool-size=4096"`` @@ -1400,21 +1421,21 @@ Reported Working S3-Compatible Storage Set ``dataverse.files..chunked-encoding=false`` and ``dataverse.files..path-style-request=true`` to use Surf Object Store. You will need the Swift client (documented at ) to create the access key and secret key for the S3 interface. -Note that the ``dataverse.files..proxy-url`` setting can be used in installations where the object store is proxied, but it should be considered an advanced option that will require significant expertise to properly configure. +Note that the ``dataverse.files..proxy-url`` setting can be used in installations where the object store is proxied, but it should be considered an advanced option that will require significant expertise to properly configure. For direct uploads and downloads, Dataverse redirects to the proxy-url but presigns the urls based on the ``dataverse.files..custom-endpoint-url``. Additional configuration (appropriate CORS settings, proxy caching/timeout configuration, and proxy settings to pass headers to/from S3 and to avoid adding additional headers) will also be needed to enable use of a proxy with direct upload and download. For Amazon AWS, see comments in the edu.harvard.iq.dataverse.dataaccess.S3AccessIO class about support for AWS's bucket-specific DNS names. - + `SeaweedFS `_ SeaweedFS is a distributed storage system that has S3 compatibility. Set the S3 storage options as explained above. Make sure to set ``dataverse.files..path-style-access`` to ``true``. You will need to create the bucket beforehand. 
You can do this with the filer API using curl commands. For example, to create an empty bucket called ``dataverse``: - + .. code-block:: bash curl -X POST "http://localhost:8888/buckets/" curl -X POST "http://localhost:8888/buckets/dataverse/" - + You will also need to set an access and secret key. One way to do this is via a `static file `_. As an example, your ``config.json`` might look like this if you're using a bucket called ``dataverse``: - + .. code-block:: json { @@ -1438,13 +1459,13 @@ You will also need to set an access and secret key. One way to do this is via a } And lastly, to start up the SeaweedFS server and various components you could use a command like this: - + .. code-block:: bash weed server -s3 -metricsPort=9327 -dir=/data -s3.config=/config.json `VAST DataStore `_ - VAST DataStore must be configured with an S3 gateway. A Dataverse bucket must be created. + VAST DataStore must be configured with an S3 gateway. A Dataverse bucket must be created. Follow `VAST DataStore documentation `_ to configure the S3 gateway. Set ``dataverse.files..path-style-access=true`` since VAST DataStore uses path style access. @@ -1490,7 +1511,7 @@ Once you have configured a trusted remote store, you can point your users to the dataverse.files..url-expiration-minutes If direct downloads and using signing: time until links expire. Optional. 60 dataverse.files..remote-store-name A short name used in the UI to indicate where a file is located. Optional. (none) dataverse.files..remote-store-url A url to an info page about the remote store used in the UI. Optional. (none) - + =========================================== ================== ========================================================================== =================== .. _globus-storage: @@ -1510,7 +1531,7 @@ There are two types of Globus stores: - remote - where Dataverse references files that remain on trusted remote Globus endpoints A managed Globus store connects to standard/file-based Globus endpoint. 
It is also possible to configure an S3 store as a managed store, if the managed endpoint uses an underlying S3 store via the Globus S3 Connector. -With the former, Dataverse has no direct access to the file contents and functionality related to ingest, fixity hash validation, etc. are not available. With the latter, Dataverse can access files internally via S3 and the functionality supported is similar to that when using S3 direct upload. +With the former, Dataverse has no direct access to the file contents and functionality related to ingest, fixity hash validation, etc. are not available. With the latter, Dataverse can access files internally via S3 and the functionality supported is similar to that when using S3 direct upload. Once you have configured a globus store, or configured an S3 store for Globus access, it is recommended that you install the `dataverse-globus app `_ to allow transfers in/out of Dataverse to be initated via the Dataverse user interface. Alternately, you can point your users to the :doc:`/developers/globus-api` for information about API support. @@ -1527,13 +1548,13 @@ Once you have configured a globus store, or configured an S3 store for Globus ac dataverse.files..remote-store-url A url to an info page about the remote store used in the UI. Optional. 
(none) dataverse.files..managed ``true``/``false`` Whether dataverse manages an associated Globus endpoint ``false`` dataverse.files..transfer-endpoint-with-basepath The *managed* Globus endpoint id and associated base path for file storage (none) - dataverse.files..globus-token A Globus token (base64 endcoded : + dataverse.files..globus-token A Globus token (base64 endcoded : for a managed store) - using a microprofile alias is recommended (none) dataverse.files..reference-endpoints-with-basepaths A comma separated list of *remote* trusted Globus endpoint id/s (none) dataverse.files..files-not-accessible-by-dataverse ``true``/``false`` Should be false for S3 Connector-based *managed* stores, true for others ``false`` - + ======================================================= ================== ========================================================================== =================== - + .. _temporary-file-storage: Temporary Upload File Storage @@ -2188,7 +2209,7 @@ These archival Bags include all of the files and metadata in a given dataset ver The Dataverse Software offers an internal archive workflow which may be configured as a PostPublication workflow via an admin API call to manually submit previously published Datasets and prior versions to a configured archive such as Chronopolis. The workflow creates a `JSON-LD `_ serialized `OAI-ORE `_ map file, which is also available as a metadata export format in the Dataverse Software web interface. -At present, archiving classes include the DuraCloudSubmitToArchiveCommand, LocalSubmitToArchiveCommand, GoogleCloudSubmitToArchive, and S3SubmitToArchiveCommand , which all extend the AbstractSubmitToArchiveCommand and use the configurable mechanisms discussed below. 
(A DRSSubmitToArchiveCommand, which works with Harvard's DRS also exists and, while specific to DRS, is a useful example of how Archivers can support single-version-only semantics and support archiving only from specified collections (with collection specific parameters)). +At present, archiving classes include the DuraCloudSubmitToArchiveCommand, LocalSubmitToArchiveCommand, GoogleCloudSubmitToArchive, and S3SubmitToArchiveCommand , which all extend the AbstractSubmitToArchiveCommand and use the configurable mechanisms discussed below. (A DRSSubmitToArchiveCommand, which works with Harvard's DRS also exists and, while specific to DRS, is a useful example of how Archivers can support single-version-only semantics and support archiving only from specified collections (with collection specific parameters)). All current options support the :ref:`Archival Status API` calls and the same status is available in the dataset page version table (for contributors/those who could view the unpublished dataset, with more detail available to superusers). @@ -2253,7 +2274,7 @@ ArchiverClassName - the fully qualified class to be used for archiving. For exam Google Cloud Configuration ++++++++++++++++++++++++++ -The Google Cloud Archiver can send Dataverse Archival Bags to a bucket in Google's cloud, including those in the 'Coldline' storage class (cheaper, with slower access) +The Google Cloud Archiver can send Dataverse Archival Bags to a bucket in Google's cloud, including those in the 'Coldline' storage class (cheaper, with slower access) ``curl http://localhost:8080/api/admin/settings/:ArchiverClassName -X PUT -d "edu.harvard.iq.dataverse.engine.command.impl.GoogleCloudSubmitToArchiveCommand"`` @@ -2282,7 +2303,7 @@ For example: S3 Configuration ++++++++++++++++ -The S3 Archiver can send Dataverse Archival Bag to a bucket at any S3 endpoint. 
The configuration for the S3 Archiver is independent of any S3 store that may be configured in Dataverse and may, for example, leverage colder (cheaper, slower access) storage. +The S3 Archiver can send Dataverse Archival Bag to a bucket at any S3 endpoint. The configuration for the S3 Archiver is independent of any S3 store that may be configured in Dataverse and may, for example, leverage colder (cheaper, slower access) storage. ``curl http://localhost:8080/api/admin/settings/:ArchiverClassName -X PUT -d "edu.harvard.iq.dataverse.engine.command.impl.S3SubmitToArchiveCommand"`` @@ -2332,7 +2353,7 @@ A batch version of this admin API call is also available: The archiveAllUnarchivedDatasetVersions call takes 3 optional configuration parameters. * listonly=true will cause the API to list dataset versions that would be archived but will not take any action. -* limit= will limit the number of dataset versions archived in one API call to ``<=`` . +* limit= will limit the number of dataset versions archived in one API call to ``<=`` . * latestonly=true will limit archiving to only the latest published versions of datasets instead of archiving all unarchived versions. Note that because archiving is done asynchronously, the calls above will return OK even if the user does not have the *PublishDataset* permission on the dataset(s) involved. Failures are indicated in the log and the archivalStatus calls in the native API can be used to check the status as well. @@ -2764,7 +2785,7 @@ Specifies when to use a smaller datafile proxy object for the purposes of datase and improve performance when reindexing large datasets (e.g. those with hundreds or thousands of files). (Creating the proxy may slightly slow indexing datasets with only a few files.) This setting represents a number of files for which the datafile procy should be used. By default, this is set to Interger.MAX which disables using the proxy. 
-A recommended value would be ~1000 but the optimal value may vary depending on details of your installation. +A recommended value would be ~1000 but the optimal value may vary depending on details of your installation. Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_SOLR_MIN_FILES_TO_USE_PROXY``. @@ -2881,7 +2902,7 @@ Without setting an option, always defaults to testing API endpoint. **Notes:** -- See also these related database settings below: :ref:`:DoiProvider`, +- See also these related database settings below: :ref:`:DoiProvider`, :ref:`:Protocol`, :ref:`:Authority`, :ref:`:Shoulder`. - Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_PID_DATACITE_MDS_API_URL``. @@ -2937,7 +2958,7 @@ Once you have a username from DataCite, you can enter it like this: **Notes:** -- Used in conjuction with :ref:`dataverse.pid.datacite.mds-api-url`, +- Used in conjunction with :ref:`dataverse.pid.datacite.mds-api-url`, :ref:`dataverse.pid.datacite.rest-api-url` and :ref:`dataverse.pid.datacite.password`. - Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_PID_DATACITE_USERNAME``. @@ -2953,7 +2974,7 @@ Once you have a password from your provider, you should create a password alias **Notes:** -- Used in conjuction with :ref:`dataverse.pid.datacite.mds-api-url`, +- Used in conjunction with :ref:`dataverse.pid.datacite.mds-api-url`, :ref:`dataverse.pid.datacite.rest-api-url` and :ref:`dataverse.pid.datacite.username`. - Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_PID_DATACITE_PASSWORD`` (although you shouldn't use @@ -3016,7 +3037,7 @@ Legacy Single PID Provider: dataverse.pid.handlenet.index Related to :ref:`Handle.Net PID provider usage `. Configure your *Handle.Net Index* to be used registering new persistent -identifiers. Defaults to ``300``. +identifiers. Defaults to ``300``.
Indices are used to separate concerns within the Handle system. To add data to an index, authentication is mandatory. See also chapter 1.4 "Authentication" of @@ -3055,7 +3076,7 @@ Legacy Single PID Provider: dataverse.pid.ezid.api-url ++++++++++++++++++++++++++++++++++++++++++++++++++++++ The EZID DOI provider is likely not an option if you are `not associated with -California Digital Library (CDL) or Purdue University +California Digital Library (CDL) or Purdue University `_. Defaults to ``https://ezid.cdlib.org``. @@ -3070,7 +3091,7 @@ Legacy Single PID Provider: dataverse.pid.ezid.username +++++++++++++++++++++++++++++++++++++++++++++++++++++++ The EZID DOI provider is likely not an option if you are `not associated with -California Digital Library (CDL) or Purdue University +California Digital Library (CDL) or Purdue University `_. Works the same way as :ref:`dataverse.pid.datacite.username`, but for the EZID DOI @@ -3088,7 +3109,7 @@ Legacy Single PID Provider: dataverse.pid.ezid.password +++++++++++++++++++++++++++++++++++++++++++++++++++++++ The EZID DOI provider is likely not an option if you are `not associated with -California Digital Library (CDL) or Purdue University +California Digital Library (CDL) or Purdue University `_. Works the same way as :ref:`dataverse.pid.datacite.password`, but for the EZID DOI @@ -3096,7 +3117,7 @@ provider. Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_PID_EZID_PASSWORD`` (although you shouldn't use -environment variables for passwords). +environment variables for passwords). This setting was formerly known as ``doi.password`` and has been renamed. 
You should delete the old JVM option and the wrapped password alias, then recreate @@ -3142,14 +3163,14 @@ For more on Schema.org JSON-LD, see the :doc:`/admin/metadataexport` section of dataverse.useripaddresssourceheader +++++++++++++++++++++++++++++++++++ -**Make sure** to read the section about the :ref:`Security Implications +**Make sure** to read the section about the :ref:`Security Implications ` of using this option earlier in the guide! If set, specifies an HTTP Header such as X-Forwarded-For to use to retrieve the user's IP address. For example: ``./asadmin create-jvm-options '-Ddataverse.useripaddresssourceheader=X-Forwarded-For'`` -This setting is useful in cases such as running your Dataverse installation behind load balancers where the default option of getting the Remote Address from the servlet isn't correct (e.g. it would be the load balancer IP address). Note that unless your installation always sets the header you configure here, this could be used as a way to spoof the user's address. Allowed values are: +This setting is useful in cases such as running your Dataverse installation behind load balancers where the default option of getting the Remote Address from the servlet isn't correct (e.g. it would be the load balancer IP address). Note that unless your installation always sets the header you configure here, this could be used as a way to spoof the user's address. Allowed values are: .. code:: @@ -3164,7 +3185,7 @@ This setting is useful in cases such as running your Dataverse installation behi "HTTP_FORWARDED", "HTTP_VIA", "REMOTE_ADDR" - + .. _dataverse.person-or-org.assume-comma-in-person-name: dataverse.person-or-org.assume-comma-in-person-name @@ -3204,11 +3225,11 @@ dataverse.api.signature-secret Context: Dataverse has the ability to create "Signed URLs" for it's API calls. Using a signed URLs is more secure than providing API tokens, which are long-lived and give the holder all of the permissions of the user. 
In contrast, signed URLs are time limited and only allow the action of the API call in the URL. See :ref:`api-exttools-auth` and -:ref:`api-native-signed-url` for more details. +:ref:`api-native-signed-url` for more details. The key used to sign a URL is created from the API token of the creating user plus a signature-secret provided by an administrator. -**Using a signature-secret is highly recommended.** This setting defaults to an empty string. Using a non-empty -signature-secret makes it impossible for someone who knows an API token from forging signed URLs and provides extra security by +**Using a signature-secret is highly recommended.** This setting defaults to an empty string. Using a non-empty +signature-secret makes it impossible for someone who knows an API token to forge signed URLs and provides extra security by making the overall signing key longer. **WARNING**: @@ -3569,7 +3590,7 @@ See also :ref:`s3-direct-upload-features-disabled`. dataverse.storageuse.disable-storageuse-increments ++++++++++++++++++++++++++++++++++++++++++++++++++ -This setting serves the role of an emergency "kill switch" that will disable maintaining the real time record of storage use for all the datasets and collections in the database. Because of the experimental nature of this feature (see :doc:`/admin/collectionquotas`) that hasn't been used in production setting as of this release, v6.1 this setting is provided in case these updates start causing database race conditions and conflicts on a busy server. +This setting serves the role of an emergency "kill switch" that will disable maintaining the real time record of storage use for all the datasets and collections in the database. Because of the experimental nature of this feature (see :doc:`/admin/collectionquotas`), which hasn't been used in a production setting as of this release (v6.1), this setting is provided in case these updates start causing database race conditions and conflicts on a busy server. 
dataverse.auth.oidc.* +++++++++++++++++++++ @@ -3620,7 +3641,7 @@ Can also be set via *MicroProfile Config API* sources, e.g. the environment vari dataverse.files.globus-monitoring-server ++++++++++++++++++++++++++++++++++++++++ -This setting is required in conjunction with the ``globus-use-experimental-async-framework`` feature flag (see :ref:`feature-flags`). Setting it to true designates the Dataverse instance to serve as the dedicated polling server. It is needed so that the new framework can be used in a multi-node installation. +This setting is required in conjunction with the ``globus-use-experimental-async-framework`` feature flag (see :ref:`feature-flags`). Setting it to true designates the Dataverse instance to serve as the dedicated polling server. It is needed so that the new framework can be used in a multi-node installation. .. _dataverse.csl.common-styles: @@ -3629,7 +3650,7 @@ dataverse.csl.common-styles This setting allows admins to highlight a few of the 1000+ CSL citation styles available from the dataset page. The value should be a comma-separated list of styles. These will be listed above the alphabetical list of all styles in the "View Styled Citations" pop-up. -The default value when not set is "chicago-author-date, ieee". +The default value when not set is "chicago-author-date, ieee". .. _localcontexts: @@ -3774,28 +3795,28 @@ please find all known feature flags below. Any of these flags can be activated u - Allows the use of an OAuth user account (GitHub, Google, or ORCID) when an identity match is found during API bearer authentication. This feature enables automatic association of an incoming IdP identity with an existing OAuth user account, bypassing the need for additional user registration steps. This feature only works when the feature flag ``api-bearer-auth`` is also enabled. 
**Caution: Enabling this flag could result in impersonation risks if (and only if) used with a misconfigured IdP.** - ``Off`` * - avoid-expensive-solr-join - - Changes the way Solr queries are constructed for public content (published Collections, Datasets and Files). It removes a very expensive Solr join on all such documents, improving overall performance, especially for large instances under heavy load. Before this feature flag is enabled, the corresponding indexing feature (see next feature flag) must be turned on and a full reindex performed (otherwise public objects are not going to be shown in search results). See :doc:`/admin/solr-search-index`. + - Changes the way Solr queries are constructed for public content (published Collections, Datasets and Files). It removes a very expensive Solr join on all such documents, improving overall performance, especially for large instances under heavy load. Before this feature flag is enabled, the corresponding indexing feature (see next feature flag) must be turned on and a full reindex performed (otherwise public objects are not going to be shown in search results). See :doc:`/admin/solr-search-index`. - ``Off`` * - add-publicobject-solr-field - - Adds an extra boolean field `PublicObject_b:true` for public content (published Collections, Datasets and Files). Once reindexed with these fields, we can rely on it to remove a very expensive Solr join on all such documents in Solr queries, significantly improving overall performance (by enabling the feature flag above, `avoid-expensive-solr-join`). These two flags are separate so that an instance can reindex their holdings before enabling the optimization in searches, thus avoiding having their public objects temporarily disappear from search results while the reindexing is in progress. + - Adds an extra boolean field `PublicObject_b:true` for public content (published Collections, Datasets and Files). 
Once reindexed with these fields, we can rely on it to remove a very expensive Solr join on all such documents in Solr queries, significantly improving overall performance (by enabling the feature flag above, `avoid-expensive-solr-join`). These two flags are separate so that an instance can reindex their holdings before enabling the optimization in searches, thus avoiding having their public objects temporarily disappear from search results while the reindexing is in progress. - ``Off`` * - reduce-solr-deletes - - Avoids deleting and recreating solr documents for dataset files when reindexing. + - Avoids deleting and recreating solr documents for dataset files when reindexing. - ``Off`` * - disable-return-to-author-reason - - Removes the reason field in the `Publish/Return To Author` dialog that was added as a required field in v6.2 and makes the reason an optional parameter in the :ref:`return-a-dataset` API call. + - Removes the reason field in the `Publish/Return To Author` dialog that was added as a required field in v6.2 and makes the reason an optional parameter in the :ref:`return-a-dataset` API call. - ``Off`` * - disable-dataset-thumbnail-autoselect - Turns off automatic selection of a dataset thumbnail from image files in that dataset. When set to ``On``, a user can still manually pick a thumbnail image or upload a dedicated thumbnail image. - ``Off`` * - globus-use-experimental-async-framework - - Activates a new experimental implementation of Globus polling of ongoing remote data transfers that does not rely on the instance staying up continuously for the duration of the transfers and saves the state information about Globus upload requests in the database. Added in v6.4; extended in v6.6 to cover download transfers, in addition to uploads. Affects :ref:`:GlobusPollingInterval`. 
Note that the JVM option :ref:`dataverse.files.globus-monitoring-server` described above must also be enabled on one (and only one, in a multi-node installation) Dataverse instance. + - Activates a new experimental implementation of Globus polling of ongoing remote data transfers that does not rely on the instance staying up continuously for the duration of the transfers and saves the state information about Globus upload requests in the database. Added in v6.4; extended in v6.6 to cover download transfers, in addition to uploads. Affects :ref:`:GlobusPollingInterval`. Note that the JVM option :ref:`dataverse.files.globus-monitoring-server` described above must also be enabled on one (and only one, in a multi-node installation) Dataverse instance. - ``Off`` * - index-harvested-metadata-source - Index the nickname or the source name (See the optional ``sourceName`` field in :ref:`create-a-harvesting-client`) of the harvesting client as the "metadata source" of harvested datasets and files. If enabled, the Metadata Source facet will show separate groupings of the content harvested from different sources (by harvesting client nickname or source name) instead of the default behavior where there is one "Harvested" grouping for all harvested content. - ``Off`` * - enable-version-note - - Turns on the ability to add/view/edit/delete per-dataset-version notes intended to provide :ref:`provenance` information about why the dataset/version was created. + - Turns on the ability to add/view/edit/delete per-dataset-version notes intended to provide :ref:`provenance` information about why the dataset/version was created. - ``Off`` * - shibboleth-use-wayfinder - This flag allows an instance to use Shibboleth with InCommon federation services. Our original Shibboleth implementation that relies on DiscoFeed can no longer be used since InCommon discontinued their old-style metadata feed. 
An alternative mechanism had to be implemented in order to use WayFinder service, their recommended replacements, instead. @@ -3909,7 +3930,7 @@ Below is an example of setting ``localhost-only``. +++++++++++++++++++++++++++++++++ .. note:: - This setting is deprecated. Please use the JvmSetting :ref:`dataverse.api.blocked.endpoints` instead. This legacy setting will only be used if the newer JvmSettings are not set. + This setting is deprecated. Please use the JvmSetting :ref:`dataverse.api.blocked.endpoints` instead. This legacy setting will only be used if the newer JvmSettings are not set. A comma-separated list of API endpoints to be blocked. For a standard production installation, the installer blocks both "admin" and "builtin-users" by default per the security section above: @@ -4158,7 +4179,7 @@ timestamps. Toggles publishing of file-level PIDs for the entire installation. By default this setting is absent and Dataverse Software assumes it to be false. If enabled, the registration will be performed asynchronously (in the background) during publishing of a dataset. -It is possible to override the installation-wide setting for specific collections, see :ref:`:AllowEnablingFilePIDsPerCollection <:AllowEnablingFilePIDsPerCollection>`. For example, registration of PIDs for files can be enabled in a specific collection when it is disabled instance-wide. Or it can be disabled in specific collections where it is enabled by default. See :ref:`collection-attributes-api` for details. +It is possible to override the installation-wide setting for specific collections, see :ref:`:AllowEnablingFilePIDsPerCollection <:AllowEnablingFilePIDsPerCollection>`. For example, registration of PIDs for files can be enabled in a specific collection when it is disabled instance-wide. Or it can be disabled in specific collections where it is enabled by default. See :ref:`collection-attributes-api` for details. 
To enable file-level PIDs for the entire installation:: @@ -4176,7 +4197,7 @@ If you don't want to register file-based PIDs for your entire installation:: Toggles whether superusers can change the File PIDs policy per collection. By default this setting is absent and Dataverse Software assumes it to be false. -For example, if this setting is true, registration of PIDs for files can be enabled in a specific collection when it is disabled instance-wide. Or it can be disabled in specific collections where it is enabled by default. See :ref:`collection-attributes-api` for details. +For example, if this setting is true, registration of PIDs for files can be enabled in a specific collection when it is disabled instance-wide. Or it can be disabled in specific collections where it is enabled by default. See :ref:`collection-attributes-api` for details. To enable setting file-level PIDs per collection:: @@ -4211,7 +4232,7 @@ For this handle the prefix is '21.T12996' and the suffix is 'USER01'. The comman :FileValidationOnPublishEnabled +++++++++++++++++++++++++++++++ -Toggles validation of the physical files in the dataset when it's published, by recalculating the checksums and comparing against the values stored in the DataFile table. By default this setting is absent and the Dataverse Software assumes it to be true. If enabled, the validation will be performed asynchronously, similarly to how we handle assigning persistent identifiers to datafiles, with the dataset locked for the duration of the publishing process. +Toggles validation of the physical files in the dataset when it's published, by recalculating the checksums and comparing against the values stored in the DataFile table. By default this setting is absent and the Dataverse Software assumes it to be true. If enabled, the validation will be performed asynchronously, similarly to how we handle assigning persistent identifiers to datafiles, with the dataset locked for the duration of the publishing process. 
If you don't want the datafiles to be validated on publish, set: @@ -4372,7 +4393,7 @@ For performance reasons, your Dataverse installation will only allow creation of ``curl -X PUT -d 1000000000 http://localhost:8080/api/admin/settings/:ZipDownloadLimit`` -In the UI, users trying to download a zip file larger than the Dataverse installation's :ZipDownloadLimit will receive messaging that the zip file is too large, and the user will be presented with alternate access options. +In the UI, users trying to download a zip file larger than the Dataverse installation's :ZipDownloadLimit will receive messaging that the zip file is too large, and the user will be presented with alternate access options. :TabularIngestSizeLimit +++++++++++++++++++++++ @@ -4452,19 +4473,19 @@ To enable the setting:: :DisableSolrFacetsForGuestUsers +++++++++++++++++++++++++++++++ -Similar to the above, but will disable the facets for Guest (unauthenticated) users only. +Similar to the above, but will disable the facets for Guest (unauthenticated) users only. :DisableSolrFacetsWithoutJsession +++++++++++++++++++++++++++++++++ -Same idea as with the 2 settings above. For the purposes of this setting, a request is considered "anonymous", if it came in without the JSESSION cookie supplied. A UI user who is browsing the holdings without logging in will have a valid JSESSION cookie, tied to a guest session. The main purpose of this setting is to hide the facets from bots, scripted crawlers and such (most of which - though not all - do not use cookies). Not letting the bots anywhere near the facets can serve a dual purpose on a busy instance experiencing problems with such abuse - some CPU cycles and resources can be saved by not having to generate the facets. And, even more importantly, it can prevent bots from attempting to crawl the facet trees, which has a potential for multiplying the service load. +Same idea as with the 2 settings above. 
For the purposes of this setting, a request is considered "anonymous", if it came in without the JSESSION cookie supplied. A UI user who is browsing the holdings without logging in will have a valid JSESSION cookie, tied to a guest session. The main purpose of this setting is to hide the facets from bots, scripted crawlers and such (most of which - though not all - do not use cookies). Not letting the bots anywhere near the facets can serve a dual purpose on a busy instance experiencing problems with such abuse - some CPU cycles and resources can be saved by not having to generate the facets. And, even more importantly, it can prevent bots from attempting to crawl the facet trees, which has a potential for multiplying the service load. .. _:DisableUncheckedTypesFacet: :DisableUncheckedTypesFacet +++++++++++++++++++++++++++ -Another option for reducing the load on solr on a busy instance. Rather than disabling all the search facets, this setting affects only one - the facet on the upper left of the collection page, where users can select the type of objects to search - Collections ("Dataverses"), Datasets and/or Files. With this option set to true, the numbers of results will only be shown for the types actually selected (i.e. only for the search results currently shown to the user). This minor feature - being able to tell the user how many files (for example) they *would* find, *if* they chose to search for files, by clicking the Files facet - essentially doubles the expense of running the search. That may still be negligible on an instance with lighter holdings, but can make a significant difference for a large and heavily used archive. +Another option for reducing the load on solr on a busy instance. Rather than disabling all the search facets, this setting affects only one - the facet on the upper left of the collection page, where users can select the type of objects to search - Collections ("Dataverses"), Datasets and/or Files. 
With this option set to true, the numbers of results will only be shown for the types actually selected (i.e. only for the search results currently shown to the user). This minor feature - being able to tell the user how many files (for example) they *would* find, *if* they chose to search for files, by clicking the Files facet - essentially doubles the expense of running the search. That may still be negligible on an instance with lighter holdings, but can make a significant difference for a large and heavily used archive. .. _:SignUpUrl: @@ -4758,7 +4779,7 @@ To check the current value of ``:ShibAffiliationAttribute``: +++++++++++++++++++++++++++++++++++++++++++ It seems that the application server (usually Glassfish or Payara) will interpret all Shibboleth attributes that come through AJP as ISO-8859-1, even if they where originally UTF-8. -To circumvent that, we re-encode all received Shibboleth attributes manually as UTF-8 by default. +To circumvent that, we re-encode all received Shibboleth attributes manually as UTF-8 by default. In the case you get garbled characters in Shibboleth-supplied fields (e.g. given name, surname, affiliation), you can disable this behaviour by setting ShibAttributeCharacterSetConversionEnabled to false: ``curl -X PUT -d false http://localhost:8080/api/admin/settings/:ShibAttributeCharacterSetConversionEnabled`` @@ -4770,7 +4791,7 @@ If you managed to get correct accented characters from shibboleth while this set Will select the last or first value of an array in affiliation, the array separator can be set using ``:ShibAffiliationSeparator`` . -To select the last value : +To select the last value : ``curl -X PUT -d "lastAffiliation" http://localhost:8080/api/admin/settings/:ShibAffiliationOrder`` @@ -4962,7 +4983,7 @@ or a comma-separated list of allowed origins. :ChronologicalDateFacets ++++++++++++++++++++++++ -Unlike other facets, those indexed by Date/Year are sorted chronologically by default, with the most recent value first. 
To have them sorted by number of hits, e.g. with the year with the most results first, set this to false +Unlike other facets, those indexed by Date/Year are sorted chronologically by default, with the most recent value first. To have them sorted by number of hits, e.g. with the year with the most results first, set this to false If you don’t want date facets to be sorted chronologically, set: @@ -4976,11 +4997,11 @@ redirecing bulk/multi-file zip download requests to that location, instead of se See :ref:`zipdownloader` of the Advanced Installation guide for information on how to install the external zipper. (This is still an **experimental** feature, as of Dataverse Software 5.0). -To enable redirects to the zipper installed on the same server as the main Dataverse Software application: +To enable redirects to the zipper installed on the same server as the main Dataverse Software application: ``curl -X PUT -d '/cgi-bin/zipdownload' http://localhost:8080/api/admin/settings/:CustomZipDownloadServiceUrl`` -To enable redirects to the zipper on a different server: +To enable redirects to the zipper on a different server: ``curl -X PUT -d 'https://zipper.example.edu/cgi-bin/zipdownload' http://localhost:8080/api/admin/settings/:CustomZipDownloadServiceUrl`` @@ -5034,20 +5055,20 @@ Your Dataverse installation can export archival "Bag" files to an extensible set This setting specifies which storage system to use by identifying the particular Java class that should be run. Current configuration options include DuraCloudSubmitToArchiveCommand, LocalSubmitToArchiveCommand, GoogleCloudSubmitToArchiveCommand, and S3SubmitToArchiveCommand. For examples, see the specific configuration above in :ref:`BagIt Export`. - + :ArchiverSettings +++++++++++++++++ Each Archiver class may have its own custom settings. Along with setting which Archiver class to use, one must use this setting to identify which setting values should be sent to it when it is invoked. 
The value should be a comma-separated list of setting names. For example, the LocalSubmitToArchiveCommand only uses the :BagItLocalPath setting. To allow the class to use that setting, this setting must set as: -``curl -X PUT -d ':BagItLocalPath' http://localhost:8080/api/admin/settings/:ArchiverSettings`` +``curl -X PUT -d ':BagItLocalPath' http://localhost:8080/api/admin/settings/:ArchiverSettings`` :BagGeneratorThreads ++++++++++++++++++++ An archiver setting shared by several implementations (e.g. DuraCloud, Google, and Local) that can make Bag generation use fewer or more threads in zipping datafiles that the default of 2 - + ``curl http://localhost:8080/api/admin/settings/:BagGeneratorThreads -X PUT -d '8'`` :DuraCloudHost @@ -5087,14 +5108,14 @@ As explained under :ref:`Branding Your Installation`, by default, the name of th :ExportInstallationAsDistributorOnlyWhenNotSet ++++++++++++++++++++++++++++++++++++++++++++++ -In the DDI metadata exports, the default behavior is to always add the repository (using its brandname - the root collection name or the value of :ref:`:InstallationName <:InstallationName>`) to the stdyDscr/distStmt/distrbtr element. If this setting is true, this will only be done when a Distributor is not already defined in the Dataset metadata. (Note that, since metadata export files are cached, they will have to be reexported (see :doc:`/admin/metadataexport`) before they incorporate a change in this setting.) +In the DDI metadata exports, the default behavior is to always add the repository (using its brandname - the root collection name or the value of :ref:`:InstallationName <:InstallationName>`) to the stdyDscr/distStmt/distrbtr element. If this setting is true, this will only be done when a Distributor is not already defined in the Dataset metadata. (Note that, since metadata export files are cached, they will have to be reexported (see :doc:`/admin/metadataexport`) before they incorporate a change in this setting.) .. 
_:AnonymizedFieldTypeNames: :AnonymizedFieldTypeNames +++++++++++++++++++++++++ -A comma-separated list of field type names that should be 'withheld' when dataset access occurs via a Private Url with Anonymized Access (e.g. to support anonymized review). +A comma-separated list of field type names that should be 'withheld' when dataset access occurs via a Private Url with Anonymized Access (e.g. to support anonymized review). A suggested minimum includes author, datasetContact, and contributor, but additional fields such as depositor, grantNumber, and publication might also need to be included. ``curl -X PUT -d 'author, datasetContact, contributor, depositor, grantNumber, publication' http://localhost:8080/api/admin/settings/:AnonymizedFieldTypeNames`` @@ -5133,7 +5154,7 @@ Also refer to the "Datafile Integrity" API :ref:`datafile-integrity` ++++++++++++++++++++++++++++++++++ A boolean setting that, if true, will send an email and notification to users when a Dataset is created. Messages go to those, other than the dataset creator, who have the ability/permission necessary to publish the dataset. The intent of this functionality is to simplify tracking activity and planning to follow-up contact. - + ``curl -X PUT -d true http://localhost:8080/api/admin/settings/:SendNotificationOnDatasetCreation`` .. _:CVocConf: @@ -5241,7 +5262,7 @@ For example, once the following setting is created: Specifies a custom error message shown to the user when a Dataverse collection fails an external metadata validation (as specified in the setting above) during an attempt to publish. If not specified, the default message "This dataverse collection cannot be published because it has failed an external metadata validation test" will be used. -For example: +For example: ``curl -X PUT -d "This content needs to go through an additional review by the Curation Team before it can be published." 
http://localhost:8080/api/admin/settings/:DataverseMetadataPublishValidationFailureMsg`` @@ -5268,7 +5289,7 @@ In some ways this duplicates a workflow mechanism, since it is possible to defin Specifies a custom error message shown to the user when a dataset fails an external metadata validation (as specified in the setting above) during an attempt to publish. If not specified, the default message "This dataset cannot be published because it has failed an external metadata validation test" will be used. -For example: +For example: ``curl -X PUT -d "This content needs to go through an additional review by the Curation Team before it can be published." http://localhost:8080/api/admin/settings/:DatasetMetadataValidationFailureMsg`` @@ -5276,7 +5297,7 @@ For example: :ExternalValidationAdminOverride ++++++++++++++++++++++++++++++++ -When set to ``true``, this setting allows a superuser to publish and/or update Dataverse collections and datasets bypassing the external validation checks (specified by the settings above). In an event where an external script is reporting validation failures that appear to be in error, this option gives an admin with superuser privileges a quick way to publish the dataset or update a collection for the user. +When set to ``true``, this setting allows a superuser to publish and/or update Dataverse collections and datasets bypassing the external validation checks (specified by the settings above). In an event where an external script is reporting validation failures that appear to be in error, this option gives an admin with superuser privileges a quick way to publish the dataset or update a collection for the user. .. 
_:FileCategories: @@ -5357,7 +5378,7 @@ The interval in seconds between Dataverse calls to Globus to check on upload pro :GlobusBatchLookupSize ++++++++++++++++++++++ -In the initial implementation, when files were added to the dataset upon completion of a Globus upload task, Dataverse would make a separate Globus API call to look up the size of every new file. This proved to be a significant bottleneck at Harvard Dataverse with users transferring batches of many thousands of files (this in turn was made possible by the Globus improvements in v6.4). An optimized lookup mechanism was added in response, where the Globus Service makes a listing API call on the entire remote folder, then populates the file sizes for all the new file entries before passing them to the Ingest service. This approach however may in fact slow things down in a scenario where there are already thousands of files in the Globus folder for the dataset, and only a small number of new files are being added. To address this, the number of files in a batch for which this method should be used was made configurable. If not set, it will default to 50 (a completely arbitrary number). Setting it to 0 will always use this method with Globus uploads. Setting it to some very large number will disable it completely. This was made a database setting, as opposed to a JVM option, in order to make it configurable in real time. +In the initial implementation, when files were added to the dataset upon completion of a Globus upload task, Dataverse would make a separate Globus API call to look up the size of every new file. This proved to be a significant bottleneck at Harvard Dataverse with users transferring batches of many thousands of files (this in turn was made possible by the Globus improvements in v6.4). 
An optimized lookup mechanism was added in response, where the Globus Service makes a listing API call on the entire remote folder, then populates the file sizes for all the new file entries before passing them to the Ingest service. This approach however may in fact slow things down in a scenario where there are already thousands of files in the Globus folder for the dataset, and only a small number of new files are being added. To address this, the number of files in a batch for which this method should be used was made configurable. If not set, it will default to 50 (a completely arbitrary number). Setting it to 0 will always use this method with Globus uploads. Setting it to some very large number will disable it completely. This was made a database setting, as opposed to a JVM option, in order to make it configurable in real time. :GlobusSingleFileTransfer +++++++++++++++++++++++++ @@ -5378,7 +5399,7 @@ To use the current GDCC version directly: :CategoryOrder ++++++++++++++ -A comma separated list of Category/Tag names defining the order in which files with those tags should be displayed. +A comma separated list of Category/Tag names defining the order in which files with those tags should be displayed. The setting can include custom tag names along with the pre-defined tags (Documentation, Data, and Code are the defaults but the :ref:`:FileCategories` setting can be used to use a different set of tags). The default is category ordering disabled. @@ -5394,7 +5415,7 @@ A true(default)/false option determining whether datafiles listed on the dataset :AllowUserManagementOfOrder +++++++++++++++++++++++++++ -A true/false (default) option determining whether the dataset datafile table display includes checkboxes enabling users to turn folder ordering and/or category ordering (if an order is defined by :CategoryOrder) on and off dynamically. 
+A true/false (default) option determining whether the dataset datafile table display includes checkboxes enabling users to turn folder ordering and/or category ordering (if an order is defined by :CategoryOrder) on and off dynamically. ``curl -X PUT -d true http://localhost:8080/api/admin/settings/:AllowUserManagementOfOrder`` @@ -5418,7 +5439,7 @@ Access API will be able to take advantage of Direct Download for tab. files saved with these headers on S3 - since they no longer have to be generated and added to the streamed file on the fly. -The setting is ``false`` by default, preserving the legacy behavior. +The setting is ``false`` by default, preserving the legacy behavior. :RateLimitingDefaultCapacityTiers +++++++++++++++++++++++++++++++++ From 4c34917ba4f373441152eec8066bcea5549ff2b4 Mon Sep 17 00:00:00 2001 From: Eryk Kulikowski Date: Mon, 13 Oct 2025 16:46:11 +0200 Subject: [PATCH 12/29] wording --- doc/release-notes/11744-cors-echo-origin-vary.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/release-notes/11744-cors-echo-origin-vary.md b/doc/release-notes/11744-cors-echo-origin-vary.md index 5110bdee648..48eaa3b96f9 100644 --- a/doc/release-notes/11744-cors-echo-origin-vary.md +++ b/doc/release-notes/11744-cors-echo-origin-vary.md @@ -31,7 +31,7 @@ dataverse.cors.methods=GET, POST, OPTIONS, PUT, DELETE ## Compatibility -- Must configure `dataverse.cors.origin`; `:AllowCors` no longer sufficient. +- Must configure `dataverse.cors.origin`; `:AllowCors` was deprecated and has now been removed. - Any `*` triggers wildcard (no per-origin echo / no Vary header). 
## Docs From 7208c83f21111b85ae36988307edf59117b7d4a2 Mon Sep 17 00:00:00 2001 From: Eryk Kulikowski Date: Mon, 13 Oct 2025 16:50:32 +0200 Subject: [PATCH 13/29] Removed deprecated (and removed from code) AllowCors setting from doc --- doc/sphinx-guides/source/installation/config.rst | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 4694bd32467..c78c4b11272 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -4968,18 +4968,6 @@ This can be helpful in situations where multiple organizations are sharing one D or ``curl -X PUT -d '*' http://localhost:8080/api/admin/settings/:InheritParentRoleAssignments`` -:AllowCors (Deprecated – no longer used once dataverse.cors.* settings exist) -+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -.. note:: - This legacy database setting has been superseded by the ``dataverse.cors.*`` JVM/MicroProfile settings. In current versions CORS is only enabled when ``dataverse.cors.origin`` is explicitly set. Existing values of ``:AllowCors`` are ignored if ``dataverse.cors.origin`` is unset. - -Historical behavior (prior versions) allowed setting ``:AllowCors`` to ``true``/``false``. Administrators should migrate to the JVM/MicroProfile setting: - -``./asadmin create-jvm-options '-Ddataverse.cors.origin=*'`` - -or a comma-separated list of allowed origins. 
- :ChronologicalDateFacets ++++++++++++++++++++++++ From fe15c0cd00f31fdae58ed86fc1a1588fa2d18dc1 Mon Sep 17 00:00:00 2001 From: Eryk Kulikowski Date: Mon, 13 Oct 2025 16:53:05 +0200 Subject: [PATCH 14/29] Fix formatting inconsistencies in dataset management documentation --- .../source/user/dataset-management.rst | 88 +++++++++---------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/doc/sphinx-guides/source/user/dataset-management.rst b/doc/sphinx-guides/source/user/dataset-management.rst index 2961242ca77..765d737d847 100755 --- a/doc/sphinx-guides/source/user/dataset-management.rst +++ b/doc/sphinx-guides/source/user/dataset-management.rst @@ -12,7 +12,7 @@ A dataset in a Dataverse installation is a container for your data, documentatio Supported Metadata ================== -A dataset contains three levels of metadata: +A dataset contains three levels of metadata: #. **Citation Metadata**: any metadata that would be needed for generating a data citation and other general metadata that could be applied to any dataset; #. **Domain Specific Metadata**: with specific support currently for Social Science, Life Science, Geospatial, and Astronomy datasets; and @@ -56,11 +56,11 @@ Adding a New Dataset #. When entering author identifiers, select the type from the dropdown (e.g. "ORCID") and under "Identifier" enter the full URL (e.g. "https://orcid.org/0000-0002-1825-0097") for identifiers that have a URL form. The shorter form of the unique identifier (e.g. "0000-0002-1825-0097") can also be entered, but URL form is preferred when available. -#. Scroll down to the "Files" section and click on "Select Files to Add" to add all the relevant files to your Dataset. +#. Scroll down to the "Files" section and click on "Select Files to Add" to add all the relevant files to your Dataset. You can also upload your files directly from your Dropbox. **Tip:** You can drag and drop or select multiple files at a time from your desktop directly into the upload widget. 
Your files will appear below the "Select Files to Add" button where you can add a description and tags (via the "Edit Tag" button) for each file. Additionally, an MD5 checksum will be added for each file. If you upload a tabular file a :ref:`Universal Numerical Fingerprint (UNF) ` will be added to this file. -#. Click the "Save Dataset" button when you are done. Your unpublished dataset is now created. +#. Click the "Save Dataset" button when you are done. Your unpublished dataset is now created. Note: You can add additional metadata once you have completed the initial dataset creation by going to clicking the Edit button and selecting Metadata from the dropdown menu. @@ -69,8 +69,8 @@ Note: You can add additional metadata once you have completed the initial datase Supported HTML Tags ------------------- -We currently only support the following HTML tags for any of our textbox metadata fields (i.e., Description) : <a>, <b>, <blockquote>, 
-<br>, <code>, <del>, <dd>, <dl>, <dt>, <em>, <hr>, <h1>-<h3>, <i>, <img>, <kbd>, <li>, <ol>, <p>, <pre>, <s>, <sup>, <sub>, 
+We currently only support the following HTML tags for any of our textbox metadata fields (i.e., Description) : <a>, <b>, <blockquote>,
+<br>, <code>, <del>, <dd>, <dl>, <dt>, <em>, <hr>, <h1>-<h3>, <i>, <img>, <kbd>, <li>, <ol>, <p>, <pre>, <s>, <sup>, <sub>,
 <strong>, <strike>, <u>,