diff --git a/doc/sphinx-guides/source/api/native-api.rst b/doc/sphinx-guides/source/api/native-api.rst index 1a1604886c6..4a7760be882 100644 --- a/doc/sphinx-guides/source/api/native-api.rst +++ b/doc/sphinx-guides/source/api/native-api.rst @@ -3245,7 +3245,7 @@ The fully expanded example above (without environment variables) looks like this .. code-block:: bash - curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X POST "https://demo.dataverse.org/api/datasets/:persistentId/files/metadata?:persistentId=doi:10.5072/FK2/J8SJZB" --upload-file file-metadata-update.json + curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X POST "https://demo.dataverse.org/api/datasets/:persistentId/files/metadata?persistentId=doi:10.5072/FK2/J8SJZB" --upload-file file-metadata-update.json The ``file-metadata-update.json`` file should contain a JSON array of objects, each representing a file to be updated. Here's an example structure: diff --git a/doc/sphinx-guides/source/user/dataset-management.rst b/doc/sphinx-guides/source/user/dataset-management.rst index 9c389ef4be3..ebe9270e79f 100755 --- a/doc/sphinx-guides/source/user/dataset-management.rst +++ b/doc/sphinx-guides/source/user/dataset-management.rst @@ -145,7 +145,7 @@ Beginning with Dataverse Software 5.0, the way a Dataverse installation handles - Files with the same checksum can be included in a dataset, even if the files are in the same directory. - Files with the same filename can be included in a dataset as long as the files are in different directories. - If a user uploads a file to a directory where a file already exists with that directory/filename combination, the Dataverse installation will adjust the file path and names by adding "-1" or "-2" as applicable. This change will be visible in the list of files being uploaded. -- If the directory or name of an existing or newly uploaded file is edited in such a way that would create a directory/filename combination that already exists, the Dataverse installation will display an error. +- If the directory or name of an existing or newly uploaded file is edited in such a way that would create a directory/filename combination that already exists, or the new directory/filename exists as directory, the Dataverse installation will display an error. - If a user attempts to replace a file with another file that has the same checksum, an error message will be displayed and the file will not be able to be replaced. - If a user attempts to replace a file with a file that has the same checksum as a different file in the dataset, a warning will be displayed. diff --git a/scripts/issues/12407/find_duplicates.py b/scripts/issues/12407/find_duplicates.py new file mode 100644 index 00000000000..89290311cc9 --- /dev/null +++ b/scripts/issues/12407/find_duplicates.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +import argparse +import psycopg2 +from pathlib import Path +from textwrap import dedent + +def read_sql(path: Path) -> str: + text = path.read_text(encoding="utf-8") + return "\n".join( + line for line in text.splitlines() if not line.lstrip().startswith("\\") + ) + + +def fetch_dv_ids(conn, find_dv_ids_sql: str) -> list[int]: + with conn.cursor() as cur: + cur.execute(find_dv_ids_sql) + rows = cur.fetchall() + + # Query returns dv_id as first selected column in your file. + return [int(row[0]) for row in rows] + + +def fetch_dataset_info(conn, datasetversion_id: int): + dataset_query = """ + SELECT dso.protocol, dso.authority, dso.identifier, dv.versionnumber, dv.minorversionnumber + FROM datasetversion dv + JOIN dvobject dso ON dso.id = dv.dataset_id + WHERE dv.id = %s \ + """ + with conn.cursor() as cur: + cur.execute(dataset_query, (datasetversion_id,)) + return cur.fetchone() + return None + + +def run_find_duplicates(conn, find_duplicates_sql: str): + last_dv_id = None + last_info = ("", "", "", "", "") + + with conn.cursor() as cur: + cur.execute(find_duplicates_sql) + cols = [d[0] for d in cur.description] + + extra_cols = ["protocol", "authority", "dataset_id", "versionnumber", "minorversionnumber"] + print("\t".join(cols + extra_cols)) + + for row in cur: + dv_id = int(row[0]) # datasetversion_id + + if dv_id != last_dv_id: + fetched = fetch_dataset_info(conn, dv_id) + last_info = fetched if fetched is not None else ("", "", "", "", "") + last_dv_id = dv_id + + print("\t".join("" if v is None else str(v) for v in (tuple(row) + tuple(last_info)))) + + +def main(): + class RawDefaultsFormatter( + argparse.ArgumentDefaultsHelpFormatter, + argparse.RawDescriptionHelpFormatter, + ): + pass + + parser = argparse.ArgumentParser( + description=dedent(""" + Execute as owner of dvndb. + + `find_duplicates.sql` is executed for dv_ids returned by `find_dv_ids.sql`. + `find_dv_ids.sql` returns the latest version per dataset. + """), + formatter_class=RawDefaultsFormatter, + ) + parser.add_argument("--min-id", type=int, default=0, help="first dataset-version-id examined by `find_dv_ids.sql`") + parser.add_argument("--nr-of-ids", type=int, default=50, help="number of ID's returned by `find_dv_ids.sql`") + args = parser.parse_args() + conn_kwargs = {"dbname": 'dvndb'} + + script_dir = Path(__file__).resolve().parent + + dup_sql_raw = read_sql(script_dir / "find_duplicates.sql") + + dv_sql = read_sql(script_dir / "find_dv_ids.sql") + dv_sql = dv_sql.replace(":min_id", str(args.min_id)) + dv_sql = dv_sql.replace(":nr_of_ids", str(args.nr_of_ids)) + + try: + with psycopg2.connect(**conn_kwargs) as conn: + dv_ids = fetch_dv_ids(conn, dv_sql) + + if not dv_ids: + print("No dv_id values returned by find_dv_ids.sql") + return + + ids_csv = ",".join(str(i) for i in dv_ids) + print(f"dataset version ids: {ids_csv}") + run_find_duplicates(conn, dup_sql_raw.replace(":ids", ids_csv)) + except psycopg2.OperationalError as e: + msg = str(e) + if "no password supplied" in msg.lower(): + parser.print_help() + raise SystemExit(2) + print(f"Database connection failed: {e}") + raise SystemExit(1) + +if __name__ == "__main__": + main() diff --git a/scripts/issues/12407/find_duplicates.sql b/scripts/issues/12407/find_duplicates.sql new file mode 100644 index 00000000000..e96420e9684 --- /dev/null +++ b/scripts/issues/12407/find_duplicates.sql @@ -0,0 +1,35 @@ +\set ids 5,7,9 +WITH dir_ancestors AS ( + SELECT DISTINCT + datasetversion_id, + array_to_string((string_to_array(path, '/'))[1:n], '/') AS path + FROM ( + SELECT DISTINCT + datasetversion_id, + NULLIF(BTRIM(directorylabel), '') AS path + FROM filemetadata + WHERE datasetversion_id IN (:ids) + AND NULLIF(BTRIM(directorylabel), '') IS NOT NULL + ) dirs + CROSS JOIN LATERAL generate_series( + 1, cardinality(string_to_array(path, '/')) + ) AS g(n) + ), + file_paths AS ( + SELECT DISTINCT + datasetversion_id, + CASE + WHEN NULLIF(BTRIM(directorylabel), '') IS NULL THEN label + ELSE NULLIF(BTRIM(directorylabel), '') || '/' || label + END AS path + FROM filemetadata + WHERE datasetversion_id IN (:ids) + ) +SELECT datasetversion_id, path +FROM dir_ancestors + +INTERSECT + +SELECT datasetversion_id, path +FROM file_paths +ORDER BY datasetversion_id, path; diff --git a/scripts/issues/12407/find_dv_ids.sql b/scripts/issues/12407/find_dv_ids.sql new file mode 100644 index 00000000000..52cb99aef6e --- /dev/null +++ b/scripts/issues/12407/find_dv_ids.sql @@ -0,0 +1,35 @@ +\set min_id 0 +\set nr_of_ids 50 + +WITH ranked AS ( + SELECT + dso.id AS dso_id, + dso.protocol, + dso.authority, + dso.identifier, + dv.id AS dv_id, + dv.versionnumber, + dv.minorversionnumber, + ROW_NUMBER() OVER ( + PARTITION BY dso.id + ORDER BY + dv.versionnumber DESC, + dv.minorversionnumber DESC, + dv.id DESC + ) AS rn + FROM datasetversion dv + JOIN dvobject dso ON dso.id = dv.dataset_id +) +SELECT + dv_id, + dso_id, + protocol, + authority, + identifier, + versionnumber, + minorversionnumber +FROM ranked +WHERE rn = 1 + AND dv_id >= :min_id +ORDER BY dv_id + LIMIT :nr_of_ids; diff --git a/scripts/issues/12407/test-apis.py b/scripts/issues/12407/test-apis.py new file mode 100644 index 00000000000..2b60b884cd0 --- /dev/null +++ b/scripts/issues/12407/test-apis.py @@ -0,0 +1,158 @@ +import requests +from requests_toolbelt.multipart.encoder import MultipartEncoder +from datetime import datetime +import json + +########################## configuration for a draft dataset without files + +dataverse_server = 'https://dev.archaeology.datastations.nl' +api_key = '5623d6e3-bc94-40a5-8de0-8ebdf9f58cbc' +persistentId = 'doi:10.5072/DAR/HBGPN5' + +#################### +print (' preparation: add file foo/bar ' + ('-' * 40)) + +url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId) +files = {'file': ('bar', ('content2: %s' % datetime.now()))} +jason_data = {"jsonData": json.dumps({"directoryLabel": "foo"})}# conflicting dir +r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=jason_data, files=files, verify=False) +print (r.status_code) +print (r.json()) + +#################### +print (' preparation: add file foo.tab/bar ' + ('-' * 40)) + +url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId) +files = {'file': ('bar', ('content2: %s' % datetime.now()))} +jason_data = {"jsonData": json.dumps({"directoryLabel": "foo.tab"})}# conflicting dir +r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=jason_data, files=files, verify=False) +print (r.status_code) +print (r.json()) + +#################### +print (' preparation: add file x to have a file to change ' + ('-' * 40)) + +### +url = '%s/api/datasets/:persistentId/add?&persistentId=%s' % (dataverse_server, persistentId) +unique_content = 'content2: %s' % datetime.now() +files = {'file': ('x', unique_content)} +jason_data = {"jsonData": json.dumps({"label": "x"})} +r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=jason_data, files=files, verify=False) +print (r.status_code) +print (r.json()) + +file_id = r.json()['data']['files'][0]['dataFile']['id'] + +#################### +print (' file conflicting with existing dir gets sequence number ' + ('-' * 40)) + +### +url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId) +files = {'file': ('foo', ('content2: %s' % datetime.now()))} +jason_data = {"jsonData": json.dumps({"label": "foo"})} +r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=jason_data, files=files, verify=False) + +print (r.json()) +print (r.status_code) + +#################### +print (' tabular file conflicting with existing dir gets seq nr once converted to .tab ' + ('-' * 40)) + +url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId) +files = {'file': ('foo.csv', ('header1,header2\nvalue1,%s' % datetime.now()))} +jason_data = {"jsonData": json.dumps({"label": "foo.csv"})} +r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=jason_data, files=files, verify=False) +print (r.status_code) +print (r.json()) + +#################### +print (' files API metadata: dir foo/bar conflicts with previously created file foo/bar: returns bad-request ' + ('-' * 40)) + +### files API https://guides.dataverse.org/en/latest/api/native-api.html#updating-file-metadata +url = f'{dataverse_server}/api/files/{file_id}/metadata' +files = {'jsonData': (None, '{"directoryLabel": "foo/bar", "label": "files-api.txt"} ' + ('-' * 40))} +r = requests.post(url, headers={'X-Dataverse-key': api_key}, files=files, verify=False) + +print(r.status_code) +print(r.text) + +#################### +print ('datasets API update existing file into name conflicting with existing dir: returns bad-request ' + ('-' * 40)) + +### datasets API https://guides.dataverse.org/en/latest/api/native-api.html#update-file-metadata +url = f'{dataverse_server}/api/datasets/:persistentId/files/metadata?key={api_key}&persistentId={persistentId}' +json_content = [{"dataFileId": file_id, "directoryLabel": "foo/bar", "label": "datasets-api.txt"}] +headers = {'X-Dataverse-key': api_key, 'Content-Type': 'application/json'} +r = requests.post(url, headers=headers, json=json_content, verify=False) + +print(r.status_code) +print(r.text) + +#################### +print ('datasets API add file conflicting with existing file: gets seq nr ' + ('-' * 40)) + +url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId) +files = {'file': ('fox', ('content2: %s' % datetime.now()))} +jason_data = {"jsonData": json.dumps({"label": "x"})} +r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=jason_data, files=files, verify=False) + +print (r.json()) +print (r.status_code) + +#################### +print ('dataset API add dir conflicting with existing file: returns bad-request ' + ('-' * 40)) + +url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId) +files = {'file': ('foo', ('content2: %s' % datetime.now()))} +jason_data = {"jsonData": json.dumps({"label": "dir-conflicts-with-file.txt", "directoryLabel": "foo/bar"})} +r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=jason_data, files=files, verify=False) + +print (r.json()) +print (r.status_code) + +#################### +print (' datasets API: another file on existing dir is OK ' + ('-' * 40)) + +url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId) +files = {'file': ('beer', ('content2: %s' % datetime.now()))} +jason_data = {"jsonData": json.dumps({"directoryLabel": "foo"})}# conflicting dir +r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=jason_data, files=files, verify=False) +print (r.status_code) +print (r.json()) + +#################### +print (' datasets API: a file with different capitalization is OK ' + ('-' * 40)) + +url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId) +files = {'file': ('Beer', ('content2: %s' % datetime.now()))} +jason_data = {"jsonData": json.dumps({"directoryLabel": "foo"})}# conflicting dir +r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=jason_data, files=files, verify=False) +print (r.status_code) +print (r.json()) + +#################### +print (' files API replace: dir foo/bar conflicts with previously created file: returns bad-request ' + ('-' * 40)) + +url = f'{dataverse_server}/api/files/{file_id}/replace' +files = { + 'jsonData': (None, '{"directoryLabel": "foo/bar", "label": "x", "forceReplace":true} ' + ('-' * 40)), + 'file': ('foo', ('content2: %s' % datetime.now())) +} +r = requests.post(url, headers={'X-Dataverse-key': api_key}, files=files, verify=False) + +print(r.status_code) +print(r.text) + +#################### +# not configured on DANS VM? Might also have no added value over previous test. +# +# print (' datasets API remote file: file foo conflicts with previously created dir: returns bad-request ???? ' + ('-' * 40)) +# +# url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId) +# files = { +# 'jsonData': (None, '{"directoryLabel": "foo/bar", "label": "x", "forceReplace":true, "description":"A remote image.","storageIdentifier":"file://themes/custom/qdr/images/01234567890-012345678901","checksumType":"MD5","md5Hash":"509ef88afa907eaf2c17c1c8d8fde77e","fileName":"testlogo.png","mimeType":"image/png"} ' + ('-' * 40)), +# } +# r = requests.post(url, headers={'X-Dataverse-key': api_key}, files=files, verify=False) +# +# print(r.status_code) +# print(r.text) diff --git a/src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java b/src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java index 46d17d05363..f2b263f7ab3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java @@ -1068,7 +1068,9 @@ public String save() { storageSizeStr = null; // Let this re-calculate after the calling save() Collection duplicates = IngestUtil.findDuplicateFilenames(workingVersion, newFiles); if (!duplicates.isEmpty()) { - JH.addMessage(FacesMessage.SEVERITY_ERROR, BundleUtil.getStringFromBundle("dataset.message.filesFailure"), BundleUtil.getStringFromBundle("dataset.message.editMetadata.duplicateFilenames", new ArrayList<>(duplicates))); + var arguments = List.of(String.join(", ", duplicates)); + JH.addMessage(FacesMessage.SEVERITY_ERROR, BundleUtil.getStringFromBundle("dataset.message.filesFailure"), BundleUtil.getStringFromBundle("dataset.message.editMetadata.duplicateFilenames", + arguments)); return null; } if (!saveEnabled) { diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index 136b6dbb69b..183b1e9b7ac 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ -4899,9 +4899,10 @@ public Response updateMultipleFileMetadata(@Context ContainerRequestContext crc, List fmdListMinusCurrentFile = new ArrayList<>(fileMetadataMapCopy.values()); - if (IngestUtil.conflictsWithExistingFilenames(pathPlusFilename, fmdListMinusCurrentFile)) { + var conflictingPart = IngestUtil.findConflictingPathPart(pathPlusFilename, fmdListMinusCurrentFile); + if (conflictingPart.isPresent()) { return error(BAD_REQUEST, BundleUtil.getStringFromBundle("files.api.metadata.update.duplicateFile", - Arrays.asList(pathPlusFilename))); + conflictingPart.stream().toList())); } // Apply optional params diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Files.java b/src/main/java/edu/harvard/iq/dataverse/api/Files.java index 0a1b19985a4..2aad5533c00 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Files.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Files.java @@ -512,8 +512,10 @@ public Response updateFileMetadata(@Context ContainerRequestContext crc, @FormDa } } - if (IngestUtil.conflictsWithExistingFilenames(pathPlusFilename, fmdListMinusCurrentFile)) { - return error(BAD_REQUEST, BundleUtil.getStringFromBundle("files.api.metadata.update.duplicateFile", Arrays.asList(pathPlusFilename))); + var conflictingPart = IngestUtil.findConflictingPathPart(pathPlusFilename, fmdListMinusCurrentFile); + if (conflictingPart.isPresent()) { + return error(BAD_REQUEST, BundleUtil.getStringFromBundle("files.api.metadata.update.duplicateFile", + conflictingPart.stream().toList())); } optionalFileParams.addOptionalParams(upFmd); diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestUtil.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestUtil.java index 3d30f7e6ec3..a2eb0f70ec3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestUtil.java @@ -31,11 +31,10 @@ import java.util.HashSet; import java.util.Iterator; import java.util.List; +import java.util.Optional; import java.util.Set; import java.util.logging.Logger; -import jakarta.json.Json; -import jakarta.json.JsonArrayBuilder; -import jakarta.json.JsonObjectBuilder; + import org.dataverse.unf.UNFUtil; import org.dataverse.unf.UnfException; @@ -60,7 +59,8 @@ public static void checkForDuplicateFileNamesFinal(DatasetVersion version, List< // Step 1: create list of existing path names from all FileMetadata in the DatasetVersion // unique path name: directoryLabel + file separator + fileLabel - Set pathNamesExisting = existingPathNamesAsSet(version, ((fileToReplace == null) ? null : fileToReplace.getFileMetadata())); + Set pathNamesExisting = existingPathNamesAsSet(version, ((fileToReplace == null) ? null : fileToReplace.getFileMetadata())); + var existingWithoutNew = new HashSet<>(pathNamesExisting); // avoid side effect of duplicateFilenameCheck // Step 2: check each new DataFile against the list of path names, if a duplicate create a new unique file name for (Iterator dfIt = newFiles.iterator(); dfIt.hasNext();) { @@ -68,11 +68,23 @@ public static void checkForDuplicateFileNamesFinal(DatasetVersion version, List< fm.setLabel(duplicateFilenameCheck(fm, pathNamesExisting)); } + // Step 3: get all potential new directories + var newDirs = new HashSet(); + for (Iterator dfIt = newFiles.iterator(); dfIt.hasNext();) { + FileMetadata fm = dfIt.next().getFileMetadata(); + newDirs.addAll(getPathAndParents(fm.getDirectoryLabel())); + } + // Step 4: check if new directories do not yet exist as filename + newDirs.retainAll(existingWithoutNew); + if (!newDirs.isEmpty()) { + logger.warning("Incoming file(s) have one or more directories conflicting with an existing path: " + newDirs); + newFiles.clear(); + } } /** * Checks if the unique file path of the supplied fileMetadata is already on - * the list of the existing files; and if so, keeps generating a new name + * the list of the existing files and directories; and if so, keeps generating a new name * until it is unique. Returns the final file name. (i.e., it only modifies * the filename, and not the folder name, in order to achieve uniqueness) * @@ -84,12 +96,14 @@ public static String duplicateFilenameCheck(FileMetadata fileMetadata, Set fileMetadatas) { - List filePathsAndNames = getPathsAndFileNames(fileMetadatas); - return filePathsAndNames.contains(pathPlusFilename); + public static Optional findConflictingPathPart(String newPathPlusFilename, List fileMetadatas) { + var newPathAndParents = getPathAndParents(newPathPlusFilename); + var existingPathsAndParents = getPathsAndFileNames(fileMetadatas); + for (var pathOrDir : existingPathsAndParents) { + if (newPathAndParents.contains(pathOrDir)) { + return Optional.of(pathOrDir); + } + } + return Optional.empty(); } + /** * Given a DatasetVersion, and the newFiles about to be added to the * version iterate across all the files (including their @@ -164,9 +186,11 @@ private static Set findDuplicates(Collection collection) { /** * @return A List of Strings in the form of path/to/file.txt + * path and path/to are also added to the list, but only once when they are parents for multiple files. */ public static List getPathsAndFileNames(List fileMetadatas) { List allFileNamesWithPaths = new ArrayList<>(); + Set allPaths = new HashSet<>(); for (FileMetadata fileMetadata : fileMetadatas) { String directoryLabel = fileMetadata.getDirectoryLabel(); String path = ""; @@ -174,11 +198,30 @@ public static List getPathsAndFileNames(List fileMetadatas path = directoryLabel + "/"; } String pathAndfileName = path + fileMetadata.getLabel(); - allFileNamesWithPaths.add(pathAndfileName); + allFileNamesWithPaths.add((pathAndfileName)); + allPaths.addAll(getPathAndParents(directoryLabel)); } + allFileNamesWithPaths.addAll(allPaths); return allFileNamesWithPaths; } + private static List getPathAndParents(String directory) { + List paths = new ArrayList<>(); + if (directory == null || directory.isEmpty()) { + return paths; + } + String current = directory; + while (current != null && !current.isEmpty()) { + paths.add(current); + int lastSlash = current.lastIndexOf('/'); + if (lastSlash == -1) { + break; + } + current = current.substring(0, lastSlash); + } + return paths; + } + // This method is called on a single file, when we need to modify the name // of an already ingested/persisted datafile. For ex., when we have converted // a file to tabular data, and want to update the extension accordingly. @@ -187,6 +230,7 @@ public static void modifyExistingFilename(DatasetVersion version, FileMetadata f // unique path name: directoryLabel + file separator + fileLabel fileMetadata.setLabel(newFilename); Set pathNamesExisting = existingPathNamesAsSet(version, fileMetadata); + pathNamesExisting.addAll(dirsOfFullPaths(pathNamesExisting)); fileMetadata.setLabel(duplicateFilenameCheck(fileMetadata, pathNamesExisting)); } @@ -240,6 +284,17 @@ public static String generateNewFileName(final String fileName) { return newName; } + public static Set dirsOfFullPaths(Collection fullPaths) { + Set dirs = new HashSet<>(); + fullPaths.forEach(fullPath -> { + int lastSlash = fullPath.lastIndexOf('/'); + if (lastSlash != -1) { + dirs.addAll(getPathAndParents(fullPath.substring(0, lastSlash))); + } + }); + return dirs; + } + // list of existing unique path name: directoryLabel + file separator + fileLabel public static Set existingPathNamesAsSet(DatasetVersion version) { return existingPathNamesAsSet(version, null); diff --git a/src/main/java/propertyFiles/Bundle.properties b/src/main/java/propertyFiles/Bundle.properties index 9a8c97fe429..54d95b9cbe0 100644 --- a/src/main/java/propertyFiles/Bundle.properties +++ b/src/main/java/propertyFiles/Bundle.properties @@ -1721,7 +1721,7 @@ dataset.message.uploadFilesSingle.message=All file types are supported for uploa dataset.message.uploadFilesMultiple.message=Multiple file upload/download methods are available for this dataset. Once you upload a file using one of these methods, your choice will be locked in for this dataset. dataset.message.editMetadata.label=Edit Dataset Metadata dataset.message.editMetadata.message=Add more metadata about this dataset to help others easily find it. -dataset.message.editMetadata.duplicateFilenames=Duplicate filenames: {0} +dataset.message.editMetadata.duplicateFilenames=File path and name duplicate those of an existing file or directory: {0} dataset.message.editMetadata.invalid.TOUA.message=Datasets with restricted files are required to have Request Access enabled or Terms of Access to help people access the data. Please edit the dataset to confirm Request Access or provide Terms of Access to be in compliance with the policy. dataset.message.toua.invalid=Terms of Use and Access are invalid. You must enable request access or add terms of access in datasets with restricted files. diff --git a/src/test/java/edu/harvard/iq/dataverse/ingest/IngestUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/ingest/IngestUtilTest.java index 955070a662a..19f839ce55e 100644 --- a/src/test/java/edu/harvard/iq/dataverse/ingest/IngestUtilTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/ingest/IngestUtilTest.java @@ -13,16 +13,20 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Date; +import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.stream.Stream; import jakarta.validation.ConstraintViolation; import org.dataverse.unf.UNFUtil; import org.dataverse.unf.UnfException; import org.junit.jupiter.api.Test; +import static org.assertj.core.api.Assertions.as; +import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.*; public class IngestUtilTest { @@ -376,6 +380,158 @@ public void testCheckForDuplicateFileNamesWithDirectories() throws Exception { assertTrue(file3NameAltered); } + @Test + /** + * Test adding a file with a full path duplicating an existing directory + * or with an ancestor that duplicates the full path of an existing file. + */ + public void testCheckFilesDuplicatingDirectories() throws Exception { + + class Params { + final int iteration; + final FileMetadata fmd; + + public Params(int iteration, String dir, String fileLabel) { + this.iteration = iteration; + + var datafile = new DataFile("application/octet-stream"); + + fmd = new FileMetadata(); + fmd.setLabel(fileLabel); + fmd.setDirectoryLabel(dir); + fmd.setDataFile(datafile); + datafile.getFileMetadatas().add(fmd); + } + } + // each iteration adds one or more files to the dataset and + // verifies what names would be added if all would be added (again) + // the adjusted names are used to add files in the next iteration + var paramsList = Arrays.asList( + new Params(0, "foo","bar"), + new Params(1, null, "foo"), // file/dir conflict: "foo" + new Params(1, null, "bar"), + new Params(2, null, "bar"), + new Params(3, "bar/foo","pint"), // dir/file conflict: "bar" + new Params(4, "bar/foo/pint", "beer") // dir-ancestor/file conflict: "bar/foo/pint" + ); + // more than 10 List.of elements cause subtle type problems for the assertions + var expectedFilesInDataset = Arrays.asList( + List.of("foo/bar"), + List.of("foo/bar", "null/foo-1", "null/bar"), + List.of("foo/bar", "null/foo-1", "null/bar", "null/bar-2"), + List.of("foo/bar", "null/foo-1", "null/bar", "null/bar-2", "bar/foo/pint"), + List.of("foo/bar", "null/foo-1", "null/bar", "null/bar-2", "bar/foo/pint", "bar/foo/pint/beer") + ); + var expectedLabelsAfterTest = Arrays.asList( + List.of( "foo/bar-1", "null/foo-1", "null/bar", "null/bar-1", "bar/foo/pint", "bar/foo/pint/beer"), + List.of( "foo/bar-1", "null/foo-2", "null/bar-1", "null/bar-2", "bar/foo/pint", "bar/foo/pint/beer"), + List.of( "foo/bar-1", "null/foo-2", "null/bar-1", "null/bar-3", "bar/foo/pint", "bar/foo/pint/beer"), + List.of( "foo/bar-1", "null/foo-2", "null/bar-1", "null/bar-3", "bar/foo/pint-1", "bar/foo/pint/beer"), + List.of( "foo/bar-1", "null/foo-2", "null/bar-1", "null/bar-3", "bar/foo/pint-1", "bar/foo/pint/beer-1") + ); + List> expectedNewDataFilesAfterTest = Arrays.asList( + List.of( "foo/bar-1", "null/foo-1", "null/bar","null/bar-1", "bar/foo/pint", "bar/foo/pint/beer"), + List.of(), + List.of(), // ??? + List.of(), + List.of() + ); + + // create dataset version + var dataset = makeDataset(); + var datasetVersion = dataset.getLatestVersion(); + datasetVersion.setFileMetadatas(new ArrayList<>()); + + for (int i=0; i(); + paramsList.forEach(p -> newDataFiles.add(p.fmd.getDataFile())); + // select files to add to the dataset for the current iteration + var ii = i; + var filesToAdd = paramsList.stream() + .filter(p -> p.iteration == ii) + .map(p -> p.fmd).toList(); + // add files to dataset + for (var fmd: filesToAdd) { + var fmdClone = new FileMetadata(); + fmdClone.setId(MocksFactory.nextId()); + fmdClone.setLabel(fmd.getLabel()); + fmdClone.setDirectoryLabel(fmd.getDirectoryLabel()); + fmdClone.setDatasetVersion(fmd.getDatasetVersion()); + if (fmd.getDataFile() != null) { + var df = fmd.getDataFile(); + var dfClone = new DataFile(df.getContentType()); + fmdClone.setDataFile(dfClone); + dfClone.getFileMetadatas().add(fmdClone); + } + datasetVersion.getFileMetadatas().add(fmdClone); + fmdClone.setDatasetVersion(datasetVersion); + } + + // precondition + var actualFilesInDataset = datasetVersion.getFileMetadatas().stream() + .map(fmd -> fmd.getDirectoryLabel() + "/" + fmd.getLabel()).toList(); + assertThat(actualFilesInDataset) + .withFailMessage("expectedFilesInDataset %d \n expected %s \n but got %s", i, expectedFilesInDataset.get(i), actualFilesInDataset) + .containsExactlyInAnyOrderElementsOf(expectedFilesInDataset.get(i)); + + // method under test + IngestUtil.checkForDuplicateFileNamesFinal(datasetVersion, newDataFiles, null); + + // postconditions + var actualPaths = paramsList.stream() + .map(p -> p.fmd.getDirectoryLabel() + "/" + p.fmd.getLabel()).toList(); + assertThat(actualPaths) + .withFailMessage("expectedLabelsAfterTest %d \n expected %s \n but got %s", i, expectedLabelsAfterTest.get(i), actualPaths) + .containsExactlyInAnyOrderElementsOf(expectedLabelsAfterTest.get(i)); + var actualNewDataFiles = newDataFiles.stream() + .map(p -> p.getFileMetadata().getDirectoryLabel() + "/" + p.getFileMetadata().getLabel()).toList(); + assertThat(actualNewDataFiles) + .withFailMessage("expectedNewDataFilesAfterTest %d \n expected %s \n but got %s", i, expectedNewDataFilesAfterTest.get(i), actualNewDataFiles) + .containsExactlyInAnyOrderElementsOf(expectedNewDataFilesAfterTest.get(i)); + + } + } + + @Test + /** + * Test adding files to a dataset having a file with a full path duplicating a directory. + */ + public void testExistingFilesDuplicatingDirectories() throws Exception { + + // create dataset version + var dataset = makeDataset(); + var datasetVersion = dataset.getLatestVersion(); + datasetVersion.setFileMetadatas(new ArrayList<>()); + + // add files to dataset + Stream.of( + Arrays.asList("foo","bar"), + Arrays.asList(null, "foo"), // file/dir conflict: "foo" + Arrays.asList(null, "bar"), + Arrays.asList("bar/foo","pint"), // dir/file conflict: "bar" + Arrays.asList("bar/foo/pint", "beer") // subdir/file conflict: "bar/foo/pint" + ).forEach(l -> { + var dir = l.get(0); + var fileLabel = l.get(1); + var datafile = new DataFile("application/octet-stream"); + var fmd = new FileMetadata(); + fmd.setId(MocksFactory.nextId()); + fmd.setLabel(fileLabel); + fmd.setDirectoryLabel(dir); + fmd.setDataFile(datafile); + datafile.getFileMetadatas().add(fmd); + + // add file to dataset + datasetVersion.getFileMetadatas().add(fmd); + fmd.setDatasetVersion(datasetVersion); + }); + + // EditDataFilesPage.save() would create an error message if result is not empty + var duplicates = IngestUtil.findDuplicateFilenames(datasetVersion, List.of()); + + assertThat(duplicates).containsExactlyInAnyOrderElementsOf(List.of("bar", "foo", "bar/foo/pint")); + } + @Test /** * Test tabular files (e.g., .dta) are changed when .tab files with the same @@ -727,7 +883,36 @@ public void renameFileToSameName() { FileMetadata file2 = new FileMetadata(); file2.setLabel("README2.md"); List fileMetadatas = Arrays.asList(file1, file2); - assertTrue(IngestUtil.conflictsWithExistingFilenames(pathPlusFilename, fileMetadatas)); + assertThat(IngestUtil.findConflictingPathPart(pathPlusFilename, fileMetadatas)) + .hasValue("README.md"); + } + + @Test + public void addDirConflictingWithFile() { + FileMetadata fmd = new FileMetadata(); + fmd.setDirectoryLabel("foo"); + fmd.setLabel("bar"); + var fileMetadatas = Arrays.asList(fmd); + assertThat(IngestUtil.findConflictingPathPart("foo/bar/pint", fileMetadatas)) + .hasValue("foo/bar"); + } + + @Test + public void addParentDirConflictingWithFile() { + FileMetadata fmd = new FileMetadata(); + fmd.setLabel("foo"); + var fileMetadatas = Arrays.asList(fmd); + assertThat(IngestUtil.findConflictingPathPart("foo/bar/pint", fileMetadatas)) + .hasValue("foo"); + } + + @Test + public void noConflict() { + FileMetadata fmd = new FileMetadata(); + fmd.setLabel("foo"); + var fileMetadatas = Arrays.asList(fmd); + assertThat(IngestUtil.findConflictingPathPart("bar", fileMetadatas)) + .isEmpty(); } }