Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/sphinx-guides/source/api/native-api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3245,7 +3245,7 @@ The fully expanded example above (without environment variables) looks like this

.. code-block:: bash

curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X POST "https://demo.dataverse.org/api/datasets/:persistentId/files/metadata?:persistentId=doi:10.5072/FK2/J8SJZB" --upload-file file-metadata-update.json
curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X POST "https://demo.dataverse.org/api/datasets/:persistentId/files/metadata?persistentId=doi:10.5072/FK2/J8SJZB" --upload-file file-metadata-update.json

The ``file-metadata-update.json`` file should contain a JSON array of objects, each representing a file to be updated. Here's an example structure:

Expand Down
2 changes: 1 addition & 1 deletion doc/sphinx-guides/source/user/dataset-management.rst
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ Beginning with Dataverse Software 5.0, the way a Dataverse installation handles
- Files with the same checksum can be included in a dataset, even if the files are in the same directory.
- Files with the same filename can be included in a dataset as long as the files are in different directories.
- If a user uploads a file to a directory where a file already exists with that directory/filename combination, the Dataverse installation will adjust the file path and names by adding "-1" or "-2" as applicable. This change will be visible in the list of files being uploaded.
- If the directory or name of an existing or newly uploaded file is edited in such a way that would create a directory/filename combination that already exists, the Dataverse installation will display an error.
- If the directory or name of an existing or newly uploaded file is edited in such a way that would create a directory/filename combination that already exists, or the new directory/filename combination is already in use as a directory, the Dataverse installation will display an error.
- If a user attempts to replace a file with another file that has the same checksum, an error message will be displayed and the file will not be able to be replaced.
- If a user attempts to replace a file with a file that has the same checksum as a different file in the dataset, a warning will be displayed.

Expand Down
107 changes: 107 additions & 0 deletions scripts/issues/12407/find_duplicates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/usr/bin/env python3
import argparse
import psycopg2
from pathlib import Path
from textwrap import dedent

def read_sql(path: Path) -> str:
    """Load a SQL script and drop its psql meta-command lines.

    A line whose first non-blank character is a backslash (for example a
    psql ``\\set`` directive) is removed. Every other line is kept as-is and
    the kept lines are joined again with newline characters.
    """
    kept_lines = []
    for sql_line in path.read_text(encoding="utf-8").splitlines():
        if sql_line.lstrip().startswith("\\"):
            # psql-only directive: not valid SQL for the database driver.
            continue
        kept_lines.append(sql_line)
    return "\n".join(kept_lines)


def fetch_dv_ids(conn, find_dv_ids_sql: str) -> list[int]:
    """Run the dataset-version lookup query and return the ids it yields.

    The first column of each result row is taken as the dataset version id
    and converted to ``int``; the order of the rows is preserved.
    """
    with conn.cursor() as cursor:
        cursor.execute(find_dv_ids_sql)
        records = cursor.fetchall()

    version_ids = []
    for record in records:
        # Only the leading column is used; any further columns are ignored.
        version_ids.append(int(record[0]))
    return version_ids


def fetch_dataset_info(conn, datasetversion_id: int):
    """Look up the dataset that a dataset version belongs to.

    Args:
        conn: open database connection whose ``cursor()`` can be used as a
            context manager.
        datasetversion_id: value matched against ``datasetversion.id``.

    Returns:
        The first matching row as
        ``(protocol, authority, identifier, versionnumber, minorversionnumber)``,
        or whatever ``fetchone()`` yields when nothing matches (``None`` for
        DB-API drivers).
    """
    dataset_query = """
        SELECT dso.protocol, dso.authority, dso.identifier, dv.versionnumber, dv.minorversionnumber
        FROM datasetversion dv
        JOIN dvobject dso ON dso.id = dv.dataset_id
        WHERE dv.id = %s
    """
    with conn.cursor() as cur:
        # The id travels as a bound parameter; it is never spliced into the SQL text.
        cur.execute(dataset_query, (datasetversion_id,))
        return cur.fetchone()


def run_find_duplicates(conn, find_duplicates_sql: str):
    """Execute the duplicate-path query and print the result as tab-separated text.

    A header line is printed first: the query's own column names followed by
    five dataset columns. Each result row is then printed with the dataset
    details for its dataset version appended. The first column of a row is
    read as the dataset version id. Dataset details are looked up only when
    that id differs from the previous row's id, and are replaced by empty
    strings when the lookup returns nothing. ``None`` values are printed as
    empty cells.

    NOTE(review): the column headed ``dataset_id`` receives the third value
    returned by ``fetch_dataset_info``; confirm the header name matches what
    that value actually is.
    """
    blank_info = ("", "", "", "", "")
    dataset_columns = ["protocol", "authority", "dataset_id", "versionnumber", "minorversionnumber"]

    def as_cell(value):
        return "" if value is None else str(value)

    current_id = None
    current_info = blank_info

    with conn.cursor() as cursor:
        cursor.execute(find_duplicates_sql)
        header = [column[0] for column in cursor.description] + dataset_columns
        print("\t".join(header))

        for record in cursor:
            version_id = int(record[0])  # datasetversion_id

            if version_id != current_id:
                # New dataset version: refresh the cached dataset details.
                info = fetch_dataset_info(conn, version_id)
                current_info = blank_info if info is None else info
                current_id = version_id

            cells = tuple(record) + tuple(current_info)
            print("\t".join(map(as_cell, cells)))


def main():
    """Command-line entry point: report paths used both as a file and as a directory.

    Reads ``find_dv_ids.sql`` and ``find_duplicates.sql`` from the directory
    of this script, fills in their placeholders, runs the first query to
    obtain dataset version ids and then prints the result of the second query
    for those ids.

    Placeholders are filled in by plain text replacement. That is safe here
    because every substituted value is an integer: the command-line options
    are parsed with ``type=int`` and the ids are converted with ``int()``.

    Raises:
        SystemExit: with status 2 (after printing the help text) when the
            database reports that no password was supplied, and with status 1
            for any other ``psycopg2.OperationalError``.
    """
    class RawDefaultsFormatter(
        argparse.ArgumentDefaultsHelpFormatter,
        argparse.RawDescriptionHelpFormatter,
    ):
        """Keep the description's line breaks and show defaults in --help."""

    parser = argparse.ArgumentParser(
        description=dedent("""
            Execute as owner of dvndb.

            `find_duplicates.sql` is executed for dv_ids returned by `find_dv_ids.sql`.
            `find_dv_ids.sql` returns the latest version per dataset.
        """),
        formatter_class=RawDefaultsFormatter,
    )
    parser.add_argument("--min-id", type=int, default=0, help="first dataset-version-id examined by `find_dv_ids.sql`")
    parser.add_argument("--nr-of-ids", type=int, default=50, help="number of ID's returned by `find_dv_ids.sql`")
    args = parser.parse_args()
    conn_kwargs = {"dbname": 'dvndb'}

    script_dir = Path(__file__).resolve().parent

    dup_sql_raw = read_sql(script_dir / "find_duplicates.sql")

    dv_sql = read_sql(script_dir / "find_dv_ids.sql")
    dv_sql = dv_sql.replace(":min_id", str(args.min_id))
    dv_sql = dv_sql.replace(":nr_of_ids", str(args.nr_of_ids))

    try:
        conn = psycopg2.connect(**conn_kwargs)
        try:
            # `with conn` only ends the transaction (commit or rollback); the
            # connection itself is closed by the `finally` clause below.
            with conn:
                dv_ids = fetch_dv_ids(conn, dv_sql)

                if not dv_ids:
                    print("No dv_id values returned by find_dv_ids.sql")
                    return

                ids_csv = ",".join(str(i) for i in dv_ids)
                print(f"dataset version ids: {ids_csv}")
                run_find_duplicates(conn, dup_sql_raw.replace(":ids", ids_csv))
        finally:
            conn.close()
    except psycopg2.OperationalError as e:
        if "no password supplied" in str(e).lower():
            parser.print_help()
            raise SystemExit(2) from e
        # Report on stderr so the message never mixes with the TSV on stdout.
        parser.exit(1, f"Database connection failed: {e}\n")

if __name__ == "__main__":
main()
35 changes: 35 additions & 0 deletions scripts/issues/12407/find_duplicates.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
-- Lists, per dataset version, every path that is used both as a directory
-- (or as an ancestor of a directory) and as the full path of a file.
--
-- The \set line below gives psql a default list of dataset version ids.
-- find_duplicates.py removes lines that start with a backslash and fills in
-- the ids placeholder by plain text replacement before running the query.
\set ids 5,7,9
-- dir_ancestors: for every non-blank, trimmed directorylabel, one row per
-- leading prefix of the '/'-separated path (first segment, first two, ...).
WITH dir_ancestors AS (
SELECT DISTINCT
datasetversion_id,
array_to_string((string_to_array(path, '/'))[1:n], '/') AS path
FROM (
SELECT DISTINCT
datasetversion_id,
NULLIF(BTRIM(directorylabel), '') AS path
FROM filemetadata
WHERE datasetversion_id IN (:ids)
AND NULLIF(BTRIM(directorylabel), '') IS NOT NULL
) dirs
-- n runs from 1 to the number of path segments, one output row per prefix.
CROSS JOIN LATERAL generate_series(
1, cardinality(string_to_array(path, '/'))
) AS g(n)
),
-- file_paths: the full path of every file; the label alone when the
-- directorylabel is NULL or blank, otherwise trimmed directorylabel/label.
file_paths AS (
SELECT DISTINCT
datasetversion_id,
CASE
WHEN NULLIF(BTRIM(directorylabel), '') IS NULL THEN label
ELSE NULLIF(BTRIM(directorylabel), '') || '/' || label
END AS path
FROM filemetadata
WHERE datasetversion_id IN (:ids)
)
-- Keep only the (version, path) pairs present in both sets.
SELECT datasetversion_id, path
FROM dir_ancestors

INTERSECT

SELECT datasetversion_id, path
FROM file_paths
-- The ORDER BY sorts the combined result of the INTERSECT.
ORDER BY datasetversion_id, path;
35 changes: 35 additions & 0 deletions scripts/issues/12407/find_dv_ids.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
-- Selects one dataset version per dataset: the one ranked first by
-- versionnumber, minorversionnumber and id, all descending.
--
-- The \set lines give psql default values. find_duplicates.py removes lines
-- that start with a backslash and fills in both placeholders by plain text
-- replacement, using its --min-id and --nr-of-ids options.
\set min_id 0
\set nr_of_ids 50

-- ranked: every dataset version joined to its dataset, numbered within the
-- dataset so that rn = 1 is the top-ranked version.
-- NOTE(review): versions whose version numbers are NULL presumably sort
-- first under DESC (PostgreSQL default NULLS FIRST) - confirm that is the
-- intended meaning of "latest".
WITH ranked AS (
SELECT
dso.id AS dso_id,
dso.protocol,
dso.authority,
dso.identifier,
dv.id AS dv_id,
dv.versionnumber,
dv.minorversionnumber,
ROW_NUMBER() OVER (
PARTITION BY dso.id
ORDER BY
dv.versionnumber DESC,
dv.minorversionnumber DESC,
dv.id DESC
) AS rn
FROM datasetversion dv
JOIN dvobject dso ON dso.id = dv.dataset_id
)
-- dv_id must stay the first selected column: find_duplicates.py reads only
-- the first column of each row.
SELECT
dv_id,
dso_id,
protocol,
authority,
identifier,
versionnumber,
minorversionnumber
FROM ranked
WHERE rn = 1
-- The minimum-id filter is applied after ranking, so a dataset whose
-- top-ranked version has a smaller id is left out entirely.
AND dv_id >= :min_id
ORDER BY dv_id
LIMIT :nr_of_ids;
158 changes: 158 additions & 0 deletions scripts/issues/12407/test-apis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import requests
from requests_toolbelt.multipart.encoder import MultipartEncoder
from datetime import datetime
import json

import os

########################## configuration for a draft dataset without files

# Settings are read from the environment (the variable names follow the curl
# examples in the Dataverse API guide) so that no API token has to be stored
# in the script. The server and dataset fall back to the development values.
dataverse_server = os.environ.get('SERVER_URL', 'https://dev.archaeology.datastations.nl')
persistentId = os.environ.get('PERSISTENT_ID', 'doi:10.5072/DAR/HBGPN5')
api_key = os.environ.get('API_TOKEN')
if not api_key:
    raise SystemExit('API_TOKEN is not set: export a Dataverse API token before running this script')

# NOTE(review): every request below passes verify=False, which switches off
# TLS certificate verification while the API token is being sent. Confirm
# that this is only ever run against a trusted development server.

####################
print (' preparation: add file foo/bar ' + ('-' * 40))

url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId)
files = {'file': ('bar', ('content2: %s' % datetime.now()))}
json_data = {"jsonData": json.dumps({"directoryLabel": "foo"})}  # directory that later file names collide with
r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=json_data, files=files, verify=False)
print (r.status_code)
print (r.json())

####################
print (' preparation: add file foo.tab/bar ' + ('-' * 40))

url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId)
files = {'file': ('bar', ('content2: %s' % datetime.now()))}
json_data = {"jsonData": json.dumps({"directoryLabel": "foo.tab"})}  # directory that the converted foo.csv collides with
r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=json_data, files=files, verify=False)
print (r.status_code)
print (r.json())

####################
print (' preparation: add file x to have a file to change ' + ('-' * 40))

url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId)
unique_content = 'content2: %s' % datetime.now()
files = {'file': ('x', unique_content)}
json_data = {"jsonData": json.dumps({"label": "x"})}
r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=json_data, files=files, verify=False)
print (r.status_code)
print (r.json())

# Id of the file just added; the metadata-update and replace calls target it.
file_id = r.json()['data']['files'][0]['dataFile']['id']

####################
print (' file conflicting with existing dir gets sequence number ' + ('-' * 40))

url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId)
files = {'file': ('foo', ('content2: %s' % datetime.now()))}
json_data = {"jsonData": json.dumps({"label": "foo"})}
r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=json_data, files=files, verify=False)

print (r.json())
print (r.status_code)

####################
print (' tabular file conflicting with existing dir gets seq nr once converted to .tab ' + ('-' * 40))

url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId)
files = {'file': ('foo.csv', ('header1,header2\nvalue1,%s' % datetime.now()))}
json_data = {"jsonData": json.dumps({"label": "foo.csv"})}
r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=json_data, files=files, verify=False)
print (r.status_code)
print (r.json())

####################
print (' files API metadata: dir foo/bar conflicts with previously created file foo/bar: returns bad-request ' + ('-' * 40))

### files API https://guides.dataverse.org/en/latest/api/native-api.html#updating-file-metadata
url = f'{dataverse_server}/api/files/{file_id}/metadata'
# Send well-formed JSON only, so that a bad-request answer can only be caused
# by the path conflict under test.
files = {'jsonData': (None, json.dumps({"directoryLabel": "foo/bar", "label": "files-api.txt"}))}
r = requests.post(url, headers={'X-Dataverse-key': api_key}, files=files, verify=False)

print(r.status_code)
print(r.text)

####################
print ('datasets API update existing file into name conflicting with existing dir: returns bad-request ' + ('-' * 40))

### datasets API https://guides.dataverse.org/en/latest/api/native-api.html#update-file-metadata
# The token is sent in the X-Dataverse-key header only; it is kept out of the
# URL so that it cannot end up in logs or printed output.
url = f'{dataverse_server}/api/datasets/:persistentId/files/metadata?persistentId={persistentId}'
json_content = [{"dataFileId": file_id, "directoryLabel": "foo/bar", "label": "datasets-api.txt"}]
headers = {'X-Dataverse-key': api_key, 'Content-Type': 'application/json'}
r = requests.post(url, headers=headers, json=json_content, verify=False)

print(r.status_code)
print(r.text)

Check failure

Code scanning / CodeQL

Clear-text logging of sensitive information High test

This expression logs
sensitive data (password)
as clear text.

####################
print ('datasets API add file conflicting with existing file: gets seq nr ' + ('-' * 40))

url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId)
files = {'file': ('fox', ('content2: %s' % datetime.now()))}
json_data = {"jsonData": json.dumps({"label": "x"})}  # same label as the file added during preparation
r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=json_data, files=files, verify=False)

print (r.json())
print (r.status_code)

####################
print ('dataset API add dir conflicting with existing file: returns bad-request ' + ('-' * 40))

url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId)
files = {'file': ('foo', ('content2: %s' % datetime.now()))}
# foo/bar was uploaded as a file during preparation; here it is used as a directory.
json_data = {"jsonData": json.dumps({"label": "dir-conflicts-with-file.txt", "directoryLabel": "foo/bar"})}
r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=json_data, files=files, verify=False)

print (r.json())
print (r.status_code)

####################
print (' datasets API: another file on existing dir is OK ' + ('-' * 40))

url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId)
files = {'file': ('beer', ('content2: %s' % datetime.now()))}
json_data = {"jsonData": json.dumps({"directoryLabel": "foo"})}  # existing directory, new file name
r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=json_data, files=files, verify=False)
print (r.status_code)
print (r.json())

####################
print (' datasets API: a file with different capitalization is OK ' + ('-' * 40))

url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId)
files = {'file': ('Beer', ('content2: %s' % datetime.now()))}
json_data = {"jsonData": json.dumps({"directoryLabel": "foo"})}  # existing directory, name differs only in case
r = requests.post(url, headers={'X-Dataverse-key': api_key}, data=json_data, files=files, verify=False)
print (r.status_code)
print (r.json())

####################
print (' files API replace: dir foo/bar conflicts with previously created file: returns bad-request ' + ('-' * 40))

url = f'{dataverse_server}/api/files/{file_id}/replace'
files = {
    # Send well-formed JSON only, so that a bad-request answer can only be
    # caused by the path conflict under test.
    'jsonData': (None, json.dumps({"directoryLabel": "foo/bar", "label": "x", "forceReplace": True})),
    'file': ('foo', ('content2: %s' % datetime.now()))
}
r = requests.post(url, headers={'X-Dataverse-key': api_key}, files=files, verify=False)

print(r.status_code)
print(r.text)

####################
# not configured on DANS VM? Might also have no added value over previous test.
#
# print (' datasets API remote file: file foo conflicts with previously created dir: returns bad-request ???? ' + ('-' * 40))
#
# url = '%s/api/datasets/:persistentId/add?persistentId=%s' % (dataverse_server, persistentId)
# files = {
# 'jsonData': (None, '{"directoryLabel": "foo/bar", "label": "x", "forceReplace":true, "description":"A remote image.","storageIdentifier":"file://themes/custom/qdr/images/01234567890-012345678901","checksumType":"MD5","md5Hash":"509ef88afa907eaf2c17c1c8d8fde77e","fileName":"testlogo.png","mimeType":"image/png"} ' + ('-' * 40)),
# }
# r = requests.post(url, headers={'X-Dataverse-key': api_key}, files=files, verify=False)
#
# print(r.status_code)
# print(r.text)
Original file line number Diff line number Diff line change
Expand Up @@ -1068,7 +1068,9 @@ public String save() {
storageSizeStr = null; // Let this re-calculate after the calling save()
Collection<String> duplicates = IngestUtil.findDuplicateFilenames(workingVersion, newFiles);
if (!duplicates.isEmpty()) {
JH.addMessage(FacesMessage.SEVERITY_ERROR, BundleUtil.getStringFromBundle("dataset.message.filesFailure"), BundleUtil.getStringFromBundle("dataset.message.editMetadata.duplicateFilenames", new ArrayList<>(duplicates)));
var arguments = List.of(String.join(", ", duplicates));
JH.addMessage(FacesMessage.SEVERITY_ERROR, BundleUtil.getStringFromBundle("dataset.message.filesFailure"), BundleUtil.getStringFromBundle("dataset.message.editMetadata.duplicateFilenames",
arguments));
return null;
}
if (!saveEnabled) {
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/edu/harvard/iq/dataverse/api/Datasets.java
Original file line number Diff line number Diff line change
Expand Up @@ -4899,9 +4899,10 @@ public Response updateMultipleFileMetadata(@Context ContainerRequestContext crc,

List<FileMetadata> fmdListMinusCurrentFile = new ArrayList<>(fileMetadataMapCopy.values());

if (IngestUtil.conflictsWithExistingFilenames(pathPlusFilename, fmdListMinusCurrentFile)) {
var conflictingPart = IngestUtil.findConflictingPathPart(pathPlusFilename, fmdListMinusCurrentFile);
if (conflictingPart.isPresent()) {
return error(BAD_REQUEST, BundleUtil.getStringFromBundle("files.api.metadata.update.duplicateFile",
Arrays.asList(pathPlusFilename)));
conflictingPart.stream().toList()));
}

// Apply optional params
Expand Down
6 changes: 4 additions & 2 deletions src/main/java/edu/harvard/iq/dataverse/api/Files.java
Original file line number Diff line number Diff line change
Expand Up @@ -512,8 +512,10 @@ public Response updateFileMetadata(@Context ContainerRequestContext crc, @FormDa
}
}

if (IngestUtil.conflictsWithExistingFilenames(pathPlusFilename, fmdListMinusCurrentFile)) {
return error(BAD_REQUEST, BundleUtil.getStringFromBundle("files.api.metadata.update.duplicateFile", Arrays.asList(pathPlusFilename)));
var conflictingPart = IngestUtil.findConflictingPathPart(pathPlusFilename, fmdListMinusCurrentFile);
if (conflictingPart.isPresent()) {
return error(BAD_REQUEST, BundleUtil.getStringFromBundle("files.api.metadata.update.duplicateFile",
conflictingPart.stream().toList()));
}

optionalFileParams.addOptionalParams(upFmd);
Expand Down
Loading
Loading