diff --git a/python/pid_reports/README.md b/python/pid_reports/README.md new file mode 100644 index 0000000..bbb3d98 --- /dev/null +++ b/python/pid_reports/README.md @@ -0,0 +1,71 @@ +# PID Reports + +## Overview + +This folder contains scripts that generate reports on Persistent Identifier (PID) usage in a Dataverse instance. These scripts specifically identify cases where PIDs were not found, which may indicate: + +- In-the-wild use of draft PIDs +- Posting of PIDs with typos +- PIDs with extra characters (e.g., trailing periods) +- Other malformed PID references + +## Scripts + +The main script in this folder is `dcpidreport.py`, which checks DataCite's monthly DOI resolution reports and generates reports on failures. Anything reported via this script indicates that someone tried to resolve the specified DOI, i.e., via https://doi.org/*. DataCite is sometimes more than a month late in publishing its reports; the script handles this by recording the last month it processed and catching up on later runs. + +A second script, `pidreport.py`, performs a similar function for any kind of PID. However, it relies on [functionality to create an initial PIDFailures report](https://github.com/IQSS/dataverse/pull/11601) that is not yet merged into the standard Dataverse distribution from https://github.com/IQSS/dataverse. +The benefits of this report are that the results are available for any kind of PID, are available every month, and capture any call to Dataverse requiring a PID (e.g., where someone may have posted a direct, incorrect link to a dataset page, versus DataCite only reporting DOI resolution failures). + +## Purpose + +These reports help maintain the integrity of your Dataverse's persistent identifier system by: +- Identifying problematic PID references +- Alerting administrators to potential issues +- Providing data for troubleshooting and correction + +## Usage + +The scripts are designed to be run periodically (typically monthly) via a cron job.
+ +### Configuration + +Before using these scripts, you need to configure several variables in each script: + +#### For dcpidreport.py: + +1. **File paths**: + - Update the `filename` variable to point to your desired state file location + +2. **Dataverse configuration**: + - `doi_account`: Your DataCite account prefix (e.g., "GDCC.YOUR_ACCOUNT") + - `dataverse_base_url`: The base URL of your Dataverse installation (e.g., "https://data.yourdataverse.org") + +3. **Email configuration**: + - `receivers`: Email addresses that should receive the reports + - `smtp_server`: Your SMTP server address + - `port`: SMTP port (default is 465 for SSL) + - `sender_email`: Email address from which reports will be sent + - `username`: SMTP authentication username + - `password`: SMTP authentication password + +#### For pidreport.py: + +1. **File paths**: + - `log_dir`: Directory where PID failure logs are stored + +2. **Dataverse configuration**: + - `dataverse_base_url`: The base URL of your Dataverse installation + +3. **Email configuration**: + - Same as dcpidreport.py (receivers, smtp_server, port, sender_email, username, password) + +4. 
**IP blacklist configuration**: + - `blacklist`: List of IP addresses to exclude from reports (e.g., known scanners such as UT Dorkbot / autoscan.infosec.utexas.edu that test with incorrect PIDs) + +### Cron Configuration + +To set up a monthly cron job that runs on the 1st day of each month, add something similar to the following to your crontab: + +10 5 1 * * python3 /opt/pidreporting/pidreport.py >> /var/log/pidreport.log 2>&1 +12 5 1 * * /usr/bin/python3 /opt/pidreporting/dcpidreport.py >> /var/log/dcpidreport.log 2>&1 + diff --git a/python/pid_reports/dcpidreport.py b/python/pid_reports/dcpidreport.py new file mode 100644 index 0000000..ed6a9cf --- /dev/null +++ b/python/pid_reports/dcpidreport.py @@ -0,0 +1,83 @@ +import shutil +import tempfile +import urllib.request +import gzip +from datetime import datetime +import os.path +from urllib.error import HTTPError +from dateutil.relativedelta import * +import ssl, smtplib + +currentmonth=datetime.now().replace(day=1) + relativedelta(days=-1) +processmonth=currentmonth + +filename="/opt/pidreporting/dcpidreportstate" +if os.path.exists(filename): + with open(filename) as f: + line=next(f) + processmonth = datetime.strptime(line.strip('\n'),"%m_%Y") + processmonth = processmonth + relativedelta(months=+1) + +# Configuration variables +receivers = "admin@mydataverse.org,support@myinstitution.org" # Enter receiver address +doi_account = "GDCC.YOUR_ACCOUNT" # Replace with your DataCite account prefix +dataverse_base_url = "https://data.yourdataverse.org" # Replace with your Dataverse installation URL + +# mail config +port = 465 # For SSL +smtp_server = "" #Enter your SMTP server address, e.g. 
email-smtp.us-east-1.amazonaws.com +sender_email = "" # Enter your address +username = "" +password = "" + +message = "Subject: DataCite DOI Resolution Failure Reports\nTo: " + receivers + "\n\n" + +found=True +somereports=False +while (processmonth <= currentmonth) and found: + monthstr = processmonth.strftime("%m_%Y") + + try: + with urllib.request.urlopen('https://stats.datacite.org/stats/resolution-report/resolutions_' + monthstr + '.html') as response: + with tempfile.NamedTemporaryFile(delete=False) as tmp_file: + gzip_fd = gzip.GzipFile(fileobj=response) + shutil.copyfileobj(gzip_fd, tmp_file) + message = message + "Report for " + monthstr +"\n\nHits\tDOI\tURI\n(Note: clicking links will record new failures unless these are drafts)\n" + with open(tmp_file.name) as html: + for line in html: + if doi_account in line: + rightlist=False; + done=False + for line in html: + if "" in line: + rightlist=True; + elif ("
    " in line) and rightlist: + somereports=True + for line in html: + if line.startswith("" in line: + done=True + break + if done: + break + if done: + break + with open(filename, "w") as f: + f.write(processmonth.strftime("%m_%Y")) + message= message + "\n\n" + processmonth=processmonth + relativedelta(months=+1) + except urllib.error.HTTPError as err: + found=False +if not somereports: + message=message + "No new monthly reports from DataCite. Next report expected: " + monthstr + "\n\n" + + +context = ssl.create_default_context() +with smtplib.SMTP_SSL(smtp_server, port, context=context) as server: + server.login(username, password) + server.sendmail(sender_email, receivers.split(","), message) + diff --git a/python/pid_reports/pidreport.py b/python/pid_reports/pidreport.py new file mode 100644 index 0000000..edff1e0 --- /dev/null +++ b/python/pid_reports/pidreport.py @@ -0,0 +1,65 @@ +import smtplib, ssl, datetime, os.path + +# Configuration variables +# File paths +log_dir = "/usr/local/payara6/domains/domain1/logs" # Replace with your log directory path + +# Dataverse configuration +dataverse_base_url = "https://data.yourdataverse.org" # Replace with your Dataverse installation URL + +# Email configuration +receivers = "admin@mydataverse.org,support@myinstitution.org" # Enter receiver addresses +port = 465 # For SSL +smtp_server = "smtp.example.com" # Replace with your SMTP server +sender_email = "sender@example.com" # Enter your address +username = "your_username" # Replace with your SMTP username +password = "your_password" # Replace with your SMTP password + +blacklist=[] +blacklist.append("146.6.15.11") #UT Dorkbot / autoscan.infosec.utexas.edu + + +def numSort(s): + return int(s[0:s.index("_")]) + + +if os.path.exists(filename): + d={} + blcount=0 + with open(filename) as f: + for line in f.readlines()[1:]: + (pid, uri, method, ip, time)=line.split("\t") + if pid not in d and ip not in blacklist: + d[pid] = [] + if ip not in blacklist: + 
d[pid].append(method + " " + uri + " from " + ip + " at " + time) + else: + blcount = blcount + 1 + + l=[] + for key in d: + l.append(str(len(d[key])) + "_" + key) + + l.sort(reverse=True, key=numSort) + + message = message + "Hits\tDOI\tURI\n(Note: clicking links will record new failures unless these are drafts)\n" + + for val in l: + doi = val[val.index("_")+1:] + message = message + "\n" + str(numSort(val)) + "\t" + doi + "\t" + dataverse_base_url + "/dataset.xhtml?persistentId=" + doi + + message = message + "\n\nDetails:\n\n" + + if blcount is not 0: + message = message + str(blcount) + "entries (not reported) from blacklisted IP addresses (e.g. UT Dorkbot)\n\n" + for val in l: + doi = val[val.index("_")+1:] + message = message + doi + "\n\t" + "\n\t".join(d[doi]) + "\n" +else: + message= message + "No Failures this month\n\n" + +context = ssl.create_default_context() +with smtplib.SMTP_SSL(smtp_server, port, context=context) as server: + server.login(username, password) + server.sendmail(sender_email, receivers.split(","), message) +