From 1270afe424df6bae9174cda18a48ce48b50aceb8 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Thu, 26 Jun 2025 14:18:45 -0400 Subject: [PATCH 1/4] Allow reparsing of metadata without re-harvesting. --- ADSCitationCapture/tasks.py | 9 ++++++--- run.py | 12 +++++++++--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index 610c0b5..77519de 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -521,7 +521,7 @@ def task_maintenance_canonical(dois, bibcodes): task_output_results.delay(custom_citation_change, parsed_metadata, existing_citation_bibcodes, db_versions=registered_record.get('associated_works', {"":""}), readers=readers) @app.task(queue='maintenance_metadata') -def task_maintenance_metadata(dois, bibcodes, reset=False): +def task_maintenance_metadata(dois, bibcodes, reparse=False): """ Maintenance operation: - Get all the registered citation targets (or only a subset of them if DOIs and/or bibcodes are specified) @@ -545,8 +545,11 @@ def task_maintenance_metadata(dois, bibcodes, reset=False): curated_metadata = registered_record.get('curated_metadata', {}) - logger.debug("Curated metadata for {} is {}".format(registered_record['content'], registered_record['curated_metadata'])) - raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'], app.conf['DATACITE_URL'], registered_record['content']) + logger.debug("Curated metadata for {} is {}".format(registered_record['content'], registered_record['curated_metadata'])) + if not reparse: + raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'], app.conf['DATACITE_URL'], registered_record['content']) + else: + raw_metadata = registered_record['raw_cited_metadata'] if raw_metadata: parsed_metadata = doi.parse_metadata(raw_metadata) is_software = parsed_metadata.get('doctype', '').lower() == "software" diff --git a/run.py b/run.py index e49370d..9d773e0 100755 --- a/run.py +++ b/run.py @@ -73,7 +73,7 @@ def maintenance_canonical(dois, bibcodes): # Send to master updated citation bibcodes in their canonical form tasks.task_maintenance_canonical.delay(dois, bibcodes) -def maintenance_metadata(dois, bibcodes): +def maintenance_metadata(dois, bibcodes, reparse=False): """ Refetch metadata and send updates to master (if any) """ @@ -84,7 +84,7 @@ def maintenance_metadata(dois, bibcodes): logger.info("MAINTENANCE task: requested a metadata update for '{}' records".format(n_requested)) # Send to master updated metadata - tasks.task_maintenance_metadata.delay(dois, bibcodes) + tasks.task_maintenance_metadata.delay(dois, bibcodes, reparse=reparse) def maintenance_resend(dois, bibcodes, broker=False, only_nonbib=False): """ @@ -352,6 +352,12 @@ def _build_diagnostics(bibcodes=None, json_payloads=None): action='store_true', default=False, help='Update DOI metadata for the provided list of citation target bibcodes, or if none is provided, for all the current existing citation targets.') + maintenance_parser.add_argument( + '--reparse', + dest='reparse', + action='store_true', + default=False, + help='Calls maintenance task to reparse existing metadata.') maintenance_parser.add_argument( '--readers', dest='import_readers', @@ -445,7 +451,7 @@ def _build_diagnostics(bibcodes=None, json_payloads=None): bibcodes = args.bibcodes # Process if args.metadata: - maintenance_metadata(dois, bibcodes) + maintenance_metadata(dois, bibcodes, args.reparse) elif args.canonical: maintenance_canonical(dois, bibcodes) elif args.resend: From 7550b6c35c1e7312d6340df58b77c14d17bd09b5 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Thu, 26 Jun 2025 15:55:10 -0400 Subject: [PATCH 2/4] Add parsing to create orcid_pub field if applicable and remove from affiliations. --- ADSCitationCapture/doi.py | 17 ++++++++++++ ADSCitationCapture/forward.py | 12 ++++++++- .../tests/data/sample_bib_record_orcid.json | 1 + .../data/sample_nonbib_record_orcid.json | 1 + ADSCitationCapture/tests/test_base.py | 27 +++++++++++++++++++ ADSCitationCapture/tests/test_forward.py | 18 +++++++++++++ 6 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 ADSCitationCapture/tests/data/sample_bib_record_orcid.json create mode 100644 ADSCitationCapture/tests/data/sample_nonbib_record_orcid.json diff --git a/ADSCitationCapture/doi.py b/ADSCitationCapture/doi.py index 29f8fce..2481ce5 100644 --- a/ADSCitationCapture/doi.py +++ b/ADSCitationCapture/doi.py @@ -107,6 +107,23 @@ def fetch_metadata(base_doi_url, base_datacite_url, doi): return content if record_found else None +def extract_orcids_from_affs(affiliations): + orcids = [] + stripped_affs = [] + orcid_tag_regex = re.compile('(.*)[^\s]*([\d]{4}-[\d]{4}-[\d]{4}-[\d]{4})<\/ID>') + orcid_tag_regex_legacy = re.compile('(.*)[^\s]*([\d]{4}-[\d]{4}-[\d]{4}-[\d]{4})<\/ORCID>') + + for aff in affiliations: + if orcid_tag_regex.match(aff): + orcids.append(orcid_tag_regex.match(aff).groups()[-1]) + stripped_affs.append(orcid_tag_regex.match(aff).groups()[0]) + elif orcid_tag_regex_legacy.match(aff): + orcids.append(orcid_tag_regex_legacy.match(aff).groups()[-1]) + stripped_affs.append(orcid_tag_regex_legacy.match(aff).groups()[0]) + else: + stripped_affs.append(aff) + orcids.append("-") + return orcids, stripped_affs def build_bibcode(metadata, doi_re, bibstem): """ diff --git a/ADSCitationCapture/forward.py b/ADSCitationCapture/forward.py index 946cbfc..a99fbb0 100644 --- a/ADSCitationCapture/forward.py +++ b/ADSCitationCapture/forward.py @@ -7,6 +7,8 @@ from adsmsg import DenormalizedRecord, NonBibRecord, Status, CitationChangeContentType from bs4 import BeautifulSoup from adsputils import setup_logging +from ADSCitationCapture.doi import extract_orcids_from_affs + # ============================= INITIALIZATION ==================================== # # - Use app logger: @@ -125,7 +127,8 @@ def build_record(app, citation_change, parsed_metadata, citations, db_versions, 'read_count': len(readers), 'title': [title], 'publisher': source, - 'version': version + 'version': version, + 'orcid_pub': [] } if version is None: # Concept DOIs may not contain version del record_dict['version'] @@ -147,6 +150,13 @@ def build_record(app, citation_change, parsed_metadata, citations, db_versions, if is_release: record_dict['property'].append('RELEASE') + if record_dict["aff"] != ["-" for i in range(0, len(record_dict["aff"]))]: + record_dict['orcid_pub'], record_dict['aff'] = extract_orcids_from_affs(record_dict['aff']) + if set(record_dict['orcid_pub'])=={'-'}: + del record_dict['orcid_pub'] + else: + del record_dict['orcid_pub'] + record = DenormalizedRecord(**record_dict) nonbib_record = _build_nonbib_record(app, citation_change, record, db_versions, status, readers=readers) return record, nonbib_record diff --git a/ADSCitationCapture/tests/data/sample_bib_record_orcid.json b/ADSCitationCapture/tests/data/sample_bib_record_orcid.json new file mode 100644 index 0000000..1f64962 --- /dev/null +++ b/ADSCitationCapture/tests/data/sample_bib_record_orcid.json @@ -0,0 +1 @@ +{"abstract": "Cubix is a ROOT based graphical interface providing a large number of tools for gamma-ray spectrocopy analysis like peak fits, calibrations, gamma-gamma analysis... It is linked with the TkN library to provide a direct access to nuclear databases. The source code is available on Gitlab: https://gitlab.in2p3.fr/ip2igamma/cubix/cubix The documentation website : https://cubix.in2p3.fr The Cubix project is governed by the CeCILL-B license under French law and abiding by the rules of distribution of free software. You can use, modify and/or redistribute the software under the terms of the CeCILL-B license as circulated by CEA, CNRS and INRIA at the following link www.cecill.info.", "aff": ["CNRS D\u00e9l\u00e9gation Rh\u00f4ne-Auvergne "], "author": ["Dudouet, J\u00e9r\u00e9mie"], "author_count": 1, "author_facet": ["Dudouet, J"], "author_facet_hier": ["0/Dudouet, J", "1/Dudouet, J/Dudouet, J\u00e9r\u00e9mie"], "author_norm": ["Dudouet, J"], "bibcode": "2023zndo..10683242D", "bibstem": ["zndo"], "bibstem_facet": "zndo", "citation": [""], "citation_count": "1", "database": ["general", "astronomy"], "date": "2023-01-01T00:30:00.000000Z", "doctype": "software", "doctype_facet_hier": ["0/Non-Article", "1/Non-Article/Software"], "doi": ["10.5281/zenodo.11020"], "eid": "10.5281/zenodo.11020", "email": ["-"], "first_author": "Dudouet, J\u00e9r\u00e9mie", "first_author_facet_hier": ["0/Dudouet, J", "1/Dudouet, J/Dudouet, J\u00e9r\u00e9mie"], "first_author_norm": "Dudouet, J", "identifier": ["2023zndo..10683242D", "10.5281/zenodo.11020"], "keyword": ["GUI", "gamma-ray spectroscopy", "ROOT", "fits"], "keyword_facet": ["GUI", "gamma-ray spectroscopy", "ROOT", "fits"], "keyword_norm": ["-", "-", "-", "-"], "keyword_schema": ["-", "-", "-", "-"], "links_data": ["{\"access\": \"\", \"instances\": \"\", \"title\": \"\", \"type\": \"electr\", \"url\": \"https://doi.org/10.5281/zenodo.11020\"}"], "orcid_pub": ["0000-0002-9018-6763"], "property": ["ESOURCE", "NONARTICLE", "NOT REFEREED", "PUB_OPENACCESS", "OPENACCESS", "RELEASE"], "pub": "Zenodo", "pub_raw": "Zenodo", "pubdate": "2023-01-01", "title": ["Cubix"], "year": "2023", "entry_date": "1970-01-01T00:00:00.000000Z", "data_count": 1, "esources": ["PUB_HTML"], "citation_count_norm": 1.0, "publisher": "Zenodo", "version": "1.0"} \ No newline at end of file diff --git a/ADSCitationCapture/tests/data/sample_nonbib_record_orcid.json b/ADSCitationCapture/tests/data/sample_nonbib_record_orcid.json new file mode 100644 index 0000000..0f5df7e --- /dev/null +++ b/ADSCitationCapture/tests/data/sample_nonbib_record_orcid.json @@ -0,0 +1 @@ +{"bibcode": "2023zndo..10683242D", "boost": 0.5, "citation_count": 1, "esource": ["PUB_HTML"], "property": ["ESOURCE", "NONARTICLE", "NOT REFEREED", "PUB_OPENACCESS", "OPENACCESS", "RELEASE"], "data_links_rows": [{"link_type": "ESOURCE", "link_sub_type": "PUB_HTML", "url": ["https://doi.org/10.5281/zenodo.11020"], "title": [""]}], "status": "updated", "citation_count_norm": 1.0} \ No newline at end of file diff --git a/ADSCitationCapture/tests/test_base.py b/ADSCitationCapture/tests/test_base.py index 722311c..7016d8a 100644 --- a/ADSCitationCapture/tests/test_base.py +++ b/ADSCitationCapture/tests/test_base.py @@ -350,3 +350,30 @@ def _init_mock_data(self): 'associated_works': "", 'version': "v2.0.0" }] + self.mock_data["10.5281/zenodo.10683242"] = { + 'raw': ' 10.5281/ZENODO.10683242 Dudouet, Jérémie Jérémie Dudouet 0000-0002-9018-6763 CNRS Délégation Rhône-Auvergne Cubix Zenodo 2024 GUI gamma-ray spectroscopy ROOT fits 2024-02-20 2023 2024 en oai:zenodo.org:10683242 10.5281/zenodo.10255692 10.5281/zenodo.10683241 1.0 CeCILL-B Free Software License Agreement Cubix is a ROOT based graphical interface providing a large number of tools for gamma-ray spectrocopy analysis like peak fits, calibrations, gamma-gamma analysis... It is linked with the TkN library to provide a direct access to nuclear databases. The source code is available on Gitlab: https://gitlab.in2p3.fr/ip2igamma/cubix/cubix The documentation website : https://cubix.in2p3.fr The Cubix project is governed by the CeCILL-B license under French law and abiding by the rules of distribution of free software. You can use, modify and/or redistribute the software under the terms of the CeCILL-B license as circulated by CEA, CNRS and INRIA at the following link www.cecill.info. ', + 'parsed': { + "bibcode": "2023zndo..10683242D", + "authors": ["Dudouet, J\u00e9r\u00e9mie"], + "normalized_authors": ["Dudouet, J"], + "affiliations": ["CNRS D\u00e9l\u00e9gation Rh\u00f4ne-Auvergne 0000-0002-9018-6763"], + "title": "Cubix", + "pubdate": "2023", + "properties": {"DOI": "10.5281/ZENODO.10683242"}, + "keywords": ["GUI", "gamma-ray spectroscopy", "ROOT", "fits"], + "abstract": "Cubix is a ROOT based graphical interface providing a large number of tools for gamma-ray spectrocopy analysis like peak fits, calibrations, gamma-gamma analysis... It is linked with the TkN library to provide a direct access to nuclear databases.\n\nThe source code is available on Gitlab: https://gitlab.in2p3.fr/ip2igamma/cubix/cubix\n\nThe documentation website : https://cubix.in2p3.fr\n\n\n\nThe Cubix project is governed by the CeCILL-B license under French law and abiding by the rules of distribution of free software. You can use, modify and/or redistribute the software under the terms of the CeCILL-B license as circulated by CEA, CNRS and INRIA at the following link www.cecill.info.", + "references": [], + "citations": [], + "doctype": "software", + "version": "1.0", + "versions": [], + "version_of": ["10.5281/zenodo.10683241"], + "forked_from": [], + "forks": [], + "described_by": [], + "description_of": [], + "source": "Zenodo"}, + 'status': 'REGISTERED', + 'content': "10.5281/zenodo.248351", + 'content_type': "DOI", + } diff --git a/ADSCitationCapture/tests/test_forward.py b/ADSCitationCapture/tests/test_forward.py index 79c6200..f04ebd8 100644 --- a/ADSCitationCapture/tests/test_forward.py +++ b/ADSCitationCapture/tests/test_forward.py @@ -52,6 +52,24 @@ def test_build_bib_record_no_associated_works(self): self.assertEqual(bib_record.toJSON(),expect_bib_record) self.assertEqual(nonbib_record.toJSON(),expect_nonbib_record) + def test_build_bib_record_orcid(self): + content_filename = os.path.join(self.app.conf['PROJ_HOME'], "ADSCitationCapture/tests/data/sample_bib_record_orcid.json") + with open(content_filename) as f: + expect_bib_record = json.load(f) + content_filename = os.path.join(self.app.conf['PROJ_HOME'], "ADSCitationCapture/tests/data/sample_nonbib_record_orcid.json") + with open(content_filename) as f: + expect_nonbib_record = json.load(f) + + citation_changes = self._common_citation_changes_doi(adsmsg.Status.updated) + citation_change = tasks._protobuf_to_adsmsg_citation_change(citation_changes.changes[0]) + doi_id = "10.5281/zenodo.10683242" # software + parsed_metadata = self.mock_data[doi_id]['parsed'] + citations =[''] + db_versions = {"":""} + bib_record, nonbib_record = forward.build_record(self.app, citation_change, parsed_metadata, citations, db_versions) + self.assertEqual(bib_record.toJSON(),expect_bib_record) + self.assertEqual(nonbib_record.toJSON(),expect_nonbib_record) + def test_build_bib_record_associated_works(self): content_filename = os.path.join(self.app.conf['PROJ_HOME'], "ADSCitationCapture/tests/data/sample_bib_record_associated.json") with open(content_filename) as f: From b24ff6d1f452f4a7ceddd1da00c72309c35d36d4 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Thu, 26 Jun 2025 16:21:53 -0400 Subject: [PATCH 3/4] Fix raw metadata import. --- ADSCitationCapture/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index 77519de..0f6584d 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -549,7 +549,7 @@ def task_maintenance_metadata(dois, bibcodes, reparse=False): if not reparse: raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'], app.conf['DATACITE_URL'], registered_record['content']) else: - raw_metadata = registered_record['raw_cited_metadata'] + raw_metadata = db.get_citation_target_metadata(app, registered_record['content']) if raw_metadata: parsed_metadata = doi.parse_metadata(raw_metadata) is_software = parsed_metadata.get('doctype', '').lower() == "software" From 4cd83bbf1248c4fcaa68760ff5b2106ef3314ab4 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Thu, 26 Jun 2025 17:03:27 -0400 Subject: [PATCH 4/4] Fix raw-metadata import to actually get raw metadata. --- ADSCitationCapture/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index 0f6584d..68c48e8 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -549,7 +549,7 @@ def task_maintenance_metadata(dois, bibcodes, reparse=False): if not reparse: raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'], app.conf['DATACITE_URL'], registered_record['content']) else: - raw_metadata = db.get_citation_target_metadata(app, registered_record['content']) + raw_metadata = db.get_citation_target_metadata(app, registered_record['content']).get("raw") if raw_metadata: parsed_metadata = doi.parse_metadata(raw_metadata) is_software = parsed_metadata.get('doctype', '').lower() == "software"