From 45cddf89a8d5a4203853c992625e28fe220e1af3 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Tue, 24 Jun 2025 19:41:18 -0400 Subject: [PATCH 1/4] Initial take on patching orcid tags. --- ADSCitationCapture/doi.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ADSCitationCapture/doi.py b/ADSCitationCapture/doi.py index 29f8fce..cb0dfc3 100644 --- a/ADSCitationCapture/doi.py +++ b/ADSCitationCapture/doi.py @@ -209,6 +209,24 @@ def _parse_metadata_zenodo_doi(raw_metadata): parsed_metadata['bibcode'] = bibcode return parsed_metadata +def _normalize_orcid_tags(parsed_metadata): + orcid_regex = re.compile('(*)([^\s]*)<\ORCID>(*)') + affs = parsed_metadata.get('affiliations') + corrected_affs = [] + if affs: + for aff in affs: + orcid_id = orcid_regex.match(aff).groups()[0] + if not orcid_id: + corrected_affs.append(aff) + else: + orcid_aff = '' + orcid_id + '' + #TODO This will remove anything outside of the tag from the affiliation. Need to fix. + corrected_affs.append(orcid_aff) + + + + return parsed_metadata + def fetch_all_versions_doi(base_doi_url, base_datacite_url, parsed_metadata): """ Takes zenodo parsed metadata and fetches DOI for base repository as well as DOI for all versions. From df2dfb93b6e113c31a64e62495be6d1766ddad67 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 25 Jun 2025 11:36:55 -0400 Subject: [PATCH 2/4] More detailed regex handling of affiliations with orcids. --- ADSCitationCapture/doi.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/ADSCitationCapture/doi.py b/ADSCitationCapture/doi.py index cb0dfc3..322447a 100644 --- a/ADSCitationCapture/doi.py +++ b/ADSCitationCapture/doi.py @@ -207,22 +207,35 @@ def _parse_metadata_zenodo_doi(raw_metadata): bibcode = build_bibcode(parsed_metadata, zenodo_doi_re, zenodo_bibstem) if bibcode not in (None, ""): parsed_metadata['bibcode'] = bibcode + parsed_metadata = _normalize_orcid_tags(parsed_metadata) return parsed_metadata def _normalize_orcid_tags(parsed_metadata): - orcid_regex = re.compile('(*)([^\s]*)<\ORCID>(*)') + """ + This method takes the parsed metadata from the DataCite parser and converts tags + so they conform to the tags the ADS system expects. + """ + orcid_regex = re.compile('(.*)([^\s]*)') + affs = parsed_metadata.get('affiliations') + corrected_affs = [] + if affs: for aff in affs: - orcid_id = orcid_regex.match(aff).groups()[0] + regex_groups = orcid_regex.match(aff).groups() + orcid_id = regex_groups[-1] if not orcid_id: corrected_affs.append(aff) else: orcid_aff = '' + orcid_id + '' - #TODO This will remove anything outside of the tag from the affiliation. Need to fix. + if len(regex_groups) == 2: + orcid_aff = regex_groups[0] + orcid_aff corrected_affs.append(orcid_aff) + parsed_metadata['affiliations'] = corrected_affs + + return parsed_metadata return parsed_metadata From c50891bdf084f5ce9c5853ff57160318d80766e9 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 25 Jun 2025 13:34:21 -0400 Subject: [PATCH 3/4] Revert changes to move them into pyingest-v1.2.6 --- ADSCitationCapture/doi.py | 31 ----------- .../tests/data/datacite_decoded_orcid.xml | 55 +++++++++++++++++++ .../data/datacite_parsed_metadata_orcid.json | 1 + ADSCitationCapture/tests/test_doi.py | 10 ++++ requirements.txt | 4 +- scripts/support/pytest.sh | 2 +- 6 files changed, 69 insertions(+), 34 deletions(-) create mode 100644 ADSCitationCapture/tests/data/datacite_decoded_orcid.xml create mode 100644 ADSCitationCapture/tests/data/datacite_parsed_metadata_orcid.json diff --git a/ADSCitationCapture/doi.py b/ADSCitationCapture/doi.py index 322447a..29f8fce 100644 --- a/ADSCitationCapture/doi.py +++ b/ADSCitationCapture/doi.py @@ -207,37 +207,6 @@ def _parse_metadata_zenodo_doi(raw_metadata): bibcode = build_bibcode(parsed_metadata, zenodo_doi_re, zenodo_bibstem) if bibcode not in (None, ""): parsed_metadata['bibcode'] = bibcode - parsed_metadata = _normalize_orcid_tags(parsed_metadata) - return parsed_metadata - -def _normalize_orcid_tags(parsed_metadata): - """ - This method takes the parsed metadata from the DataCite parser and converts tags - so they conform to the tags the ADS system expects. - """ - orcid_regex = re.compile('(.*)([^\s]*)') - - affs = parsed_metadata.get('affiliations') - - corrected_affs = [] - - if affs: - for aff in affs: - regex_groups = orcid_regex.match(aff).groups() - orcid_id = regex_groups[-1] - if not orcid_id: - corrected_affs.append(aff) - else: - orcid_aff = '' + orcid_id + '' - if len(regex_groups) == 2: - orcid_aff = regex_groups[0] + orcid_aff - corrected_affs.append(orcid_aff) - - parsed_metadata['affiliations'] = corrected_affs - - return parsed_metadata - - return parsed_metadata def fetch_all_versions_doi(base_doi_url, base_datacite_url, parsed_metadata): diff --git a/ADSCitationCapture/tests/data/datacite_decoded_orcid.xml b/ADSCitationCapture/tests/data/datacite_decoded_orcid.xml new file mode 100644 index 0000000..6b40e75 --- /dev/null +++ b/ADSCitationCapture/tests/data/datacite_decoded_orcid.xml @@ -0,0 +1,55 @@ + + + 10.5281/ZENODO.10683242 + + + Dudouet, Jérémie + Jérémie + Dudouet + 0000-0002-9018-6763 + CNRS Délégation Rhône-Auvergne + + + + Cubix + + Zenodo + 2024 + + + GUI + gamma-ray spectroscopy + ROOT + fits + + + 2024-02-20 + 2023 + 2024 + + en + + oai:zenodo.org:10683242 + + + 10.5281/zenodo.10255692 + 10.5281/zenodo.10683241 + + + + 1.0 + + CeCILL-B Free Software License Agreement + + + Cubix is a ROOT based graphical interface providing a large number of tools for gamma-ray spectrocopy analysis like peak fits, calibrations, gamma-gamma analysis... It is linked with the TkN library to provide a direct access to nuclear databases. + +The source code is available on Gitlab: https://gitlab.in2p3.fr/ip2igamma/cubix/cubix + +The documentation website : https://cubix.in2p3.fr + + + +The Cubix project is governed by the CeCILL-B license under French law and abiding by the rules of distribution of free software. You can use, modify and/or redistribute the software under the terms of the CeCILL-B license as circulated by CEA, CNRS and INRIA at the following link www.cecill.info. + + diff --git a/ADSCitationCapture/tests/data/datacite_parsed_metadata_orcid.json b/ADSCitationCapture/tests/data/datacite_parsed_metadata_orcid.json new file mode 100644 index 0000000..5aeb8a1 --- /dev/null +++ b/ADSCitationCapture/tests/data/datacite_parsed_metadata_orcid.json @@ -0,0 +1 @@ +{"bibcode": "", "authors": ["Dudouet, J\u00e9r\u00e9mie"], "normalized_authors": ["Dudouet, J"], "affiliations": ["CNRS D\u00e9l\u00e9gation Rh\u00f4ne-Auvergne 0000-0002-9018-6763"], "title": "Cubix", "pubdate": "2023", "properties": {"DOI": "10.5281/ZENODO.10683242"}, "keywords": ["GUI", "gamma-ray spectroscopy", "ROOT", "fits"], "abstract": "Cubix is a ROOT based graphical interface providing a large number of tools for gamma-ray spectrocopy analysis like peak fits, calibrations, gamma-gamma analysis... It is linked with the TkN library to provide a direct access to nuclear databases.\n\nThe source code is available on Gitlab: https://gitlab.in2p3.fr/ip2igamma/cubix/cubix\n\nThe documentation website : https://cubix.in2p3.fr\n\n\n\nThe Cubix project is governed by the CeCILL-B license under French law and abiding by the rules of distribution of free software. You can use, modify and/or redistribute the software under the terms of the CeCILL-B license as circulated by CEA, CNRS and INRIA at the following link www.cecill.info.", "references": [], "citations": [], "doctype": "software", "version": "1.0", "versions": [], "version_of": ["10.5281/zenodo.10683241"], "forked_from": [], "forks": [], "described_by": [], "description_of": [], "source": "Zenodo"} \ No newline at end of file diff --git a/ADSCitationCapture/tests/test_doi.py b/ADSCitationCapture/tests/test_doi.py index 1c3abc1..c617b11 100644 --- a/ADSCitationCapture/tests/test_doi.py +++ b/ADSCitationCapture/tests/test_doi.py @@ -62,6 +62,16 @@ def test_parse_metadata(self): parsed_metadata = doi.dc.parse(raw_metadata) self.assertEqual(parsed_metadata, expected_parsed_metadata) + def test_parse_metadata_orcid(self): + datacite_xml_format_filename = os.path.join(self.app.conf['PROJ_HOME'], "ADSCitationCapture/tests/data/datacite_decoded_orcid.xml") + with open(datacite_xml_format_filename, "r") as f: + raw_metadata = "".join(f.readlines()) + datacite_parsed_metadata_filename = os.path.join(self.app.conf['PROJ_HOME'], "ADSCitationCapture/tests/data/datacite_parsed_metadata_orcid.json") + with open(datacite_parsed_metadata_filename, "r") as f: + expected_parsed_metadata = json.loads("".join(f.readlines())) + parsed_metadata = doi.dc.parse(raw_metadata) + self.assertEqual(parsed_metadata, expected_parsed_metadata) + def test_build_bibcode(self): expected_bibcode = "2007zndo.....48535G" datacite_parsed_metadata_filename = os.path.join(self.app.conf['PROJ_HOME'], "ADSCitationCapture/tests/data/datacite_parsed_metadata_and_authors.json") diff --git a/requirements.txt b/requirements.txt index 4c3e2b1..5f6e0a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -git+https://github.com/adsabs/adsabs-pyingest@v1.2.5 +git+https://github.com/adsabs/adsabs-pyingest@v1.2.6 adsputils==1.4.3 psycopg2-binary==2.8.3 alembic==0.9.3 @@ -9,4 +9,4 @@ astropy==5.2.2 portalocker==1.7.1 SQLAlchemy-Utils==0.37.8 unidecode==0.04.21 -setuptools<=56 +setuptools<=56 \ No newline at end of file diff --git a/scripts/support/pytest.sh b/scripts/support/pytest.sh index 63b9fdc..42d3f21 100755 --- a/scripts/support/pytest.sh +++ b/scripts/support/pytest.sh @@ -11,7 +11,7 @@ if [ -e alembic.ini ]; then fi fi -py.test +#py.test echo "For interactive access, run in a diferent terminal:" echo " docker exec -it pytest_citation_capture_pipeline bash" From df7369bbc3712767e6f1290290933ca68ee542b8 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 25 Jun 2025 13:36:23 -0400 Subject: [PATCH 4/4] undo commenting out py.test. --- scripts/support/pytest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/support/pytest.sh b/scripts/support/pytest.sh index 42d3f21..63b9fdc 100755 --- a/scripts/support/pytest.sh +++ b/scripts/support/pytest.sh @@ -11,7 +11,7 @@ if [ -e alembic.ini ]; then fi fi -#py.test +py.test echo "For interactive access, run in a diferent terminal:" echo " docker exec -it pytest_citation_capture_pipeline bash"