From 18cc7209776196fee45c3eaf59cb883a6d980f5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Wed, 22 Apr 2026 16:20:08 +0200 Subject: [PATCH 1/6] feat: add DOI extraction and refactor related code in plos.py --- welearn_datastack/plugins/scrapers/plos.py | 25 +++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/welearn_datastack/plugins/scrapers/plos.py b/welearn_datastack/plugins/scrapers/plos.py index 56c36dd..90bfa8c 100644 --- a/welearn_datastack/plugins/scrapers/plos.py +++ b/welearn_datastack/plugins/scrapers/plos.py @@ -8,6 +8,7 @@ import requests # type: ignore from bs4 import BeautifulSoup, Tag # type: ignore from requests.adapters import HTTPAdapter # type: ignore +from welearn_database.data.enumeration import ExternalIdType from welearn_database.data.models import WeLearnDocument from welearn_datastack.constants import ANTI_URL_REGEX, AUTHORIZED_LICENSES @@ -70,11 +71,11 @@ def _get_document_details(self, soup: BeautifulSoup) -> Dict[str, Any]: authors = self._get_authors(article_meta) - doi_extract = article_meta.find("article-id", {"pub-id-type": "doi"}) + doi = self.extract_doi(article_meta) + published_id_extract = article_meta.find( "article-id", {"pub-id-type": "publisher-id"} ) - doi = "" if not isinstance(doi_extract, Tag) else doi_extract.text published_id = ( "" if not isinstance(published_id_extract, Tag) @@ -82,7 +83,7 @@ def _get_document_details(self, soup: BeautifulSoup) -> Dict[str, Any]: ) journal_extract = journal_meta.find("journal-title") - journal = "" if not isinstance(journal_extract, Tag) else journal_extract.text + journal = self.extract_property(journal_extract) article_type = self._get_article_type(article_meta) @@ -92,7 +93,7 @@ def _get_document_details(self, soup: BeautifulSoup) -> Dict[str, Any]: publication_date = self._generate_timestamp_from_html(pubdate_extract) issn_extract = journal_meta.find("issn") - issn = "" if not isinstance(issn_extract, Tag) else issn_extract.text + issn = self.extract_property(issn_extract) pub_name_extract = journal_meta.find("publisher-name") pub_loc_extract = journal_meta.find("publisher-loc") @@ -118,6 +119,18 @@ def _get_document_details(self, soup: BeautifulSoup) -> Dict[str, Any]: } return ret + def extract_doi(self, article_meta: BeautifulSoup) -> str: + doi_extract = article_meta.find("article-id", {"pub-id-type": "doi"}) + doi = self.extract_property(doi_extract) + if doi.startswith("https://doi.org/"): + doi = doi.replace("https://doi.org/", "") + return doi + + @staticmethod + def extract_property(doi_extract: BeautifulSoup | None) -> str: + doi = "" if not isinstance(doi_extract, Tag) else doi_extract.text + return doi + @staticmethod def _handle_license(article_meta): """ @@ -172,7 +185,7 @@ def _get_authors(article_meta): for name_part in name_tag.children: if name_part.text != "\n": name += name_part.text + " " - name.strip() + name = name.strip() author["name"] = clean_return_to_line(name) # Misc @@ -259,6 +272,8 @@ def extract_data_from_plos_xml( document.description = clean_return_to_line(doc_desc) document.full_content = clean_doc_content document.details = self._get_document_details(soup=soup) + document.external_id = self.extract_doi(article_meta) + document.external_id_type = ExternalIdType.DOI return document From 74a8a2be8b1f904a881215e0eae3f34d495ba95a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Wed, 22 Apr 2026 16:57:54 +0200 Subject: [PATCH 2/6] feat: enhance OpenAlex document processing with detailed metadata extraction and validation --- .../plugins_test/test_open_alex.py | 4 +- .../plugins/rest_requesters/open_alex.py | 255 ++++++++++++------ 2 files changed, 181 insertions(+), 78 deletions(-) diff --git a/tests/document_collector_hub/plugins_test/test_open_alex.py b/tests/document_collector_hub/plugins_test/test_open_alex.py index cac8b87..dc64559 100644 --- a/tests/document_collector_hub/plugins_test/test_open_alex.py +++ b/tests/document_collector_hub/plugins_test/test_open_alex.py @@ -33,7 +33,7 @@ def build_openalex_result( url: str = "https://openalex.org/W123", - doi: str = "10.1234/example", + doi: str = "https://doi.org/10.1234/example", title: str = "Sample Title", ): ids = Ids(openalex=url, doi=doi, mag="", pmid="", pmcid="") @@ -216,6 +216,8 @@ def test_update_welearn_document_returns_expected_document(self, mock_pdf): doc.details["authors"][0]["name"], openalex_result.authorships[0].author.display_name, ) + self.assertEqual(doc.external_id, "10.1234/example") + self.assertEqual(doc.external_id_type, "doi") # Test _update_welearn_document raises on closed access @patch("welearn_datastack.plugins.rest_requesters.open_alex.get_new_https_session") diff --git a/welearn_datastack/plugins/rest_requesters/open_alex.py b/welearn_datastack/plugins/rest_requesters/open_alex.py index 34df448..cae6ca5 100644 --- a/welearn_datastack/plugins/rest_requesters/open_alex.py +++ b/welearn_datastack/plugins/rest_requesters/open_alex.py @@ -5,9 +5,10 @@ from dataclasses import asdict from datetime import datetime from itertools import batched -from typing import Iterable +from typing import Any, Iterable from urllib.parse import urlparse +from welearn_database.data.enumeration import ExternalIdType from welearn_database.data.models import WeLearnDocument from welearn_datastack.constants import ( @@ -69,10 +70,11 @@ def __init__(self) -> None: self.team_email = team_email @staticmethod - def _invert_abstract(inv_index: dict[str, list[int]]) -> str: + def _invert_abstract(inv_index: dict[str, list[int]]) -> str | None: if inv_index is not None: l_inv = [(w, p) for w, pos in inv_index.items() for p in pos] - return " ".join(map(lambda x: x[0], sorted(l_inv, key=lambda x: x[1]))) + return " ".join([x[0] for x in sorted(l_inv, key=lambda x: x[1])]) + return None @staticmethod def _extract_openalex_id_from_urls(urls: Iterable[str]) -> list[str]: @@ -183,44 +185,98 @@ def _remove_useless_first_word( return string_to_clear def _update_welearn_document(self, wrapper: WrapperRawData) -> WeLearnDocument: - document_title = wrapper.raw_data.title document_url = wrapper.raw_data.ids.openalex logger.info(f"Process {document_url}...") - document_desc = self._remove_useless_first_word( - string_to_clear=self._invert_abstract( - wrapper.raw_data.abstract_inverted_index - ), - useless_words=["background", "abstract", "introduction"], - ) + self._check_publisher_authorization(wrapper) + self._check_access(document_url, wrapper) + self._check_license(document_url, wrapper) + logger.info(f"The content {document_url} is legally usable") - work_locations: list[Location] = wrapper.raw_data.locations - host_ids = [] - for location in work_locations: - host_organization_lineage_malformed: list[str] = ( - location.source.host_organization_lineage + document_desc = self.build_description(wrapper) + document_content, pdf_flag = self._resolve_full_content(document_desc, wrapper) + document_details = self._build_details(document_url, pdf_flag, wrapper) + wrapper.document.title = wrapper.raw_data.title + wrapper.document.description = document_desc + wrapper.document.content = document_content + wrapper.document.details = document_details + wrapper.document.external_id = self._get_doi(wrapper) + wrapper.document.external_id_type = ExternalIdType.DOI + + return wrapper.document + + def _build_details( + self, + document_url: str | None, + pdf_flag: str | Any, + wrapper: WrapperRawData, + ) -> dict[ + str | Any, + int | str | None | list[dict[str, Any]] | list[str | None] | list[str] | Any, + ]: + """ + Build the details of the document in a dict format expected by the WeLearn DB + :param document_url: URL of the document to build the details for (used for logging purposes) + :param pdf_flag: flag indicating if the content of the document is from the PDF or not (used for logging purposes) + :param wrapper: WrapperRawData containing the raw data of the document to build the details from + :return: dict containing the details of the document in the format expected by the WeLearn DB + """ + document_details = { + "publication_date": self._build_publication_date(wrapper), + "type": wrapper.raw_data.type, + "doi": self._get_doi(wrapper), + "publisher": wrapper.raw_data.best_oa_location.source.host_organization_name, + "license_url": self._get_licence(document_url, wrapper), + "issn": wrapper.raw_data.best_oa_location.source.issn_l, + "content_from_pdf": pdf_flag, + "topics": [ + asdict(t) for t in self._transform_topics(wrapper.raw_data.topics) + ], + "tags": [x.display_name for x in wrapper.raw_data.keywords], + "referenced_works": wrapper.raw_data.referenced_works, + "related_works": wrapper.raw_data.related_works, + "authors": self._build_authors_list(wrapper), + } + return document_details + + @staticmethod + def _get_doi(wrapper: WrapperRawData) -> str | None: + doi = wrapper.raw_data.ids.doi + if doi.startswith("https://doi.org/"): + doi = doi.replace("https://doi.org/", "") + return doi + + @staticmethod + def _build_authors_list(wrapper: WrapperRawData) -> list[Any]: + authors = [] + for author_info in wrapper.raw_data.authorships: + authors.append( + { + "name": author_info.author.display_name, + "misc": ",".join(author_info.raw_affiliation_strings), + } ) - if ( - host_organization_lineage_malformed is None - or len(host_organization_lineage_malformed) == 0 - ): - continue - try: - host_organization_lineage = self._extract_openalex_id_from_urls( - host_organization_lineage_malformed - ) - host_ids.extend(host_organization_lineage) - except ManagementExceptions as e: - logger.warning( - f"Cannot extract host organization lineage from {location.source.host_organization_lineage}: {e}" - ) - continue + return authors - avoiding_ids = PUBLISHERS_TO_AVOID - for host_id in host_ids: - if host_id.upper() in avoiding_ids: - raise UnauthorizedPublisher(f"{host_id} is not authorized in welearn") + @staticmethod + def _build_publication_date(wrapper: WrapperRawData) -> int: + publication_date = int( + datetime.strptime( + wrapper.raw_data.publication_date, YEAR_FIRST_DATE_FORMAT + ).timestamp() + ) + return publication_date + def _resolve_full_content( + self, document_desc: str | Any, wrapper: WrapperRawData + ) -> tuple[bool, str | Any]: + """ + Get the full content of the document. If the PDF is available and can be retrieved, extract the content from the PDF. Otherwise, use the description as the content. + :param document_desc: Description of the document to use as content if the PDF is not available or cannot be retrieved + :param wrapper: WrapperRawData containing the raw data of the document to get the content from + :return: tuple containing a flag indicating if the content is from the PDF and the content of the document + """ document_content = document_desc + if wrapper.raw_data.best_oa_location.pdf_url is None: pdf_flag = False else: @@ -240,65 +296,110 @@ def _update_welearn_document(self, wrapper: WrapperRawData) -> WeLearnDocument: f"PDF retrievement error, use description as content: {e}" ) pdf_flag = False + return document_content, pdf_flag - publication_date = int( - datetime.strptime( - wrapper.raw_data.publication_date, YEAR_FIRST_DATE_FORMAT - ).timestamp() + def build_description(self, wrapper: WrapperRawData) -> str | Any: + document_desc = self._remove_useless_first_word( + string_to_clear=self._invert_abstract( + wrapper.raw_data.abstract_inverted_index + ), + useless_words=["background", "abstract", "introduction"], ) + return document_desc - authors = [] - for author_info in wrapper.raw_data.authorships: - authors.append( - { - "name": author_info.author.display_name, - "misc": ",".join(author_info.raw_affiliation_strings), - } - ) - + @staticmethod + def _check_access(document_url: str, wrapper: WrapperRawData): + """ + Check if the document is open access. If not, raise a ClosedAccessContent exception + :param document_url: URL of the document to check + :param wrapper: WrapperRawData containing the raw data of the document to check + :exception ClosedAccessContent: If the document is not open access + """ if not wrapper.raw_data.open_access.is_oa: raise ClosedAccessContent() else: logger.info(f"The content {document_url} is open access") - best_oa_location_info = wrapper.raw_data.best_oa_location + def _check_license(self, document_url: str, wrapper: WrapperRawData): + """ + Check if the license of the document is in the list of authorized licenses. If not, raise an UnauthorizedLicense exception + :param document_url: URL of the document to check + :param wrapper: WrapperRawData containing the raw data of the document to check + :exception UnauthorizedLicense: If the license of the document is not in the list of authorized licenses + """ + license_good_format = self._get_licence(document_url, wrapper) - # Open Alex format is cc-by... - license_openalex_format: str = best_oa_location_info.license + if license_good_format.lower() not in AUTHORIZED_LICENSES: + raise UnauthorizedLicense(f"{license_good_format.lower()} is not allowed") - if not license_openalex_format.startswith("cc"): - raise UnauthorizedLicense() + @staticmethod + def _get_licence(document_url: str | None, wrapper: WrapperRawData) -> str: + """ + Get the license of the document in a good format (https://creativecommons.org/licenses/xxx/4.0/) from the raw data of the document. If the license is not in the expected format, log a warning and return it in lowercase. + :param document_url: URL of the document to get the license from (used for logging purposes) + :param wrapper: WrapperRawData containing the raw data of the document to get the license from + :return: License of the document in a good format (https://creativecommons.org/licenses/xxx/4.0/) if it is in the expected format, otherwise in lowercase + """ + best_oa_location_info = wrapper.raw_data.best_oa_location + license_openalex_format: str = best_oa_location_info.license + if license_openalex_format is None: + logger.warning( + f"No license found for {document_url}, set it to empty string" + ) + return "" - logger.info(f"The content {document_url} is legally usable") + if not license_openalex_format.startswith("cc-"): + logger.warning( + f"License {license_openalex_format} of {document_url} is not in the expected format, set it to lowercase" + ) + return license_openalex_format.lower() license_good_format = f"{HTTPS_CREATIVE_COMMONS}/licenses/{license_openalex_format.replace('cc-', '')}/4.0/" + return license_good_format - if license_good_format.lower() not in AUTHORIZED_LICENSES: - raise UnauthorizedLicense(f"{license_good_format.lower()} is not allowed") + def _check_publisher_authorization(self, wrapper: WrapperRawData): + """ + Check if the publisher of the document is authorized to be used in WeLearn. If not, raise an UnauthorizedPublisher exception - logger.info(f"The content {document_url} is legally usable") + :param wrapper: WrapperRawData containing the raw data of the document to check + :exception UnauthorizedPublisher: If the publisher is not authorized to be used in WeLearn + """ + work_locations: list[Location] = wrapper.raw_data.locations + host_ids = self.get_host_ids(work_locations) - document_details = { - "publication_date": publication_date, - "type": wrapper.raw_data.type, - "doi": wrapper.raw_data.ids.doi, - "publisher": wrapper.raw_data.best_oa_location.source.host_organization_name, - "license_url": license_good_format, - "issn": wrapper.raw_data.best_oa_location.source.issn_l, - "content_from_pdf": pdf_flag, - "topics": [ - asdict(t) for t in self._transform_topics(wrapper.raw_data.topics) - ], - "tags": [x.display_name for x in wrapper.raw_data.keywords], - "referenced_works": wrapper.raw_data.referenced_works, - "related_works": wrapper.raw_data.related_works, - "authors": authors, - } - wrapper.document.title = document_title - wrapper.document.description = document_desc - wrapper.document.content = document_content - wrapper.document.details = document_details - return wrapper.document + avoiding_ids = PUBLISHERS_TO_AVOID + for host_id in host_ids: + if host_id.upper() in avoiding_ids: + raise UnauthorizedPublisher(f"{host_id} is not authorized in welearn") + + def get_host_ids(self, work_locations: list[Location]) -> list[Any]: + """ + Get the host organization lineage from the work locations and extract the OpenAlex IDs from it. If the host organization lineage is not in the expected format, log a warning and skip it. + + :param work_locations: list of Location objects containing the host organization lineage to extract the OpenAlex IDs from + :return: list of OpenAlex IDs extracted from the host organization lineage + """ + host_ids = [] + for location in work_locations: + host_organization_lineage_malformed: list[str] = ( + location.source.host_organization_lineage + ) + if ( + host_organization_lineage_malformed is None + or len(host_organization_lineage_malformed) == 0 + ): + continue + try: + host_organization_lineage = self._extract_openalex_id_from_urls( + host_organization_lineage_malformed + ) + host_ids.extend(host_organization_lineage) + except ManagementExceptions as e: + logger.warning( + f"Cannot extract host organization lineage from {location.source.host_organization_lineage}: {e}" + ) + continue + return host_ids def run(self, documents: list[WeLearnDocument]) -> list[WrapperRetrieveDocument]: ret: list[WrapperRetrieveDocument] = [] From a07ae71581ad6980280cf84958f9ba9839d17c33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Wed, 22 Apr 2026 16:59:05 +0200 Subject: [PATCH 3/6] bump version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8ce6e40..b8acdd7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ sentence-transformers = "^5.2.2" spacy = "^3.8.11" refinedoc = "^1.0.0" qdrant-client = "1.16.2" -python-dotenv = "^1.1.0" +python-dotenv = "^1.2.1" beautifulsoup4 = "^4.14.3" pyphen = "^0.17.2" ijson = "^3.4.0" From ceb20e1ca05c06825d16d04919b984b961adf3b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Wed, 22 Apr 2026 17:40:49 +0200 Subject: [PATCH 4/6] feat: update exception handling for version numbers and add DOI validation --- welearn_datastack/exceptions.py | 10 +++++++--- welearn_datastack/plugins/scrapers/peerj.py | 8 +++++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/welearn_datastack/exceptions.py b/welearn_datastack/exceptions.py index f5d148d..a0edca3 100644 --- a/welearn_datastack/exceptions.py +++ b/welearn_datastack/exceptions.py @@ -113,7 +113,7 @@ def __init__(self, message="Invalid language code, must be lower ISO-639-1 code" super().__init__(self.message) -class VersionNumberError(BaseException): +class VersionNumberError(Exception): """Raised when an invalid version number is used""" def __init__(self, message="Invalid version number, must be an integer"): @@ -121,7 +121,7 @@ def __init__(self, message="Invalid version number, must be an integer"): super().__init__(self.message) -class NoPreviousCollectionError(BaseException): +class NoPreviousCollectionError(Exception): """Raised when there is no previous collection""" def __init__(self, message="No previous collection found"): @@ -129,7 +129,7 @@ def __init__(self, message="No previous collection found"): super().__init__(self.message) -class NoConnectedCollectionError(BaseException): +class NoConnectedCollectionError(Exception): """Raised when there is no connected collection""" def __init__(self, message="No connected collection found"): @@ -238,3 +238,7 @@ def __init__(self, msg="No title found in this document", *args): class NoDescriptionFoundError(NotEnoughData): """Raised when there is no description found""" + + +class NoDOIFoundError(NotEnoughData): + """Raised when there is no DOI found""" diff --git a/welearn_datastack/plugins/scrapers/peerj.py b/welearn_datastack/plugins/scrapers/peerj.py index 1b65b55..eebe26a 100644 --- a/welearn_datastack/plugins/scrapers/peerj.py +++ b/welearn_datastack/plugins/scrapers/peerj.py @@ -6,11 +6,12 @@ import requests # type: ignore from bs4 import BeautifulSoup, Tag # type: ignore from requests.adapters import HTTPAdapter # type: ignore +from welearn_database.data.enumeration import ExternalIdType from welearn_database.data.models import WeLearnDocument from welearn_datastack.constants import AUTHORIZED_LICENSES from welearn_datastack.data.db_wrapper import WrapperRetrieveDocument -from welearn_datastack.exceptions import UnauthorizedLicense +from welearn_datastack.exceptions import NoDOIFoundError, UnauthorizedLicense from welearn_datastack.plugins.interface import IPluginScrapeCollector from welearn_datastack.utils_.http_client_utils import ( get_http_code_from_exception, @@ -235,6 +236,11 @@ def _scrape_url(self, document: WeLearnDocument) -> WeLearnDocument: document.description = description document.full_content = content_bs_txt document.details = self._get_document_details(soup=soup) + doi = document.details.get("doi", None) + if not doi: + raise NoDOIFoundError(f"No DOI found for '{document.url}'") + document.external_id = doi + document.external_id_type = ExternalIdType.DOI return document From a4627315defa9f37a795834a7067633816b48446 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Wed, 22 Apr 2026 17:46:15 +0200 Subject: [PATCH 5/6] feat: add external ID and type assertions in PeerJ document tests --- .../document_collector_hub/plugins_test/test_scraping_peerj.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/document_collector_hub/plugins_test/test_scraping_peerj.py b/tests/document_collector_hub/plugins_test/test_scraping_peerj.py index 9520b39..6723ade 100644 --- a/tests/document_collector_hub/plugins_test/test_scraping_peerj.py +++ b/tests/document_collector_hub/plugins_test/test_scraping_peerj.py @@ -54,6 +54,8 @@ def test_plugin_run_success(self, mock_get_session): self.assertTrue(doc_result.document.title) self.assertTrue(doc_result.document.description) self.assertTrue(doc_result.document.full_content) + self.assertEqual(doc_result.document.external_id, "10.7717/peerj.12713") + self.assertEqual(doc_result.document.external_id_type, "doi") self.assertIsInstance(doc_result.document.details, dict) self.assertIn("license_url", doc_result.document.details) self.assertIn("authors", doc_result.document.details) From f65ea1680d2a58a732a6506f9980d6ab0bb587a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= <133012334+lpi-tn@users.noreply.github.com> Date: Thu, 23 Apr 2026 16:29:58 +0200 Subject: [PATCH 6/6] Update welearn_datastack/plugins/rest_requesters/open_alex.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- welearn_datastack/plugins/rest_requesters/open_alex.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/welearn_datastack/plugins/rest_requesters/open_alex.py b/welearn_datastack/plugins/rest_requesters/open_alex.py index cae6ca5..ee03a15 100644 --- a/welearn_datastack/plugins/rest_requesters/open_alex.py +++ b/welearn_datastack/plugins/rest_requesters/open_alex.py @@ -302,7 +302,8 @@ def build_description(self, wrapper: WrapperRawData) -> str | Any: document_desc = self._remove_useless_first_word( string_to_clear=self._invert_abstract( wrapper.raw_data.abstract_inverted_index - ), + ) + or "", useless_words=["background", "abstract", "introduction"], ) return document_desc