From 18cc7209776196fee45c3eaf59cb883a6d980f5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <theo.nardin@cri-paris.org>
Date: Wed, 22 Apr 2026 16:20:08 +0200
Subject: [PATCH 1/6] feat: add DOI extraction and refactor related code in
 plos.py

---
 welearn_datastack/plugins/scrapers/plos.py | 25 +++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/welearn_datastack/plugins/scrapers/plos.py b/welearn_datastack/plugins/scrapers/plos.py
index 56c36dd..90bfa8c 100644
--- a/welearn_datastack/plugins/scrapers/plos.py
+++ b/welearn_datastack/plugins/scrapers/plos.py
@@ -8,6 +8,7 @@
 import requests  # type: ignore
 from bs4 import BeautifulSoup, Tag  # type: ignore
 from requests.adapters import HTTPAdapter  # type: ignore
+from welearn_database.data.enumeration import ExternalIdType
 from welearn_database.data.models import WeLearnDocument
 
 from welearn_datastack.constants import ANTI_URL_REGEX, AUTHORIZED_LICENSES
@@ -70,11 +71,11 @@ def _get_document_details(self, soup: BeautifulSoup) -> Dict[str, Any]:
 
         authors = self._get_authors(article_meta)
 
-        doi_extract = article_meta.find("article-id", {"pub-id-type": "doi"})
+        doi = self.extract_doi(article_meta)
+
         published_id_extract = article_meta.find(
             "article-id", {"pub-id-type": "publisher-id"}
         )
-        doi = "" if not isinstance(doi_extract, Tag) else doi_extract.text
         published_id = (
             ""
             if not isinstance(published_id_extract, Tag)
@@ -82,7 +83,7 @@ def _get_document_details(self, soup: BeautifulSoup) -> Dict[str, Any]:
         )
 
         journal_extract = journal_meta.find("journal-title")
-        journal = "" if not isinstance(journal_extract, Tag) else journal_extract.text
+        journal = self.extract_property(journal_extract)
 
         article_type = self._get_article_type(article_meta)
 
@@ -92,7 +93,7 @@ def _get_document_details(self, soup: BeautifulSoup) -> Dict[str, Any]:
             publication_date = self._generate_timestamp_from_html(pubdate_extract)
 
         issn_extract = journal_meta.find("issn")
-        issn = "" if not isinstance(issn_extract, Tag) else issn_extract.text
+        issn = self.extract_property(issn_extract)
 
         pub_name_extract = journal_meta.find("publisher-name")
         pub_loc_extract = journal_meta.find("publisher-loc")
@@ -118,6 +119,18 @@ def _get_document_details(self, soup: BeautifulSoup) -> Dict[str, Any]:
         }
         return ret
 
+    def extract_doi(self, article_meta: BeautifulSoup) -> str:
+        """Return the article DOI without its resolver prefix, or an empty string."""
+        doi_extract = article_meta.find("article-id", {"pub-id-type": "doi"})
+        doi = self.extract_property(doi_extract)
+        # removeprefix drops only the leading resolver URL, never a later match.
+        return doi.removeprefix("https://doi.org/")
+
+    @staticmethod
+    def extract_property(doi_extract: BeautifulSoup | None) -> str:
+        """Return the text of any looked-up element, or an empty string if not a Tag."""
+        return doi_extract.text if isinstance(doi_extract, Tag) else ""
+
     @staticmethod
     def _handle_license(article_meta):
         """
@@ -172,7 +185,7 @@ def _get_authors(article_meta):
             for name_part in name_tag.children:
                 if name_part.text != "\n":
                     name += name_part.text + " "
-            name.strip()
+            name = name.strip()
             author["name"] = clean_return_to_line(name)
 
             # Misc
@@ -259,6 +272,8 @@ def extract_data_from_plos_xml(
         document.description = clean_return_to_line(doc_desc)
         document.full_content = clean_doc_content
         document.details = self._get_document_details(soup=soup)
+        document.external_id = self.extract_doi(article_meta)
+        document.external_id_type = ExternalIdType.DOI
 
         return document
 

From 74a8a2be8b1f904a881215e0eae3f34d495ba95a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <theo.nardin@cri-paris.org>
Date: Wed, 22 Apr 2026 16:57:54 +0200
Subject: [PATCH 2/6] feat: enhance OpenAlex document processing with detailed
 metadata extraction and validation

---
 .../plugins_test/test_open_alex.py            |   4 +-
 .../plugins/rest_requesters/open_alex.py      | 255 ++++++++++++------
 2 files changed, 181 insertions(+), 78 deletions(-)

diff --git a/tests/document_collector_hub/plugins_test/test_open_alex.py b/tests/document_collector_hub/plugins_test/test_open_alex.py
index cac8b87..dc64559 100644
--- a/tests/document_collector_hub/plugins_test/test_open_alex.py
+++ b/tests/document_collector_hub/plugins_test/test_open_alex.py
@@ -33,7 +33,7 @@
 
 def build_openalex_result(
     url: str = "https://openalex.org/W123",
-    doi: str = "10.1234/example",
+    doi: str = "https://doi.org/10.1234/example",
     title: str = "Sample Title",
 ):
     ids = Ids(openalex=url, doi=doi, mag="", pmid="", pmcid="")
@@ -216,6 +216,8 @@ def test_update_welearn_document_returns_expected_document(self, mock_pdf):
             doc.details["authors"][0]["name"],
             openalex_result.authorships[0].author.display_name,
         )
+        self.assertEqual(doc.external_id, "10.1234/example")
+        self.assertEqual(doc.external_id_type, "doi")
 
     # Test _update_welearn_document raises on closed access
     @patch("welearn_datastack.plugins.rest_requesters.open_alex.get_new_https_session")
diff --git a/welearn_datastack/plugins/rest_requesters/open_alex.py b/welearn_datastack/plugins/rest_requesters/open_alex.py
index 34df448..cae6ca5 100644
--- a/welearn_datastack/plugins/rest_requesters/open_alex.py
+++ b/welearn_datastack/plugins/rest_requesters/open_alex.py
@@ -5,9 +5,10 @@
 from dataclasses import asdict
 from datetime import datetime
 from itertools import batched
-from typing import Iterable
+from typing import Any, Iterable
 from urllib.parse import urlparse
 
+from welearn_database.data.enumeration import ExternalIdType
 from welearn_database.data.models import WeLearnDocument
 
 from welearn_datastack.constants import (
@@ -69,10 +70,11 @@ def __init__(self) -> None:
         self.team_email = team_email
 
     @staticmethod
-    def _invert_abstract(inv_index: dict[str, list[int]]) -> str:
+    def _invert_abstract(inv_index: dict[str, list[int]]) -> str | None:
         if inv_index is not None:
             l_inv = [(w, p) for w, pos in inv_index.items() for p in pos]
-            return " ".join(map(lambda x: x[0], sorted(l_inv, key=lambda x: x[1])))
+            return " ".join([x[0] for x in sorted(l_inv, key=lambda x: x[1])])
+        return None
 
     @staticmethod
     def _extract_openalex_id_from_urls(urls: Iterable[str]) -> list[str]:
@@ -183,44 +185,98 @@ def _remove_useless_first_word(
         return string_to_clear
 
     def _update_welearn_document(self, wrapper: WrapperRawData) -> WeLearnDocument:
-        document_title = wrapper.raw_data.title
         document_url = wrapper.raw_data.ids.openalex
         logger.info(f"Process {document_url}...")
-        document_desc = self._remove_useless_first_word(
-            string_to_clear=self._invert_abstract(
-                wrapper.raw_data.abstract_inverted_index
-            ),
-            useless_words=["background", "abstract", "introduction"],
-        )
+        self._check_publisher_authorization(wrapper)
+        self._check_access(document_url, wrapper)
+        self._check_license(document_url, wrapper)
+        logger.info(f"The content {document_url} is legally usable")
 
-        work_locations: list[Location] = wrapper.raw_data.locations
-        host_ids = []
-        for location in work_locations:
-            host_organization_lineage_malformed: list[str] = (
-                location.source.host_organization_lineage
+        document_desc = self.build_description(wrapper)
+        document_content, pdf_flag = self._resolve_full_content(document_desc, wrapper)
+        document_details = self._build_details(document_url, pdf_flag, wrapper)
+        wrapper.document.title = wrapper.raw_data.title
+        wrapper.document.description = document_desc
+        wrapper.document.content = document_content
+        wrapper.document.details = document_details
+        wrapper.document.external_id = self._get_doi(wrapper)
+        wrapper.document.external_id_type = ExternalIdType.DOI
+
+        return wrapper.document
+
+    def _build_details(
+        self,
+        document_url: str | None,
+        pdf_flag: bool,
+        wrapper: WrapperRawData,
+    ) -> dict[str, Any]:
+        """
+        Build the details of the document in a dict format expected by the WeLearn DB
+
+        :param document_url: URL of the document, only passed on to _get_licence here
+        :param pdf_flag: flag indicating if the content of the document is from the PDF or not (stored under "content_from_pdf")
+        :param wrapper: WrapperRawData containing the raw data of the document to build the details from
+        :return: dict containing the details of the document in the format expected by the WeLearn DB
+
+        NOTE(review): best_oa_location is read without a None check; presumably guaranteed by the earlier open-access check, to confirm
+        """
+        document_details = {
+            "publication_date": self._build_publication_date(wrapper),
+            "type": wrapper.raw_data.type,
+            "doi": self._get_doi(wrapper),
+            "publisher": wrapper.raw_data.best_oa_location.source.host_organization_name,
+            "license_url": self._get_licence(document_url, wrapper),
+            "issn": wrapper.raw_data.best_oa_location.source.issn_l,
+            "content_from_pdf": pdf_flag,
+            "topics": [
+                asdict(t) for t in self._transform_topics(wrapper.raw_data.topics)
+            ],
+            "tags": [x.display_name for x in wrapper.raw_data.keywords],
+            "referenced_works": wrapper.raw_data.referenced_works,
+            "related_works": wrapper.raw_data.related_works,
+            "authors": self._build_authors_list(wrapper),
+        }
+        return document_details
+
+    @staticmethod
+    def _get_doi(wrapper: WrapperRawData) -> str | None:
+        """Return the DOI without its resolver prefix, or None when it is missing."""
+        doi = wrapper.raw_data.ids.doi
+        # Guard first: a missing DOI (allowed by the return type) has no str methods.
+        return doi.removeprefix("https://doi.org/") if doi else doi
+
+    @staticmethod
+    def _build_authors_list(wrapper: WrapperRawData) -> list[Any]:
+        authors = []
+        for author_info in wrapper.raw_data.authorships:
+            authors.append(
+                {
+                    "name": author_info.author.display_name,
+                    "misc": ",".join(author_info.raw_affiliation_strings),
+                }
             )
-            if (
-                host_organization_lineage_malformed is None
-                or len(host_organization_lineage_malformed) == 0
-            ):
-                continue
-            try:
-                host_organization_lineage = self._extract_openalex_id_from_urls(
-                    host_organization_lineage_malformed
-                )
-                host_ids.extend(host_organization_lineage)
-            except ManagementExceptions as e:
-                logger.warning(
-                    f"Cannot extract host organization lineage from {location.source.host_organization_lineage}: {e}"
-                )
-                continue
+        return authors
 
-        avoiding_ids = PUBLISHERS_TO_AVOID
-        for host_id in host_ids:
-            if host_id.upper() in avoiding_ids:
-                raise UnauthorizedPublisher(f"{host_id} is not authorized in welearn")
+    @staticmethod
+    def _build_publication_date(wrapper: WrapperRawData) -> int:
+        """Return the publication date of the work as a whole-second timestamp."""
+        # NOTE(review): presumably naive, so timestamp() uses local time; confirm.
+        parsed = datetime.strptime(
+            wrapper.raw_data.publication_date, YEAR_FIRST_DATE_FORMAT
+        )
+        return int(parsed.timestamp())
 
+    def _resolve_full_content(
+        self, document_desc: str | Any, wrapper: WrapperRawData
+    ) -> tuple[str | Any, bool]:
+        """
+        Get the full content of the document. If the PDF is available and can be retrieved, extract the content from the PDF. Otherwise, use the description as the content.
+        :param document_desc: Description of the document to use as content if the PDF is not available or cannot be retrieved
+        :param wrapper: WrapperRawData containing the raw data of the document to get the content from
+        :return: tuple (content, pdf_flag): the content of the document, then a flag indicating if the content is from the PDF
+        """
         document_content = document_desc
+
         if wrapper.raw_data.best_oa_location.pdf_url is None:
             pdf_flag = False
         else:
@@ -240,65 +296,110 @@ def _update_welearn_document(self, wrapper: WrapperRawData) -> WeLearnDocument:
                     f"PDF retrievement error, use description as content: {e}"
                 )
                 pdf_flag = False
+        return document_content, pdf_flag
 
-        publication_date = int(
-            datetime.strptime(
-                wrapper.raw_data.publication_date, YEAR_FIRST_DATE_FORMAT
-            ).timestamp()
+    def build_description(self, wrapper: WrapperRawData) -> str | Any:
+        document_desc = self._remove_useless_first_word(
+            string_to_clear=self._invert_abstract(
+                wrapper.raw_data.abstract_inverted_index
+            ),
+            useless_words=["background", "abstract", "introduction"],
         )
+        return document_desc
 
-        authors = []
-        for author_info in wrapper.raw_data.authorships:
-            authors.append(
-                {
-                    "name": author_info.author.display_name,
-                    "misc": ",".join(author_info.raw_affiliation_strings),
-                }
-            )
-
+    @staticmethod
+    def _check_access(document_url: str, wrapper: WrapperRawData):
+        """
+        Check if the document is open access. If not, raise a ClosedAccessContent exception
+        :param document_url: URL of the document to check
+        :param wrapper: WrapperRawData containing the raw data of the document to check
+        :exception ClosedAccessContent: If the document is not open access
+        """
         if not wrapper.raw_data.open_access.is_oa:
             raise ClosedAccessContent()
         else:
             logger.info(f"The content {document_url} is open access")
 
-        best_oa_location_info = wrapper.raw_data.best_oa_location
+    def _check_license(self, document_url: str, wrapper: WrapperRawData):
+        """
+        Check if the license of the document is in the list of authorized licenses. If not, raise an UnauthorizedLicense exception
+        :param document_url: URL of the document to check
+        :param wrapper: WrapperRawData containing the raw data of the document to check
+        :exception UnauthorizedLicense: If the license of the document is not in the list of authorized licenses
+        """
+        license_good_format = self._get_licence(document_url, wrapper)
 
-        # Open Alex format is cc-by...
-        license_openalex_format: str = best_oa_location_info.license
+        if license_good_format.lower() not in AUTHORIZED_LICENSES:
+            raise UnauthorizedLicense(f"{license_good_format.lower()} is not allowed")
 
-        if not license_openalex_format.startswith("cc"):
-            raise UnauthorizedLicense()
+    @staticmethod
+    def _get_licence(document_url: str | None, wrapper: WrapperRawData) -> str:
+        """
+        Get the license of the document in a good format (https://creativecommons.org/licenses/xxx/4.0/) from the raw data of the document. If the license is not in the expected format, log a warning and return it in lowercase.
+        :param document_url: URL of the document to get the license from (used for logging purposes)
+        :param wrapper: WrapperRawData containing the raw data of the document to get the license from
+        :return: License of the document in a good format (https://creativecommons.org/licenses/xxx/4.0/) if it is in the expected format, otherwise in lowercase
+        """
+        best_oa_location_info = wrapper.raw_data.best_oa_location
+        license_openalex_format: str = best_oa_location_info.license
+        if license_openalex_format is None:
+            logger.warning(
+                f"No license found for {document_url}, set it to empty string"
+            )
+            return ""
 
-        logger.info(f"The content {document_url} is legally usable")
+        if not license_openalex_format.startswith("cc-"):
+            logger.warning(
+                f"License {license_openalex_format} of {document_url} is not in the expected format, set it to lowercase"
+            )
+            return license_openalex_format.lower()
 
         license_good_format = f"{HTTPS_CREATIVE_COMMONS}/licenses/{license_openalex_format.replace('cc-', '')}/4.0/"
+        return license_good_format
 
-        if license_good_format.lower() not in AUTHORIZED_LICENSES:
-            raise UnauthorizedLicense(f"{license_good_format.lower()} is not allowed")
+    def _check_publisher_authorization(self, wrapper: WrapperRawData):
+        """
+        Check if the publisher of the document is authorized to be used in WeLearn. If not, raise an UnauthorizedPublisher exception
 
-        logger.info(f"The content {document_url} is legally usable")
+        :param wrapper: WrapperRawData containing the raw data of the document to check
+        :exception UnauthorizedPublisher: If the publisher is not authorized to be used in WeLearn
+        """
+        work_locations: list[Location] = wrapper.raw_data.locations
+        host_ids = self.get_host_ids(work_locations)
 
-        document_details = {
-            "publication_date": publication_date,
-            "type": wrapper.raw_data.type,
-            "doi": wrapper.raw_data.ids.doi,
-            "publisher": wrapper.raw_data.best_oa_location.source.host_organization_name,
-            "license_url": license_good_format,
-            "issn": wrapper.raw_data.best_oa_location.source.issn_l,
-            "content_from_pdf": pdf_flag,
-            "topics": [
-                asdict(t) for t in self._transform_topics(wrapper.raw_data.topics)
-            ],
-            "tags": [x.display_name for x in wrapper.raw_data.keywords],
-            "referenced_works": wrapper.raw_data.referenced_works,
-            "related_works": wrapper.raw_data.related_works,
-            "authors": authors,
-        }
-        wrapper.document.title = document_title
-        wrapper.document.description = document_desc
-        wrapper.document.content = document_content
-        wrapper.document.details = document_details
-        return wrapper.document
+        avoiding_ids = PUBLISHERS_TO_AVOID
+        for host_id in host_ids:
+            if host_id.upper() in avoiding_ids:
+                raise UnauthorizedPublisher(f"{host_id} is not authorized in welearn")
+
+    def get_host_ids(self, work_locations: list[Location]) -> list[str]:
+        """
+        Get the host organization lineage from the work locations and extract the OpenAlex IDs from it. Locations whose lineage is None or empty are ignored. If the IDs cannot be extracted (ManagementExceptions), log a warning and skip that location.
+
+        :param work_locations: list of Location objects containing the host organization lineage to extract the OpenAlex IDs from
+        :return: list of OpenAlex IDs extracted from the host organization lineage
+        """
+        host_ids = []
+        for location in work_locations:
+            host_organization_lineage_malformed: list[str] | None = (
+                location.source.host_organization_lineage
+            )
+            if (
+                host_organization_lineage_malformed is None
+                or len(host_organization_lineage_malformed) == 0
+            ):
+                continue
+            try:
+                host_organization_lineage = self._extract_openalex_id_from_urls(
+                    host_organization_lineage_malformed
+                )
+                host_ids.extend(host_organization_lineage)
+            except ManagementExceptions as e:
+                logger.warning(
+                    f"Cannot extract host organization lineage from {location.source.host_organization_lineage}: {e}"
+                )
+                continue
+        return host_ids
 
     def run(self, documents: list[WeLearnDocument]) -> list[WrapperRetrieveDocument]:
         ret: list[WrapperRetrieveDocument] = []

From a07ae71581ad6980280cf84958f9ba9839d17c33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <theo.nardin@cri-paris.org>
Date: Wed, 22 Apr 2026 16:59:05 +0200
Subject: [PATCH 3/6] chore: bump python-dotenv to ^1.2.1

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8ce6e40..b8acdd7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,7 @@ sentence-transformers = "^5.2.2"
 spacy = "^3.8.11"
 refinedoc = "^1.0.0"
 qdrant-client = "1.16.2"
-python-dotenv = "^1.1.0"
+python-dotenv = "^1.2.1"
 beautifulsoup4 = "^4.14.3"
 pyphen = "^0.17.2"
 ijson = "^3.4.0"

From ceb20e1ca05c06825d16d04919b984b961adf3b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <theo.nardin@cri-paris.org>
Date: Wed, 22 Apr 2026 17:40:49 +0200
Subject: [PATCH 4/6] feat: update exception handling for version numbers and
 add DOI validation

---
 welearn_datastack/exceptions.py             | 10 +++++++---
 welearn_datastack/plugins/scrapers/peerj.py |  8 +++++++-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/welearn_datastack/exceptions.py b/welearn_datastack/exceptions.py
index f5d148d..a0edca3 100644
--- a/welearn_datastack/exceptions.py
+++ b/welearn_datastack/exceptions.py
@@ -113,7 +113,7 @@ def __init__(self, message="Invalid language code, must be lower ISO-639-1 code"
         super().__init__(self.message)
 
 
-class VersionNumberError(BaseException):
+class VersionNumberError(Exception):
     """Raised when an invalid version number is used"""
 
     def __init__(self, message="Invalid version number, must be an integer"):
@@ -121,7 +121,7 @@ def __init__(self, message="Invalid version number, must be an integer"):
         super().__init__(self.message)
 
 
-class NoPreviousCollectionError(BaseException):
+class NoPreviousCollectionError(Exception):
     """Raised when there is no previous collection"""
 
     def __init__(self, message="No previous collection found"):
@@ -129,7 +129,7 @@ def __init__(self, message="No previous collection found"):
         super().__init__(self.message)
 
 
-class NoConnectedCollectionError(BaseException):
+class NoConnectedCollectionError(Exception):
     """Raised when there is no connected collection"""
 
     def __init__(self, message="No connected collection found"):
@@ -238,3 +238,7 @@ def __init__(self, msg="No title found in this document", *args):
 
 class NoDescriptionFoundError(NotEnoughData):
     """Raised when there is no description found"""
+
+
+class NoDOIFoundError(NotEnoughData):
+    """Raised when there is no DOI found"""
diff --git a/welearn_datastack/plugins/scrapers/peerj.py b/welearn_datastack/plugins/scrapers/peerj.py
index 1b65b55..eebe26a 100644
--- a/welearn_datastack/plugins/scrapers/peerj.py
+++ b/welearn_datastack/plugins/scrapers/peerj.py
@@ -6,11 +6,12 @@
 import requests  # type: ignore
 from bs4 import BeautifulSoup, Tag  # type: ignore
 from requests.adapters import HTTPAdapter  # type: ignore
+from welearn_database.data.enumeration import ExternalIdType
 from welearn_database.data.models import WeLearnDocument
 
 from welearn_datastack.constants import AUTHORIZED_LICENSES
 from welearn_datastack.data.db_wrapper import WrapperRetrieveDocument
-from welearn_datastack.exceptions import UnauthorizedLicense
+from welearn_datastack.exceptions import NoDOIFoundError, UnauthorizedLicense
 from welearn_datastack.plugins.interface import IPluginScrapeCollector
 from welearn_datastack.utils_.http_client_utils import (
     get_http_code_from_exception,
@@ -235,6 +236,11 @@ def _scrape_url(self, document: WeLearnDocument) -> WeLearnDocument:
         document.description = description
         document.full_content = content_bs_txt
         document.details = self._get_document_details(soup=soup)
+        doi = document.details.get("doi", None)
+        if not doi:
+            raise NoDOIFoundError(f"No DOI found for '{document.url}'")
+        document.external_id = doi
+        document.external_id_type = ExternalIdType.DOI
 
         return document
 

From a4627315defa9f37a795834a7067633816b48446 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <theo.nardin@cri-paris.org>
Date: Wed, 22 Apr 2026 17:46:15 +0200
Subject: [PATCH 5/6] feat: add external ID and type assertions in PeerJ
 document tests

---
 .../document_collector_hub/plugins_test/test_scraping_peerj.py  | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/document_collector_hub/plugins_test/test_scraping_peerj.py b/tests/document_collector_hub/plugins_test/test_scraping_peerj.py
index 9520b39..6723ade 100644
--- a/tests/document_collector_hub/plugins_test/test_scraping_peerj.py
+++ b/tests/document_collector_hub/plugins_test/test_scraping_peerj.py
@@ -54,6 +54,8 @@ def test_plugin_run_success(self, mock_get_session):
         self.assertTrue(doc_result.document.title)
         self.assertTrue(doc_result.document.description)
         self.assertTrue(doc_result.document.full_content)
+        self.assertEqual(doc_result.document.external_id, "10.7717/peerj.12713")
+        self.assertEqual(doc_result.document.external_id_type, "doi")
         self.assertIsInstance(doc_result.document.details, dict)
         self.assertIn("license_url", doc_result.document.details)
         self.assertIn("authors", doc_result.document.details)

From f65ea1680d2a58a732a6506f9980d6ab0bb587a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <133012334+lpi-tn@users.noreply.github.com>
Date: Thu, 23 Apr 2026 16:29:58 +0200
Subject: [PATCH 6/6] fix(open_alex): default a missing abstract to an empty
 string in welearn_datastack/plugins/rest_requesters/open_alex.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 welearn_datastack/plugins/rest_requesters/open_alex.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/welearn_datastack/plugins/rest_requesters/open_alex.py b/welearn_datastack/plugins/rest_requesters/open_alex.py
index cae6ca5..ee03a15 100644
--- a/welearn_datastack/plugins/rest_requesters/open_alex.py
+++ b/welearn_datastack/plugins/rest_requesters/open_alex.py
@@ -302,7 +302,8 @@ def build_description(self, wrapper: WrapperRawData) -> str | Any:
         document_desc = self._remove_useless_first_word(
             string_to_clear=self._invert_abstract(
                 wrapper.raw_data.abstract_inverted_index
-            ),
+            )
+            or "",
             useless_words=["background", "abstract", "introduction"],
         )
         return document_desc