"""Named-entity recognition over stored articles (src/nlp/core.py).

Provides a lazily-loaded HuggingFace NER pipeline, a helper to run NER on raw
text, and ``process_article`` which performs the full fetch → tag → persist
cycle for one stored article.
"""

from transformers import pipeline

from ingest.save_to_database import collection, update_article

# Lazily-initialised singleton: importing this module (e.g. from the Celery
# task or the CLI) must not pay the model-load/download cost until NER is
# actually executed.
_ner_pipeline = None


def _get_pipeline():
    """Return the shared NER pipeline, creating it on first use."""
    global _ner_pipeline
    if _ner_pipeline is None:
        _ner_pipeline = pipeline(
            "ner", model="dslim/bert-base-NER", aggregation_strategy="simple"
        )
    return _ner_pipeline


def run_ner_hf(text: str) -> list:
    """Run NER on *text* and return a list of plain-dict entities.

    The raw pipeline output contains numpy scalars (``score`` is
    ``numpy.float32``; offsets may be numpy ints) that MongoDB's BSON encoder
    rejects, so every value is normalised to builtin types. Key names match
    the pipeline output (``entity_group``/``word``/``score``/``start``/``end``)
    so existing consumers and tests keep working.
    """
    return [
        {
            "entity_group": ent["entity_group"],
            "word": ent["word"],
            "score": float(ent["score"]),
            "start": int(ent["start"]),
            "end": int(ent["end"]),
        }
        for ent in _get_pipeline()(text)
    ]


def process_article(article_id: str):
    """Fetch an article by id, run NER on its full text, persist the result.

    Returns:
        The list of extracted entities on success, ``[]`` when the article
        exists but has no usable text, or ``None`` when the article is
        missing or already processed. ``None`` is the sentinel cli.py
        distinguishes with ``is not None`` — previously these paths returned
        ``[]`` and the CLI's "No article found" branch was unreachable.
    """
    article = collection.find_one({"id": article_id})

    if not article:
        print(f"No article found with ID: {article_id}")
        return None

    if article.get("processed") is True:
        print(f"Article {article_id} already processed.")
        return None

    full_text = article.get("full_text", "")
    if not full_text:
        print(f"Article {article_id} has no full text.")
        return []

    entities = run_ner_hf(full_text)

    # Persist entities and mark the article so it is not processed twice.
    update_article(article_id, {"ner": entities, "processed": True})

    return entities
"""Celery task entry points for NER processing (src/nlp/tasks.py)."""

from celery import Celery

from nlp.core import process_article

app = Celery("justinsight")  # Use your actual Celery config if not centralized here


@app.task
def ner_task(article_id: str):
    """Run NER for one stored article, asynchronously.

    Thin wrapper around :func:`nlp.core.process_article`, which performs the
    fetch / tag / persist cycle and returns the extracted entities (``None``
    or ``[]`` when there is nothing to do). Delegating fixes the previous
    ``process_article(article_id, {...})`` call — ``process_article`` takes a
    single positional argument, so that call raised ``TypeError`` on every
    task run — and removes the duplicated fetch/NER/update logic and leftover
    debug prints.

    NOTE(review): the old inline implementation fell back to the article's
    ``summary`` when ``full_text`` was missing; ``process_article`` does not.
    Confirm whether that fallback is still wanted and, if so, add it to
    ``process_article`` so both entry points share it.
    """
    entities = process_article(article_id)
    if entities:
        print(f"Processed article {article_id} with {len(entities)} entities.")
    return entities
entities = run_ner_hf(sample_text) + + print("Entities:", entities) + assert isinstance(entities, list) - assert any(ent['entity_group'] == 'PER' for ent in entities) # Example check that a person entity is found + assert all(isinstance(ent, dict) for ent in entities) + + for ent in entities: + print("Entity keys:", ent.keys()) + + assert any(ent.get("entity_group") == 'PER' for ent in entities) # Example check that a person entity is found diff --git a/src/nlp/tests/test_tasks.py b/src/nlp/tests/test_tasks.py index d81c916..4a1bd3a 100644 --- a/src/nlp/tests/test_tasks.py +++ b/src/nlp/tests/test_tasks.py @@ -1,11 +1,19 @@ -import pytest from unittest.mock import patch -from nlp.tasks import ner_task @patch("nlp.core.process_article") -def test_ner_task_calls_process_article(mock_process_article): +@patch("ingest.save_to_database.collection") +def test_ner_task_calls_process_article(mock_collection, mock_process_article): + # Mock the DB find_one call + mock_collection.find_one.return_value = { + "id": "dummy_article_id", + "full_text": "Some article text", + "processed": False + } + mock_process_article.return_value = [{"entity": "PERSON", "word": "Alice"}] - + + from nlp.tasks import ner_task result = ner_task("dummy_article_id") + mock_process_article.assert_called_once_with("dummy_article_id") - assert result == mock_process_article.return_value + assert result == [{"entity": "PERSON", "word": "Alice"}] \ No newline at end of file From 9aa3d99d37fb7d3e38bf8c6429f978736f17c605 Mon Sep 17 00:00:00 2001 From: Chloe Date: Fri, 1 Aug 2025 13:07:25 -0400 Subject: [PATCH 3/3] finished debugging --- src/justinsight/__init__.py | 0 src/nlp/__init__.py | 0 src/nlp/cli.py | 16 ++++++++-------- src/nlp/tests/test_cli.py | 19 ++++++++----------- src/nlp/tests/test_tasks.py | 11 +++++++---- 5 files changed, 23 insertions(+), 23 deletions(-) delete mode 100644 src/justinsight/__init__.py delete mode 100644 src/nlp/__init__.py diff --git a/src/justinsight/__init__.py 
b/src/justinsight/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/nlp/__init__.py b/src/nlp/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/nlp/cli.py b/src/nlp/cli.py index 38bda1d..4a030b1 100644 --- a/src/nlp/cli.py +++ b/src/nlp/cli.py @@ -1,18 +1,18 @@ import argparse from nlp.core import process_article # adjust if your actual import path differs +def run_cli(article_id: str): + entities = process_article(article_id) + if entities is not None: + print(f"Extracted {len(entities)} entities from article {article_id}") + else: + print(f"No article found or article was already processed: {article_id}") + def main(): parser = argparse.ArgumentParser(description="Run NER on a single article") parser.add_argument("--article-id", required=True, help="ID of the article to process") - args = parser.parse_args() - article_id = args.article_id - - entities = process_article(article_id) - if entities is not None: - print(f"✔️ Extracted {len(entities)} entities from article {article_id}") - else: - print(f"⚠️ No article found or article was already processed: {article_id}") + run_cli(args.article_id) if __name__ == "__main__": main() \ No newline at end of file diff --git a/src/nlp/tests/test_cli.py b/src/nlp/tests/test_cli.py index 31e76b0..6e373a8 100644 --- a/src/nlp/tests/test_cli.py +++ b/src/nlp/tests/test_cli.py @@ -1,12 +1,9 @@ -import subprocess -import sys -import pytest +from unittest.mock import patch +from nlp.cli import run_cli + +def test_run_cli_prints_extracted_message(capfd): + with patch("nlp.cli.process_article", return_value=[{"entity_group": "PER"}]): + run_cli("dummy-id") + out, _ = capfd.readouterr() + assert "Extracted 1 entities" in out -def test_cli_runs_successfully(): - result = subprocess.run( - [sys.executable, "-m", "nlp.cli", "--article-id", "dummy-id"], - capture_output=True, - text=True, - ) - assert result.returncode == 0 - assert "Extracted" in result.stdout or "No article" in 
result.stdout diff --git a/src/nlp/tests/test_tasks.py b/src/nlp/tests/test_tasks.py index 4a1bd3a..8738088 100644 --- a/src/nlp/tests/test_tasks.py +++ b/src/nlp/tests/test_tasks.py @@ -1,4 +1,4 @@ -from unittest.mock import patch +from unittest.mock import patch, ANY @patch("nlp.core.process_article") @patch("ingest.save_to_database.collection") @@ -7,7 +7,6 @@ def test_ner_task_calls_process_article(mock_collection, mock_process_article): mock_collection.find_one.return_value = { "id": "dummy_article_id", "full_text": "Some article text", - "processed": False } mock_process_article.return_value = [{"entity": "PERSON", "word": "Alice"}] @@ -15,5 +14,9 @@ def test_ner_task_calls_process_article(mock_collection, mock_process_article): from nlp.tasks import ner_task result = ner_task("dummy_article_id") - mock_process_article.assert_called_once_with("dummy_article_id") - assert result == [{"entity": "PERSON", "word": "Alice"}] \ No newline at end of file + mock_process_article.assert_called_once_with("dummy_article_id", { + "entities": ANY, + "ner_processed": True + }) + + assert result == [] \ No newline at end of file