diff --git a/src/ingest/save_to_database.py b/src/ingest/save_to_database.py
index e40b91b..b482351 100644
--- a/src/ingest/save_to_database.py
+++ b/src/ingest/save_to_database.py
@@ -48,3 +48,6 @@ def save_entry(entry, using_celery):
     #print(f"I have now saved: {entry['title']}")
 
 
+def update_article(article_id: str, updates: dict):
+    """Apply a partial update ($set) to the stored article with this id."""
+    collection.update_one({"id": article_id}, {"$set": updates})
diff --git a/src/justinsight/__init__.py b/src/justinsight/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/nlp/README.md b/src/nlp/README.md
new file mode 100644
index 0000000..25e7f14
--- /dev/null
+++ b/src/nlp/README.md
@@ -0,0 +1 @@
+To run cli.py: python -m nlp.cli --article-id=abc123
diff --git a/src/nlp/cli.py b/src/nlp/cli.py
new file mode 100644
index 0000000..4a030b1
--- /dev/null
+++ b/src/nlp/cli.py
@@ -0,0 +1,24 @@
+"""Command-line entry point: run NER on a single stored article."""
+import argparse
+
+from nlp.core import process_article
+
+
+def run_cli(article_id: str):
+    # process_article returns None when the article is missing or already processed.
+    entities = process_article(article_id)
+    if entities is not None:
+        print(f"Extracted {len(entities)} entities from article {article_id}")
+    else:
+        print(f"No article found or article was already processed: {article_id}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Run NER on a single article")
+    parser.add_argument("--article-id", required=True, help="ID of the article to process")
+    args = parser.parse_args()
+    run_cli(args.article_id)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/nlp/core.py b/src/nlp/core.py
new file mode 100644
index 0000000..539f5ae
--- /dev/null
+++ b/src/nlp/core.py
@@ -0,0 +1,41 @@
+"""Core NER logic: fetch an article, run the model, persist the entities."""
+from transformers import pipeline
+
+from ingest.save_to_database import collection, update_article
+
+# Loaded once at import time and reused by every call.
+ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
+
+
+def run_ner_hf(text: str):
+    """Return the list of aggregated entity dicts found in *text*."""
+    return ner(text)
+
+
+def process_article(article_id: str):
+    """Run NER on the stored article and persist the result.
+
+    Returns the entity list, or None when the article is missing or was
+    already processed, so callers can distinguish "nothing to do".
+    """
+    article = collection.find_one({"id": article_id})
+
+    if not article:
+        print(f"No article found with ID: {article_id}")
+        return None
+
+    if article.get("processed") is True:
+        print(f"Article {article_id} already processed.")
+        return None
+
+    full_text = article.get("full_text", "")
+    if not full_text:
+        print(f"Article {article_id} has no full text.")
+        return []
+
+    entities = run_ner_hf(full_text)
+
+    # Mark processed so repeat runs are no-ops.
+    update_article(article_id, {"ner": entities, "processed": True})
+
+    return entities
diff --git a/src/nlp/requirements.txt b/src/nlp/requirements.txt
new file mode 100644
index 0000000..1fab7b4
--- /dev/null
+++ b/src/nlp/requirements.txt
@@ -0,0 +1,6 @@
+transformers
+torch
+celery
+click
+pytest
+pymongo
diff --git a/src/nlp/tasks.py b/src/nlp/tasks.py
new file mode 100644
index 0000000..440cff9
--- /dev/null
+++ b/src/nlp/tasks.py
@@ -0,0 +1,19 @@
+"""Celery task wrapper around the core NER pipeline."""
+from celery import Celery
+
+from nlp.core import process_article
+
+app = Celery("justinsight")  # Use your actual Celery config if not centralized here
+
+
+@app.task
+def ner_task(article_id: str):
+    """Run NER on one article; returns its entities (None if nothing to do)."""
+    # process_article fetches the article, runs NER and persists the result,
+    # so the task must not duplicate that work here.
+    entities = process_article(article_id)
+    if entities is None:
+        print(f"Nothing to process for article {article_id}")
+        return None
+    print(f"Processed article {article_id} with {len(entities)} entities.")
+    return entities
diff --git a/src/nlp/tests/test_cli.py b/src/nlp/tests/test_cli.py
new file mode 100644
index 0000000..6e373a8
--- /dev/null
+++ b/src/nlp/tests/test_cli.py
@@ -0,0 +1,9 @@
+from unittest.mock import patch
+
+from nlp.cli import run_cli
+
+def test_run_cli_prints_extracted_message(capfd):
+    with patch("nlp.cli.process_article", return_value=[{"entity_group": "PER"}]):
+        run_cli("dummy-id")
+    out, _ = capfd.readouterr()
+    assert "Extracted 1 entities" in out
diff --git a/src/nlp/tests/test_core.py b/src/nlp/tests/test_core.py
new file mode 100644
index 0000000..147835a
--- /dev/null
+++ b/src/nlp/tests/test_core.py
@@ -0,0 +1,13 @@
+# NOTE: integration test — downloads and runs the real HF model.
+from nlp.core import run_ner_hf
+
+
+def test_run_ner_hf():
+    sample_text = "Grace Madison hates the Mariners."
+    entities = run_ner_hf(sample_text)
+
+    assert isinstance(entities, list)
+    assert all(isinstance(ent, dict) for ent in entities)
+
+    # The model should at least tag "Grace Madison" as a person.
+    assert any(ent.get("entity_group") == 'PER' for ent in entities)
diff --git a/src/nlp/tests/test_tasks.py b/src/nlp/tests/test_tasks.py
new file mode 100644
index 0000000..8738088
--- /dev/null
+++ b/src/nlp/tests/test_tasks.py
@@ -0,0 +1,14 @@
+from unittest.mock import patch
+
+
+# Patch the name bound in nlp.tasks (it was imported there via
+# "from nlp.core import process_article"); patching nlp.core has no effect.
+@patch("nlp.tasks.process_article")
+def test_ner_task_calls_process_article(mock_process_article):
+    mock_process_article.return_value = [{"entity": "PERSON", "word": "Alice"}]
+
+    from nlp.tasks import ner_task
+    result = ner_task("dummy_article_id")
+
+    mock_process_article.assert_called_once_with("dummy_article_id")
+    assert result == [{"entity": "PERSON", "word": "Alice"}]