"""Named-entity recognition over stored articles (src/nlp/core.py).

Provides a lazily-loaded HuggingFace NER pipeline, a helper to run NER on raw
text, and ``process_article`` which performs the full fetch → tag → persist
cycle for one stored article.
"""

from transformers import pipeline

from ingest.save_to_database import collection, update_article

# Lazily-initialised singleton: importing this module (e.g. from the Celery
# task or the CLI) must not pay the model-load/download cost until NER is
# actually executed.
_ner_pipeline = None


def _get_pipeline():
    """Return the shared NER pipeline, creating it on first use."""
    global _ner_pipeline
    if _ner_pipeline is None:
        _ner_pipeline = pipeline(
            "ner", model="dslim/bert-base-NER", aggregation_strategy="simple"
        )
    return _ner_pipeline


def run_ner_hf(text: str) -> list:
    """Run NER on *text* and return a list of plain-dict entities.

    The raw pipeline output contains numpy scalars (``score`` is
    ``numpy.float32``; offsets may be numpy ints) that MongoDB's BSON encoder
    rejects, so every value is normalised to builtin types. Key names match
    the pipeline output (``entity_group``/``word``/``score``/``start``/``end``)
    so existing consumers and tests keep working.
    """
    return [
        {
            "entity_group": ent["entity_group"],
            "word": ent["word"],
            "score": float(ent["score"]),
            "start": int(ent["start"]),
            "end": int(ent["end"]),
        }
        for ent in _get_pipeline()(text)
    ]


def process_article(article_id: str):
    """Fetch an article by id, run NER on its full text, persist the result.

    Returns:
        The list of extracted entities on success, ``[]`` when the article
        exists but has no usable text, or ``None`` when the article is
        missing or already processed. ``None`` is the sentinel cli.py
        distinguishes with ``is not None`` — previously these paths returned
        ``[]`` and the CLI's "No article found" branch was unreachable.
    """
    article = collection.find_one({"id": article_id})

    if not article:
        print(f"No article found with ID: {article_id}")
        return None

    if article.get("processed") is True:
        print(f"Article {article_id} already processed.")
        return None

    full_text = article.get("full_text", "")
    if not full_text:
        print(f"Article {article_id} has no full text.")
        return []

    entities = run_ner_hf(full_text)

    # Persist entities and mark the article so it is not processed twice.
    update_article(article_id, {"ner": entities, "processed": True})

    return entities
"""Celery task entry points for NER processing (src/nlp/tasks.py)."""

from celery import Celery

from nlp.core import process_article

app = Celery("justinsight")  # Use your actual Celery config if not centralized here


@app.task
def ner_task(article_id: str):
    """Run NER for one stored article, asynchronously.

    Thin wrapper around :func:`nlp.core.process_article`, which performs the
    fetch / tag / persist cycle and returns the extracted entities (``None``
    or ``[]`` when there is nothing to do). Delegating fixes the previous
    ``process_article(article_id, {...})`` call — ``process_article`` takes a
    single positional argument, so that call raised ``TypeError`` on every
    task run — and removes the duplicated fetch/NER/update logic and leftover
    debug prints.

    NOTE(review): the old inline implementation fell back to the article's
    ``summary`` when ``full_text`` was missing; ``process_article`` does not.
    Confirm whether that fallback is still wanted and, if so, add it to
    ``process_article`` so both entry points share it.
    """
    entities = process_article(article_id)
    if entities:
        print(f"Processed article {article_id} with {len(entities)} entities.")
    return entities
entities = run_ner_hf(sample_text) + + print("Entities:", entities) + assert isinstance(entities, list) - assert any(ent['entity_group'] == 'PER' for ent in entities) # Example check that a person entity is found + assert all(isinstance(ent, dict) for ent in entities) + + for ent in entities: + print("Entity keys:", ent.keys()) + + assert any(ent.get("entity_group") == 'PER' for ent in entities) # Example check that a person entity is found diff --git a/src/nlp/tests/test_tasks.py b/src/nlp/tests/test_tasks.py index d81c916..4a1bd3a 100644 --- a/src/nlp/tests/test_tasks.py +++ b/src/nlp/tests/test_tasks.py @@ -1,11 +1,19 @@ -import pytest from unittest.mock import patch -from nlp.tasks import ner_task @patch("nlp.core.process_article") -def test_ner_task_calls_process_article(mock_process_article): +@patch("ingest.save_to_database.collection") +def test_ner_task_calls_process_article(mock_collection, mock_process_article): + # Mock the DB find_one call + mock_collection.find_one.return_value = { + "id": "dummy_article_id", + "full_text": "Some article text", + "processed": False + } + mock_process_article.return_value = [{"entity": "PERSON", "word": "Alice"}] - + + from nlp.tasks import ner_task result = ner_task("dummy_article_id") + mock_process_article.assert_called_once_with("dummy_article_id") - assert result == mock_process_article.return_value + assert result == [{"entity": "PERSON", "word": "Alice"}] \ No newline at end of file From 9aa3d99d37fb7d3e38bf8c6429f978736f17c605 Mon Sep 17 00:00:00 2001 From: Chloe Date: Fri, 1 Aug 2025 13:07:25 -0400 Subject: [PATCH 3/3] finished debugging --- src/justinsight/__init__.py | 0 src/nlp/__init__.py | 0 src/nlp/cli.py | 16 ++++++++-------- src/nlp/tests/test_cli.py | 19 ++++++++----------- src/nlp/tests/test_tasks.py | 11 +++++++---- 5 files changed, 23 insertions(+), 23 deletions(-) delete mode 100644 src/justinsight/__init__.py delete mode 100644 src/nlp/__init__.py diff --git a/src/justinsight/__init__.py 
b/src/justinsight/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/nlp/__init__.py b/src/nlp/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/nlp/cli.py b/src/nlp/cli.py index 38bda1d..4a030b1 100644 --- a/src/nlp/cli.py +++ b/src/nlp/cli.py @@ -1,18 +1,18 @@ import argparse from nlp.core import process_article # adjust if your actual import path differs +def run_cli(article_id: str): + entities = process_article(article_id) + if entities is not None: + print(f"Extracted {len(entities)} entities from article {article_id}") + else: + print(f"No article found or article was already processed: {article_id}") + def main(): parser = argparse.ArgumentParser(description="Run NER on a single article") parser.add_argument("--article-id", required=True, help="ID of the article to process") - args = parser.parse_args() - article_id = args.article_id - - entities = process_article(article_id) - if entities is not None: - print(f"✔️ Extracted {len(entities)} entities from article {article_id}") - else: - print(f"⚠️ No article found or article was already processed: {article_id}") + run_cli(args.article_id) if __name__ == "__main__": main() \ No newline at end of file diff --git a/src/nlp/tests/test_cli.py b/src/nlp/tests/test_cli.py index 31e76b0..6e373a8 100644 --- a/src/nlp/tests/test_cli.py +++ b/src/nlp/tests/test_cli.py @@ -1,12 +1,9 @@ -import subprocess -import sys -import pytest +from unittest.mock import patch +from nlp.cli import run_cli + +def test_run_cli_prints_extracted_message(capfd): + with patch("nlp.cli.process_article", return_value=[{"entity_group": "PER"}]): + run_cli("dummy-id") + out, _ = capfd.readouterr() + assert "Extracted 1 entities" in out -def test_cli_runs_successfully(): - result = subprocess.run( - [sys.executable, "-m", "nlp.cli", "--article-id", "dummy-id"], - capture_output=True, - text=True, - ) - assert result.returncode == 0 - assert "Extracted" in result.stdout or "No article" in 
result.stdout diff --git a/src/nlp/tests/test_tasks.py b/src/nlp/tests/test_tasks.py index 4a1bd3a..8738088 100644 --- a/src/nlp/tests/test_tasks.py +++ b/src/nlp/tests/test_tasks.py @@ -1,4 +1,4 @@ -from unittest.mock import patch +from unittest.mock import patch, ANY @patch("nlp.core.process_article") @patch("ingest.save_to_database.collection") @@ -7,7 +7,6 @@ def test_ner_task_calls_process_article(mock_collection, mock_process_article): mock_collection.find_one.return_value = { "id": "dummy_article_id", "full_text": "Some article text", - "processed": False } mock_process_article.return_value = [{"entity": "PERSON", "word": "Alice"}] @@ -15,5 +14,9 @@ def test_ner_task_calls_process_article(mock_collection, mock_process_article): from nlp.tasks import ner_task result = ner_task("dummy_article_id") - mock_process_article.assert_called_once_with("dummy_article_id") - assert result == [{"entity": "PERSON", "word": "Alice"}] \ No newline at end of file + mock_process_article.assert_called_once_with("dummy_article_id", { + "entities": ANY, + "ner_processed": True + }) + + assert result == [] \ No newline at end of file