27 changes: 27 additions & 0 deletions src/ingest/ap_ingestor.py
@@ -0,0 +1,27 @@
from playwright.sync_api import sync_playwright
from ingest.base_ingestor import BaseIngestor

class APIngestor(BaseIngestor):
    RSS_URL = "https://news.google.com/rss/search?q=when:24h+allinurl:apnews.com&hl=en-US&gl=US&ceid=US:en"

    def fetch_full_text(self, article_url):
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                page = browser.new_page()
                page.goto(article_url, timeout=15000)

                # Wait for the main article body to load
                page.wait_for_selector('div.RichTextStoryBody', timeout=5000)

                # Extract the text content from the paragraphs inside the body
                content = page.query_selector_all('div.RichTextStoryBody p')
                full_text = "\n".join(para.inner_text() for para in content)

                browser.close()
                return full_text.strip()

        except Exception as e:
            print(f"Playwright error fetching {article_url}: {e}")
            return ""

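Note: base_ingestor.py itself is not part of this diff, so its exact interface is not shown here. Judging from the subclasses, each ingestor supplies a RSS_URL class attribute and overrides fetch_full_text(article_url). A minimal sketch of what such a base class might look like follows; it is hypothetical, including the fetch_entries helper name, and is not taken from the actual implementation.

# Hypothetical sketch of src/ingest/base_ingestor.py; the real file is not in this diff.
import feedparser

class BaseIngestor:
    RSS_URL = None  # subclasses set the feed URL

    def fetch_entries(self):
        # Parse the subclass's RSS feed and return its entries (illustrative helper)
        return feedparser.parse(self.RSS_URL).entries

    def fetch_full_text(self, article_url):
        # Subclasses override this with a site-specific scraper
        raise NotImplementedError
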
42 changes: 42 additions & 0 deletions src/ingest/cbs_ingestor.py
@@ -0,0 +1,42 @@
from bs4 import BeautifulSoup
import requests
from ingest.base_ingestor import BaseIngestor

class CBSIngestor(BaseIngestor):
    RSS_URL = "https://www.cbsnews.com/latest/rss/world"

    def fetch_full_text(self, article_url):
        try:
            headers = {"User-Agent": "Mozilla/5.0"}
            resp = requests.get(article_url, headers=headers, timeout=10)
            resp.raise_for_status()

            soup = BeautifulSoup(resp.content, 'html.parser')

            # Step 1: Find the <h1> (article title)
            h1 = soup.find('h1')
            if not h1:
                print(f"[warn] No <h1> tag found in {article_url}")
                return ""

            # Step 2: Walk up parent nodes until we find one with enough <p> tags
            root = h1
            while root and root.name != 'body':
                paragraphs = root.find_all('p')
                if len(paragraphs) >= 5:
                    break
                root = root.parent

            if root is None or root.name == 'body':
                print(f"[warn] Couldn't find content container in {article_url}")
                return ""

            # Step 3: Extract text from heading/paragraph tags under that container
            tags = root.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p'])
            text = "\n".join(tag.get_text(strip=True) for tag in tags)

            return text.strip()

        except Exception as e:
            print(f"[error] {e} while scraping {article_url}")
            return ""
134 changes: 134 additions & 0 deletions src/ingest/cbs_rss.py
@@ -0,0 +1,134 @@
import feedparser
import json
import os
import re
import time
import datetime
import requests
from bs4 import BeautifulSoup
import hashlib

INTERVAL = 3600  # seconds (1 hour)
HASHES = './data/raw/feed_saved_hashes.json'

os.makedirs('./data/raw/cbs/', exist_ok=True)

def fetch_feed():
    # Download and parse the feed
    return feedparser.parse('https://www.cbsnews.com/latest/rss/world')

def slugify(text):
    # Convert title to a filesystem-friendly slug
    text = text.lower()
    text = re.sub(r'[^a-z0-9]+', '-', text)
    return text.strip('-')

def format_date(entry):
    # Extract and format the published date
    try:
        dt = datetime.datetime(*entry.published_parsed[:6])
        return dt.strftime("%Y-%m-%d")
    except Exception:
        return "unknown-date"

def fetch_full_article(url):
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.content, 'html.parser')

        # Step 1: Find the <h1> (article title)
        h1 = soup.find('h1')
        if not h1:
            print(f"[warn] No <h1> tag found in {url}")
            return ""

        # Step 2: Walk up parent nodes until we find one with enough <p> tags
        root = h1
        while root and root.name != 'body':
            paragraphs = root.find_all('p')
            if len(paragraphs) >= 5:
                break
            root = root.parent

        if root is None or root.name == 'body':
            print(f"[warn] Couldn't find content container in {url}")
            return ""

        # Step 3: Extract text from heading/paragraph tags under that container
        tags = root.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p'])
        text = "\n".join(tag.get_text(strip=True) for tag in tags)

        return text.strip()

    except Exception as e:
        print(f"[error] {e} while scraping {url}")
        return ""

def load_saved_hashes():
    if os.path.exists(HASHES):
        with open(HASHES, 'r', encoding='utf-8') as f:
            return set(json.load(f))
    return set()

def save_hashes(hashes):
    with open(HASHES, 'w', encoding='utf-8') as f:
        json.dump(list(hashes), f, indent=2)

def generate_entry_hash(entry):
    hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
    return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()

def save_entry(entry):
    saved_hashes = load_saved_hashes()
    entry_hash = generate_entry_hash(entry)

    if entry_hash in saved_hashes:
        return False  # Already saved

    # Save the entry as a JSON file
    title_slug = slugify(entry.title)
    date_str = format_date(entry)
    filename = f"feed_{date_str}_{title_slug}.json"
    filepath = os.path.join('./data/raw/cbs/', filename)
    full_text = fetch_full_article(entry.link)

    data = {
        "title": entry.title,
        "link": entry.link,
        "published": entry.get("published", ""),
        "summary": entry.get("summary", ""),
        "full_text": full_text
    }

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    saved_hashes.add(entry_hash)
    save_hashes(saved_hashes)

    return True

def check_and_save_new_entries():
    feed = fetch_feed()
    new_count = 0

    for entry in feed.entries:
        saved = save_entry(entry)
        if saved:
            new_count += 1

    print(f"Saved {new_count} new entries.")

if __name__ == '__main__':
    import sys

    if '--once' in sys.argv:
        check_and_save_new_entries()
    else:
        while True:
            check_and_save_new_entries()
            time.sleep(INTERVAL)

30 changes: 30 additions & 0 deletions src/ingest/latimes_ingestor.py
@@ -0,0 +1,30 @@
from bs4 import BeautifulSoup
import requests
from ingest.base_ingestor import BaseIngestor

class LATIMESIngestor(BaseIngestor):
    RSS_URL = "https://www.latimes.com/world/rss2.0.xml"

    def fetch_full_text(self, article_url):
        try:
            headers = {
                "User-Agent": "Mozilla/5.0"
            }
            response = requests.get(article_url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            # LA Times articles usually keep the body under <section name="article-body"> or similar
            article_body = soup.find('section', attrs={'name': 'article-body'})
            if not article_body:
                article_body = soup.find('div', class_='rich-text-article-body')  # fallback

            if not article_body:
                return ""

            paragraphs = article_body.find_all('p')
            full_text = "\n".join(p.get_text(strip=True) for p in paragraphs)
            return full_text.strip()

        except Exception as e:
            print(f"Error fetching LA Times article {article_url}: {e}")
            return ""
123 changes: 123 additions & 0 deletions src/ingest/latimes_rss.py
@@ -0,0 +1,123 @@

import feedparser
import json
import os
import re
import time
import datetime
from bs4 import BeautifulSoup
import requests
import hashlib

INTERVAL = 3600  # seconds (1 hour)
HASHES = './data/raw/feed_saved_hashes.json'

os.makedirs('./data/raw/latimes/', exist_ok=True)

def fetch_feed():
    # Download and parse the feed
    return feedparser.parse('https://www.latimes.com/world/rss2.0.xml')

def slugify(text):
    # Convert title to a filesystem-friendly slug
    text = text.lower()
    text = re.sub(r'[^a-z0-9]+', '-', text)
    return text.strip('-')

def format_date(entry):
    # Extract and format the published date
    try:
        dt = datetime.datetime(*entry.published_parsed[:6])
        return dt.strftime("%Y-%m-%d")
    except Exception:
        return "unknown-date"

def fetch_full_article(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0"
        }
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')

        # LA Times articles usually keep the body under <section name="article-body"> or similar
        article_body = soup.find('section', attrs={'name': 'article-body'})
        if not article_body:
            article_body = soup.find('div', class_='rich-text-article-body')  # fallback

        if not article_body:
            return ""

        paragraphs = article_body.find_all('p')
        full_text = "\n".join(p.get_text(strip=True) for p in paragraphs)
        return full_text.strip()

    except Exception as e:
        print(f"Error fetching LA Times article {url}: {e}")
        return ""

def load_saved_hashes():
    if os.path.exists(HASHES):
        with open(HASHES, 'r', encoding='utf-8') as f:
            return set(json.load(f))
    return set()

def save_hashes(hashes):
    with open(HASHES, 'w', encoding='utf-8') as f:
        json.dump(list(hashes), f, indent=2)

def generate_entry_hash(entry):
    hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
    return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()

def save_entry(entry):
    saved_hashes = load_saved_hashes()
    entry_hash = generate_entry_hash(entry)

    if entry_hash in saved_hashes:
        return False  # Already saved

    # Save the entry as a JSON file
    title_slug = slugify(entry.title)
    date_str = format_date(entry)
    filename = f"feed_{date_str}_{title_slug}.json"
    filepath = os.path.join('./data/raw/latimes/', filename)
    full_text = fetch_full_article(entry.link)

    data = {
        "title": entry.title,
        "link": entry.link,
        "published": entry.get("published", ""),
        "summary": entry.get("summary", ""),
        "full_text": full_text
    }

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    saved_hashes.add(entry_hash)
    save_hashes(saved_hashes)

    return True

def check_and_save_new_entries():
    feed = fetch_feed()
    new_count = 0

    for entry in feed.entries:
        saved = save_entry(entry)
        if saved:
            new_count += 1

    print(f"Saved {new_count} new entries.")

if __name__ == '__main__':
    import sys

    if '--once' in sys.argv:
        check_and_save_new_entries()
    else:
        while True:
            check_and_save_new_entries()
            time.sleep(INTERVAL)

32 changes: 32 additions & 0 deletions src/ingest/nbc_ingestor.py
@@ -0,0 +1,32 @@
from bs4 import BeautifulSoup
import requests
from ingest.base_ingestor import BaseIngestor

class NBCIngestor(BaseIngestor):
    RSS_URL = "http://feeds.nbcnews.com/feeds/worldnews"

    def fetch_full_text(self, article_url):
        try:
            headers = {
                "User-Agent": "Mozilla/5.0"
            }
            response = requests.get(article_url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            # NBC usually puts article content in divs with the class 'article-body__content'
            content_div = soup.find('div', class_='article-body__content')

            if not content_div:
                # Fallback: some older articles use this container
                content_div = soup.find('div', {'data-testid': 'article-body'})

            if not content_div:
                return ""

            paragraphs = content_div.find_all('p')
            full_text = "\n".join(p.get_text(strip=True) for p in paragraphs)
            return full_text.strip()

        except Exception as e:
            print(f"Error fetching NBC article {article_url}: {e}")
            return ""