122 changes: 122 additions & 0 deletions src/ingest/ap_rss.py
@@ -0,0 +1,122 @@
import feedparser
import json
import os
import re
import time
import datetime
from playwright.sync_api import sync_playwright
import hashlib

INTERVAL = 3600 # seconds (1 hour)
HASHES = './data/raw/feed_saved_hashes.json'

os.makedirs('./data/raw/ap/', exist_ok=True)

def fetch_feed():
# Download and parse the feed
return feedparser.parse('https://news.google.com/rss/search?q=when:24h+allinurl:apnews.com&hl=en-US&gl=US&ceid=US:en')

def slugify(text):
# Convert title to a filesystem-friendly slug
text = text.lower()
text = re.sub(r'[^a-z0-9]+', '-', text)
return text.strip('-')

def format_date(entry):
# Extract and format the published date
try:
dt = datetime.datetime(*entry.published_parsed[:6])
return dt.strftime("%Y-%m-%d")
    except (TypeError, AttributeError):
        # entry.published_parsed is missing or not a valid time tuple
        return "unknown-date"

def fetch_full_article(url):
try:
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
page = browser.new_page()
page.goto(url, timeout=15000)

# Wait for the main article body to load
page.wait_for_selector('div.RichTextStoryBody', timeout=5000)

# Extract the text content from the paragraphs inside the body
content = page.query_selector_all('div.RichTextStoryBody p')
full_text = "\n".join(p.inner_text() for p in content)

browser.close()
return full_text.strip()

except Exception as e:
print(f"Playwright error fetching {url}: {e}")
return ""

def load_saved_hashes():
if os.path.exists(HASHES):
with open(HASHES, 'r', encoding='utf-8') as f:
return set(json.load(f))
return set()

def save_hashes(hashes):
with open(HASHES, 'w', encoding='utf-8') as f:
json.dump(list(hashes), f, indent=2)

def generate_entry_hash(entry):
hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()

def save_entry(entry):
saved_hashes = load_saved_hashes()
entry_hash = generate_entry_hash(entry)

if entry_hash in saved_hashes:
return False # Already saved

# Save the entry as a JSON file
title_slug = slugify(entry.title)
date_str = format_date(entry)
filename = f"feed_{date_str}_{title_slug}.json"
filepath = os.path.join('./data/raw/ap/', filename)
full_text = fetch_full_article(entry.link)

# Avoid overwriting if file already exists
if os.path.exists(filepath):
return False

data = {
"title": entry.title,
"link": entry.link,
"published": entry.get("published", ""),
"summary": entry.get("summary", ""),
"full_text": full_text
}

with open(filepath, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)

saved_hashes.add(entry_hash)
save_hashes(saved_hashes)

return True

def check_and_save_new_entries():
feed = fetch_feed()
new_count = 0

for entry in feed.entries:
saved = save_entry(entry)
if saved:
new_count += 1

print(f"Saved {new_count} new entries.")

if __name__ == '__main__':
import sys

if '--once' in sys.argv:
check_and_save_new_entries()
else:
while True:
check_and_save_new_entries()
time.sleep(INTERVAL)

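Reviewer note on ap_rss.py: fetch_full_article launches and tears down a Chromium instance for every entry, which is likely the slowest part of each feed pass. Below is a minimal sketch of reusing one browser across all entries of a pass; it assumes the same 'div.RichTextStoryBody' selector as the PR, and the function name fetch_articles and its return shape are illustrative only, not part of this change.

# Sketch: reuse one Playwright browser per feed pass (illustrative, not part of this PR).
from playwright.sync_api import sync_playwright


def fetch_articles(urls):
    texts = {}
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        for url in urls:
            try:
                page.goto(url, timeout=15000)
                # Same selector the PR waits on for the AP story body
                page.wait_for_selector('div.RichTextStoryBody', timeout=5000)
                paragraphs = page.query_selector_all('div.RichTextStoryBody p')
                texts[url] = "\n".join(par.inner_text() for par in paragraphs).strip()
            except Exception as e:
                print(f"Playwright error fetching {url}: {e}")
                texts[url] = ""
        browser.close()
    return texts

With one browser per pass, a feed of ~50 entries pays the Chromium startup cost once instead of 50 times; error handling per URL stays the same as in the PR.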
29 changes: 25 additions & 4 deletions src/ingest/bbc_rss.py
@@ -7,8 +7,10 @@
import datetime
import requests
from bs4 import BeautifulSoup
import hashlib

INTERVAL = 3600 # seconds (1 hour)
HASHES = './data/raw/feed_saved_hashes.json'

os.makedirs('./data/raw/bbc/', exist_ok=True)

@@ -39,18 +41,34 @@ def fetch_full_article(url):
if article:
        return article.get_text()

def load_saved_hashes():
if os.path.exists(HASHES):
with open(HASHES, 'r', encoding='utf-8') as f:
return set(json.load(f))
return set()

def save_hashes(hashes):
with open(HASHES, 'w', encoding='utf-8') as f:
json.dump(list(hashes), f, indent=2)

def generate_entry_hash(entry):
hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()

def save_entry(entry):
saved_hashes = load_saved_hashes()
entry_hash = generate_entry_hash(entry)

if entry_hash in saved_hashes:
return False # Already saved

# Save the entry as a JSON file
title_slug = slugify(entry.title)
date_str = format_date(entry)
filename = f"feed_{date_str}_{title_slug}.json"
filepath = os.path.join('./data/raw/bbc/', filename)
full_text = fetch_full_article(entry.link)

# Avoid overwriting if file already exists
if os.path.exists(filepath):
return False

data = {
"title": entry.title,
"link": entry.link,
@@ -62,6 +80,9 @@ def save_entry(entry):
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)

saved_hashes.add(entry_hash)
save_hashes(saved_hashes)

return True

def check_and_save_new_entries():
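Reviewer note: load_saved_hashes, save_hashes, and generate_entry_hash are duplicated verbatim across ap_rss.py, bbc_rss.py, and cnn_rss.py, and all three point at the same './data/raw/feed_saved_hashes.json'. A minimal sketch of factoring them into a shared helper is below; the module name src/ingest/dedup.py and the per-source hash path are assumptions, not part of this PR.

# Hypothetical shared helper, e.g. src/ingest/dedup.py (a sketch, not part of this PR).
import hashlib
import json
import os


def hash_path(source: str) -> str:
    # One hash file per source ('ap', 'bbc', 'cnn') instead of a single
    # shared './data/raw/feed_saved_hashes.json' -- assumed layout.
    return f'./data/raw/{source}_saved_hashes.json'


def load_saved_hashes(source: str) -> set:
    path = hash_path(source)
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            return set(json.load(f))
    return set()


def save_hashes(source: str, hashes: set) -> None:
    with open(hash_path(source), 'w', encoding='utf-8') as f:
        json.dump(sorted(hashes), f, indent=2)


def generate_entry_hash(entry) -> str:
    # Same fingerprint the PR uses: title + link + published date.
    hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
    return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()

Sharing one hash file across sources does work as written, since the link keeps fingerprints distinct, but per-source files keep the stores independent and make it easier to reset one feed without touching the others.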
129 changes: 129 additions & 0 deletions src/ingest/cnn_rss.py
@@ -0,0 +1,129 @@
import feedparser
import json
import os
import re
import time
import datetime
import requests
from bs4 import BeautifulSoup
import hashlib

INTERVAL = 3600 # seconds (1 hour)
HASHES = './data/raw/feed_saved_hashes.json'

os.makedirs('./data/raw/cnn/', exist_ok=True)

def fetch_feed():
# Download and parse the feed
return feedparser.parse('http://rss.cnn.com/rss/cnn_world.rss')

def slugify(text):
# Convert title to a filesystem-friendly slug
text = text.lower()
text = re.sub(r'[^a-z0-9]+', '-', text)
return text.strip('-')

def format_date(entry):
# Extract and format the published date
try:
dt = datetime.datetime(*entry.published_parsed[:6])
return dt.strftime("%Y-%m-%d")
    except (TypeError, AttributeError):
        # entry.published_parsed is missing or not a valid time tuple
        return "unknown-date"

def fetch_full_article(url):
try:
headers = {
"User-Agent": "Mozilla/5.0"
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

# CNN article content is usually within <div class="article__content"> or <section id="body-text">
article_section = soup.find('section', id='body-text') or soup.find('div', class_='article__content')

if not article_section:
print("No CNN article body found.")
return ""

paragraphs = article_section.find_all('div', class_='paragraph') or article_section.find_all('p')

full_text = "\n".join(p.get_text(strip=True) for p in paragraphs)

return full_text.strip()

except Exception as e:
print(f"Error fetching CNN article: {e}")
return ""

def load_saved_hashes():
if os.path.exists(HASHES):
with open(HASHES, 'r', encoding='utf-8') as f:
return set(json.load(f))
return set()

def save_hashes(hashes):
with open(HASHES, 'w', encoding='utf-8') as f:
json.dump(list(hashes), f, indent=2)

def generate_entry_hash(entry):
hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()

def save_entry(entry):
saved_hashes = load_saved_hashes()
entry_hash = generate_entry_hash(entry)

if entry_hash in saved_hashes:
return False # Already saved

# Save the entry as a JSON file
title_slug = slugify(entry.title)
date_str = format_date(entry)
filename = f"feed_{date_str}_{title_slug}.json"
filepath = os.path.join('./data/raw/cnn/', filename)
full_text = fetch_full_article(entry.link)

# Avoid overwriting if file already exists
if os.path.exists(filepath):
return False

data = {
"title": entry.title,
"link": entry.link,
"published": entry.get("published", ""),
"summary": entry.get("summary", ""),
"full_text": full_text
}

with open(filepath, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)

saved_hashes.add(entry_hash)
save_hashes(saved_hashes)

return True

def check_and_save_new_entries():
feed = fetch_feed()
new_count = 0

for entry in feed.entries:
saved = save_entry(entry)
if saved:
new_count += 1

print(f"Saved {new_count} new entries.")

if __name__ == '__main__':
import sys

if '--once' in sys.argv:
check_and_save_new_entries()
else:
while True:
check_and_save_new_entries()
time.sleep(INTERVAL)

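Reviewer note: the dedup fingerprint used by all three scripts is easy to sanity-check in isolation. The snippet below is purely illustrative; the FakeEntry class is a hypothetical stand-in that mirrors the feedparser fields the hash draws on (title, link, published).

# Illustrative check of the dedup fingerprint (not part of this PR).
import hashlib


class FakeEntry(dict):
    # Minimal stand-in for a feedparser entry: attribute access for
    # title/link, dict-style .get() for optional fields.
    def __init__(self, title, link, published=""):
        super().__init__(title=title, link=link, published=published)
        self.title = title
        self.link = link


def generate_entry_hash(entry):
    hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
    return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()


a = FakeEntry("Example headline", "https://apnews.com/article/example", "2024-01-01")
b = FakeEntry("Example headline", "https://apnews.com/article/example", "2024-01-01")
c = FakeEntry("Example headline", "https://apnews.com/article/example", "2024-01-02")

assert generate_entry_hash(a) == generate_entry_hash(b)  # identical entries collapse to one hash
assert generate_entry_hash(a) != generate_entry_hash(c)  # a changed published date yields a new hash

Because the published date is part of the hash, a republished entry with the same link but a new timestamp is treated as new and saved again; it may be worth confirming that this is the intended behaviour.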