diff --git a/src/ingest/ap_rss.py b/src/ingest/ap_rss.py
new file mode 100644
index 0000000..ba19391
--- /dev/null
+++ b/src/ingest/ap_rss.py
@@ -0,0 +1,122 @@
+import feedparser
+import json
+import os
+import re
+import time
+import datetime
+from playwright.sync_api import sync_playwright
+import hashlib
+
+INTERVAL = 3600 # seconds (1 hour)
+HASHES = './data/raw/feed_saved_hashes.json'
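+# The same hash file path is used by the other ingest scripts (BBC, CNN, NPR, NYT), so deduplication applies across sources.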
+
+os.makedirs('./data/raw/ap/', exist_ok=True)
+
+def fetch_feed():
+    # Download and parse a Google News RSS search for apnews.com stories from the last 24 hours
+ return feedparser.parse('https://news.google.com/rss/search?q=when:24h+allinurl:apnews.com&hl=en-US&gl=US&ceid=US:en')
+
+def slugify(text):
+ # Convert title to a filesystem-friendly slug
+ text = text.lower()
+ text = re.sub(r'[^a-z0-9]+', '-', text)
+ return text.strip('-')
+
+def format_date(entry):
+ # Extract and format the published date
+ try:
+ dt = datetime.datetime(*entry.published_parsed[:6])
+ return dt.strftime("%Y-%m-%d")
+    except (AttributeError, TypeError, ValueError):
+        return "unknown-date"
+
+def fetch_full_article(url):
+ try:
+ with sync_playwright() as p:
+ browser = p.chromium.launch(headless=True)
+ page = browser.new_page()
+ page.goto(url, timeout=15000)
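+            # NOTE: entry.link is a Google News URL, so this relies on the redirect landing on the apnews.com article page.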
+
+ # Wait for the main article body to load
+ page.wait_for_selector('div.RichTextStoryBody', timeout=5000)
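+            # NOTE: 'div.RichTextStoryBody' matches AP's current article markup and may need updating if the layout changes.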
+
+ # Extract the text content from the paragraphs inside the body
+            paragraphs = page.query_selector_all('div.RichTextStoryBody p')
+            full_text = "\n".join(para.inner_text() for para in paragraphs)
+
+ browser.close()
+ return full_text.strip()
+
+ except Exception as e:
+ print(f"Playwright error fetching {url}: {e}")
+ return ""
+
+def load_saved_hashes():
+ if os.path.exists(HASHES):
+ with open(HASHES, 'r', encoding='utf-8') as f:
+ return set(json.load(f))
+ return set()
+
+def save_hashes(hashes):
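+    # NOTE: the full hash list is rewritten on every save, so concurrent ingest runs can overwrite each other's updates.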
+ with open(HASHES, 'w', encoding='utf-8') as f:
+ json.dump(list(hashes), f, indent=2)
+
+def generate_entry_hash(entry):
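+    # Hash the title, link, and published date so the same story is only saved once.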
+ hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
+ return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()
+
+def save_entry(entry):
+ saved_hashes = load_saved_hashes()
+ entry_hash = generate_entry_hash(entry)
+
+ if entry_hash in saved_hashes:
+ return False # Already saved
+
+    # Save the entry as a JSON file
+    title_slug = slugify(entry.title)
+    date_str = format_date(entry)
+    filename = f"feed_{date_str}_{title_slug}.json"
+    filepath = os.path.join('./data/raw/ap/', filename)
+
+    # Avoid overwriting (and re-fetching) if the file already exists
+    if os.path.exists(filepath):
+        return False
+
+    full_text = fetch_full_article(entry.link)
+
+ data = {
+ "title": entry.title,
+ "link": entry.link,
+ "published": entry.get("published", ""),
+ "summary": entry.get("summary", ""),
+ "full_text": full_text
+ }
+
+ with open(filepath, 'w', encoding='utf-8') as f:
+ json.dump(data, f, ensure_ascii=False, indent=4)
+
+ saved_hashes.add(entry_hash)
+ save_hashes(saved_hashes)
+
+ return True
+
+def check_and_save_new_entries():
+ feed = fetch_feed()
+ new_count = 0
+
+ for entry in feed.entries:
+ saved = save_entry(entry)
+ if saved:
+ new_count += 1
+
+ print(f"Saved {new_count} new entries.")
+
+if __name__ == '__main__':
+ import sys
+
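+    # --once runs a single ingest pass; otherwise poll the feed every INTERVAL seconds.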
+ if '--once' in sys.argv:
+ check_and_save_new_entries()
+ else:
+ while True:
+ check_and_save_new_entries()
+ time.sleep(INTERVAL)
+
diff --git a/src/ingest/bbc_rss.py b/src/ingest/bbc_rss.py
index cc58320..de36208 100644
--- a/src/ingest/bbc_rss.py
+++ b/src/ingest/bbc_rss.py
@@ -7,8 +7,10 @@
import datetime
import requests
from bs4 import BeautifulSoup
+import hashlib
INTERVAL = 3600 # seconds (1 hour)
+HASHES = './data/raw/feed_saved_hashes.json'
os.makedirs('./data/raw/bbc/', exist_ok=True)
@@ -39,7 +41,27 @@ def fetch_full_article(url):
if article:
return(article.get_text())
+def load_saved_hashes():
+ if os.path.exists(HASHES):
+ with open(HASHES, 'r', encoding='utf-8') as f:
+ return set(json.load(f))
+ return set()
+
+def save_hashes(hashes):
+ with open(HASHES, 'w', encoding='utf-8') as f:
+ json.dump(list(hashes), f, indent=2)
+
+def generate_entry_hash(entry):
+ hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
+ return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()
+
def save_entry(entry):
+ saved_hashes = load_saved_hashes()
+ entry_hash = generate_entry_hash(entry)
+
+ if entry_hash in saved_hashes:
+ return False # Already saved
+
# Save the entry as a JSON file
title_slug = slugify(entry.title)
date_str = format_date(entry)
@@ -47,10 +69,6 @@ def save_entry(entry):
filepath = os.path.join('./data/raw/bbc/', filename)
full_text = fetch_full_article(entry.link)
- # Avoid overwriting if file already exists
- if os.path.exists(filepath):
- return False
-
data = {
"title": entry.title,
"link": entry.link,
@@ -62,6 +80,9 @@ def save_entry(entry):
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
+ saved_hashes.add(entry_hash)
+ save_hashes(saved_hashes)
+
return True
def check_and_save_new_entries():
diff --git a/src/ingest/cnn_rss.py b/src/ingest/cnn_rss.py
new file mode 100644
index 0000000..59762d1
--- /dev/null
+++ b/src/ingest/cnn_rss.py
@@ -0,0 +1,129 @@
+import feedparser
+import json
+import os
+import re
+import time
+import datetime
+import requests
+from bs4 import BeautifulSoup
+import hashlib
+
+INTERVAL = 3600 # seconds (1 hour)
+HASHES = './data/raw/feed_saved_hashes.json'
+
+os.makedirs('./data/raw/cnn/', exist_ok=True)
+
+def fetch_feed():
+    # Download and parse CNN's world-news RSS feed
+ return feedparser.parse('http://rss.cnn.com/rss/cnn_world.rss')
+
+def slugify(text):
+ # Convert title to a filesystem-friendly slug
+ text = text.lower()
+ text = re.sub(r'[^a-z0-9]+', '-', text)
+ return text.strip('-')
+
+def format_date(entry):
+ # Extract and format the published date
+ try:
+ dt = datetime.datetime(*entry.published_parsed[:6])
+ return dt.strftime("%Y-%m-%d")
+    except (AttributeError, TypeError, ValueError):
+        return "unknown-date"
+
+def fetch_full_article(url):
+ try:
+ headers = {
+ "User-Agent": "Mozilla/5.0"
+ }
+ response = requests.get(url, headers=headers, timeout=10)
+ response.raise_for_status()
+
+ soup = BeautifulSoup(response.content, 'html.parser')
+
+        # CNN article content is usually within <section id="body-text"> or <div class="article__content">
+ article_section = soup.find('section', id='body-text') or soup.find('div', class_='article__content')
+
+ if not article_section:
+ print("No CNN article body found.")
+ return ""
+
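+        # Prefer CNN's paragraph wrappers; fall back to plain <p> tags when they are absent.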
+ paragraphs = article_section.find_all('div', class_='paragraph') or article_section.find_all('p')
+
+ full_text = "\n".join(p.get_text(strip=True) for p in paragraphs)
+
+ return full_text.strip()
+
+ except Exception as e:
+ print(f"Error fetching CNN article: {e}")
+ return ""
+
+def load_saved_hashes():
+ if os.path.exists(HASHES):
+ with open(HASHES, 'r', encoding='utf-8') as f:
+ return set(json.load(f))
+ return set()
+
+def save_hashes(hashes):
+ with open(HASHES, 'w', encoding='utf-8') as f:
+ json.dump(list(hashes), f, indent=2)
+
+def generate_entry_hash(entry):
+ hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
+ return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()
+
+def save_entry(entry):
+ saved_hashes = load_saved_hashes()
+ entry_hash = generate_entry_hash(entry)
+
+ if entry_hash in saved_hashes:
+ return False # Already saved
+
+    # Save the entry as a JSON file
+    title_slug = slugify(entry.title)
+    date_str = format_date(entry)
+    filename = f"feed_{date_str}_{title_slug}.json"
+    filepath = os.path.join('./data/raw/cnn/', filename)
+
+    # Avoid overwriting (and re-fetching) if the file already exists
+    if os.path.exists(filepath):
+        return False
+
+    full_text = fetch_full_article(entry.link)
+
+ data = {
+ "title": entry.title,
+ "link": entry.link,
+ "published": entry.get("published", ""),
+ "summary": entry.get("summary", ""),
+ "full_text": full_text
+ }
+
+ with open(filepath, 'w', encoding='utf-8') as f:
+ json.dump(data, f, ensure_ascii=False, indent=4)
+
+ saved_hashes.add(entry_hash)
+ save_hashes(saved_hashes)
+
+ return True
+
+def check_and_save_new_entries():
+ feed = fetch_feed()
+ new_count = 0
+
+ for entry in feed.entries:
+ saved = save_entry(entry)
+ if saved:
+ new_count += 1
+
+ print(f"Saved {new_count} new entries.")
+
+if __name__ == '__main__':
+ import sys
+
+ if '--once' in sys.argv:
+ check_and_save_new_entries()
+ else:
+ while True:
+ check_and_save_new_entries()
+ time.sleep(INTERVAL)
+
diff --git a/src/ingest/npr_rss.py b/src/ingest/npr_rss.py
new file mode 100644
index 0000000..1dd343f
--- /dev/null
+++ b/src/ingest/npr_rss.py
@@ -0,0 +1,127 @@
+import feedparser
+import json
+import os
+import re
+import time
+import datetime
+import requests
+from bs4 import BeautifulSoup
+import hashlib
+
+INTERVAL = 3600 # seconds (1 hour)
+HASHES = './data/raw/feed_saved_hashes.json'
+
+os.makedirs('./data/raw/npr/', exist_ok=True)
+
+def fetch_feed():
+ # Download and parse the feed
+ return feedparser.parse('https://feeds.npr.org/1004/rss.xml')
+
+def slugify(text):
+ # Convert title to a filesystem-friendly slug
+ text = text.lower()
+ text = re.sub(r'[^a-z0-9]+', '-', text)
+ return text.strip('-')
+
+def format_date(entry):
+ # Extract and format the published date
+ try:
+ dt = datetime.datetime(*entry.published_parsed[:6])
+ return dt.strftime("%Y-%m-%d")
+    except (AttributeError, TypeError, ValueError):
+        return "unknown-date"
+
+def fetch_full_article(url):
+ try:
+ headers = {
+ "User-Agent": "Mozilla/5.0"
+ }
+ response = requests.get(url, headers=headers, timeout=10)
+ response.raise_for_status()
+
+ soup = BeautifulSoup(response.content, 'html.parser')
+
+        # NPR article body is typically in <div class="storytext"> or <article> blocks
+ article_body = soup.find('div', class_='storytext') or soup.find('article')
+
+ if not article_body:
+ print("No main article content found.")
+ return ""
+
+ paragraphs = article_body.find_all('p')
+ full_text = "\n".join(p.get_text(strip=True) for p in paragraphs)
+
+ return full_text.strip()
+
+ except Exception as e:
+ print(f"Error fetching NPR article: {e}")
+ return ""
+
+def load_saved_hashes():
+ if os.path.exists(HASHES):
+ with open(HASHES, 'r', encoding='utf-8') as f:
+ return set(json.load(f))
+ return set()
+
+def save_hashes(hashes):
+ with open(HASHES, 'w', encoding='utf-8') as f:
+ json.dump(list(hashes), f, indent=2)
+
+def generate_entry_hash(entry):
+ hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
+ return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()
+
+def save_entry(entry):
+ saved_hashes = load_saved_hashes()
+ entry_hash = generate_entry_hash(entry)
+
+ if entry_hash in saved_hashes:
+ return False # Already saved
+
+    # Save the entry as a JSON file
+    title_slug = slugify(entry.title)
+    date_str = format_date(entry)
+    filename = f"feed_{date_str}_{title_slug}.json"
+    filepath = os.path.join('./data/raw/npr/', filename)
+
+    # Avoid overwriting (and re-fetching) if the file already exists
+    if os.path.exists(filepath):
+        return False
+
+    full_text = fetch_full_article(entry.link)
+
+ data = {
+ "title": entry.title,
+ "link": entry.link,
+ "published": entry.get("published", ""),
+ "summary": entry.get("summary", ""),
+ "full_text": full_text
+ }
+
+    with open(filepath, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+    saved_hashes.add(entry_hash)
+    save_hashes(saved_hashes)
+
+    return True
+
+def check_and_save_new_entries():
+ feed = fetch_feed()
+ new_count = 0
+
+ for entry in feed.entries:
+ saved = save_entry(entry)
+ if saved:
+ new_count += 1
+
+ print(f"Saved {new_count} new entries.")
+
+if __name__ == '__main__':
+ import sys
+
+ if '--once' in sys.argv:
+ check_and_save_new_entries()
+ else:
+ while True:
+ check_and_save_new_entries()
+ time.sleep(INTERVAL)
+
diff --git a/src/ingest/nyt_rss.py b/src/ingest/nyt_rss.py
index e17093a..0484d67 100644
--- a/src/ingest/nyt_rss.py
+++ b/src/ingest/nyt_rss.py
@@ -6,8 +6,10 @@
import datetime
import requests
from bs4 import BeautifulSoup
+import hashlib
INTERVAL = 3600 # seconds (1 hour)
+HASHES = './data/raw/feed_saved_hashes.json'
os.makedirs('./data/raw/nyt/', exist_ok=True)
@@ -50,12 +52,31 @@ def fetch_full_article(url):
print(f"Error fetching full article: {e}")
return ""
+def load_saved_hashes():
+ if os.path.exists(HASHES):
+ with open(HASHES, 'r', encoding='utf-8') as f:
+ return set(json.load(f))
+ return set()
+
+def save_hashes(hashes):
+ with open(HASHES, 'w', encoding='utf-8') as f:
+ json.dump(list(hashes), f, indent=2)
+
+def generate_entry_hash(entry):
+ hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
+    return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()
+
def save_entry(entry):
+ saved_hashes = load_saved_hashes()
+ entry_hash = generate_entry_hash(entry)
+
+ if entry_hash in saved_hashes:
+ return False # Already saved
+
# Save the entry as a JSON file
title_slug = slugify(entry.title)
date_str = format_date(entry)
- filename = f"{date_str}_{title_slug}.json"
+ filename = f"feed_{date_str}_{title_slug}.json"
filepath = os.path.join('./data/raw/nyt/', filename)
full_text = fetch_full_article(entry.link)
@@ -74,6 +95,9 @@ def save_entry(entry):
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
+ saved_hashes.add(entry_hash)
+ save_hashes(saved_hashes)
+
return True
def check_and_save_new_entries():
@@ -96,4 +120,3 @@ def check_and_save_new_entries():
while True:
check_and_save_new_entries()
time.sleep(INTERVAL)
-
\ No newline at end of file