27 changes: 27 additions & 0 deletions src/ingest/ap_ingestor.py
@@ -0,0 +1,27 @@
from playwright.sync_api import sync_playwright
from ingest.base_ingestor import BaseIngestor

class APIngestor(BaseIngestor):
    RSS_URL = "https://news.google.com/rss/search?q=when:24h+allinurl:apnews.com&hl=en-US&gl=US&ceid=US:en"

    def fetch_full_text(self, article_url):
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                page = browser.new_page()
                page.goto(article_url, timeout=15000)

                # Wait for the main article body to load
                page.wait_for_selector('div.RichTextStoryBody', timeout=5000)

                # Extract the text content from the paragraphs inside the body
                content = page.query_selector_all('div.RichTextStoryBody p')
                full_text = "\n".join(para.inner_text() for para in content)

                browser.close()
                return full_text.strip()

        except Exception as e:
            print(f"Playwright error fetching {article_url}: {e}")
            return ""

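Note: base_ingestor.py itself is not part of this diff, so its exact interface is not shown here. Judging from the subclasses, each ingestor supplies a RSS_URL class attribute and overrides fetch_full_text(article_url). A minimal sketch of what such a base class might look like follows; it is hypothetical, including the fetch_entries helper name, and is not taken from the actual implementation.

# Hypothetical sketch of src/ingest/base_ingestor.py; the real file is not in this diff.
import feedparser

class BaseIngestor:
    RSS_URL = None  # subclasses set the feed URL

    def fetch_entries(self):
        # Parse the subclass's RSS feed and return its entries (illustrative helper)
        return feedparser.parse(self.RSS_URL).entries

    def fetch_full_text(self, article_url):
        # Subclasses override this with a site-specific scraper
        raise NotImplementedError
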
42 changes: 42 additions & 0 deletions src/ingest/cbs_ingestor.py
@@ -0,0 +1,42 @@
from bs4 import BeautifulSoup
import requests
from ingest.base_ingestor import BaseIngestor

class CBSIngestor(BaseIngestor):
    RSS_URL = "https://www.cbsnews.com/latest/rss/world"

    def fetch_full_text(self, article_url):
        try:
            headers = {"User-Agent": "Mozilla/5.0"}
            resp = requests.get(article_url, headers=headers, timeout=10)
            resp.raise_for_status()

            soup = BeautifulSoup(resp.content, 'html.parser')

            # Step 1: Find the <h1> (article title)
            h1 = soup.find('h1')
            if not h1:
                print(f"[warn] No <h1> tag found in {article_url}")
                return ""

            # Step 2: Walk up parent nodes until we find one with enough <p> tags
            root = h1
            while root and root.name != 'body':
                paragraphs = root.find_all('p')
                if len(paragraphs) >= 5:
                    break
                root = root.parent

            if root is None or root.name == 'body':
                print(f"[warn] Couldn't find content container in {article_url}")
                return ""

            # Step 3: Extract text from heading/paragraph tags under that container
            tags = root.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p'])
            text = "\n".join(tag.get_text(strip=True) for tag in tags)

            return text.strip()

        except Exception as e:
            print(f"[error] {e} while scraping {article_url}")
            return ""
134 changes: 134 additions & 0 deletions src/ingest/cbs_rss.py
@@ -0,0 +1,134 @@
import feedparser
import json
import os
import re
import time
import datetime
import requests
from bs4 import BeautifulSoup
import hashlib

INTERVAL = 3600  # seconds (1 hour)
HASHES = './data/raw/feed_saved_hashes.json'

os.makedirs('./data/raw/cbs/', exist_ok=True)

def fetch_feed():
    # Download and parse the feed
    return feedparser.parse('https://www.cbsnews.com/latest/rss/world')

def slugify(text):
    # Convert title to a filesystem-friendly slug
    text = text.lower()
    text = re.sub(r'[^a-z0-9]+', '-', text)
    return text.strip('-')

def format_date(entry):
    # Extract and format the published date
    try:
        dt = datetime.datetime(*entry.published_parsed[:6])
        return dt.strftime("%Y-%m-%d")
    except Exception:
        return "unknown-date"

def fetch_full_article(url):
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.content, 'html.parser')

        # Step 1: Find the <h1> (article title)
        h1 = soup.find('h1')
        if not h1:
            print(f"[warn] No <h1> tag found in {url}")
            return ""

        # Step 2: Walk up parent nodes until we find one with enough <p> tags
        root = h1
        while root and root.name != 'body':
            paragraphs = root.find_all('p')
            if len(paragraphs) >= 5:
                break
            root = root.parent

        if root is None or root.name == 'body':
            print(f"[warn] Couldn't find content container in {url}")
            return ""

        # Step 3: Extract text from heading/paragraph tags under that container
        tags = root.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p'])
        text = "\n".join(tag.get_text(strip=True) for tag in tags)

        return text.strip()

    except Exception as e:
        print(f"[error] {e} while scraping {url}")
        return ""

def load_saved_hashes():
    if os.path.exists(HASHES):
        with open(HASHES, 'r', encoding='utf-8') as f:
            return set(json.load(f))
    return set()

def save_hashes(hashes):
    with open(HASHES, 'w', encoding='utf-8') as f:
        json.dump(list(hashes), f, indent=2)

def generate_entry_hash(entry):
    hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
    return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()

def save_entry(entry):
    saved_hashes = load_saved_hashes()
    entry_hash = generate_entry_hash(entry)

    if entry_hash in saved_hashes:
        return False  # Already saved

    # Save the entry as a JSON file
    title_slug = slugify(entry.title)
    date_str = format_date(entry)
    filename = f"feed_{date_str}_{title_slug}.json"
    filepath = os.path.join('./data/raw/cbs/', filename)
    full_text = fetch_full_article(entry.link)

    data = {
        "title": entry.title,
        "link": entry.link,
        "published": entry.get("published", ""),
        "summary": entry.get("summary", ""),
        "full_text": full_text
    }

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    saved_hashes.add(entry_hash)
    save_hashes(saved_hashes)

    return True

def check_and_save_new_entries():
    feed = fetch_feed()
    new_count = 0

    for entry in feed.entries:
        saved = save_entry(entry)
        if saved:
            new_count += 1

    print(f"Saved {new_count} new entries.")

if __name__ == '__main__':
    import sys

    if '--once' in sys.argv:
        check_and_save_new_entries()
    else:
        while True:
            check_and_save_new_entries()
            time.sleep(INTERVAL)

30 changes: 30 additions & 0 deletions src/ingest/latimes_ingestor.py
@@ -0,0 +1,30 @@
from bs4 import BeautifulSoup
import requests
from ingest.base_ingestor import BaseIngestor

class LATIMESIngestor(BaseIngestor):
    RSS_URL = "https://www.latimes.com/world/rss2.0.xml"

    def fetch_full_text(self, article_url):
        try:
            headers = {
                "User-Agent": "Mozilla/5.0"
            }
            response = requests.get(article_url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            # LA Times articles usually keep the body under <section name="article-body"> or similar
            article_body = soup.find('section', attrs={'name': 'article-body'})
            if not article_body:
                article_body = soup.find('div', class_='rich-text-article-body')  # fallback

            if not article_body:
                return ""

            paragraphs = article_body.find_all('p')
            full_text = "\n".join(p.get_text(strip=True) for p in paragraphs)
            return full_text.strip()

        except Exception as e:
            print(f"Error fetching LA Times article {article_url}: {e}")
            return ""
123 changes: 123 additions & 0 deletions src/ingest/latimes_rss.py
@@ -0,0 +1,123 @@

import feedparser
import json
import os
import re
import time
import datetime
from bs4 import BeautifulSoup
import requests
import hashlib

INTERVAL = 3600  # seconds (1 hour)
HASHES = './data/raw/feed_saved_hashes.json'

os.makedirs('./data/raw/latimes/', exist_ok=True)

def fetch_feed():
    # Download and parse the feed
    return feedparser.parse('https://www.latimes.com/world/rss2.0.xml')

def slugify(text):
    # Convert title to a filesystem-friendly slug
    text = text.lower()
    text = re.sub(r'[^a-z0-9]+', '-', text)
    return text.strip('-')

def format_date(entry):
    # Extract and format the published date
    try:
        dt = datetime.datetime(*entry.published_parsed[:6])
        return dt.strftime("%Y-%m-%d")
    except Exception:
        return "unknown-date"

def fetch_full_article(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0"
        }
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')

        # LA Times articles usually keep the body under <section name="article-body"> or similar
        article_body = soup.find('section', attrs={'name': 'article-body'})
        if not article_body:
            article_body = soup.find('div', class_='rich-text-article-body')  # fallback

        if not article_body:
            return ""

        paragraphs = article_body.find_all('p')
        full_text = "\n".join(p.get_text(strip=True) for p in paragraphs)
        return full_text.strip()

    except Exception as e:
        print(f"Error fetching LA Times article {url}: {e}")
        return ""

def load_saved_hashes():
    if os.path.exists(HASHES):
        with open(HASHES, 'r', encoding='utf-8') as f:
            return set(json.load(f))
    return set()

def save_hashes(hashes):
    with open(HASHES, 'w', encoding='utf-8') as f:
        json.dump(list(hashes), f, indent=2)

def generate_entry_hash(entry):
    hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
    return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()

def save_entry(entry):
    saved_hashes = load_saved_hashes()
    entry_hash = generate_entry_hash(entry)

    if entry_hash in saved_hashes:
        return False  # Already saved

    # Save the entry as a JSON file
    title_slug = slugify(entry.title)
    date_str = format_date(entry)
    filename = f"feed_{date_str}_{title_slug}.json"
    filepath = os.path.join('./data/raw/latimes/', filename)
    full_text = fetch_full_article(entry.link)

    data = {
        "title": entry.title,
        "link": entry.link,
        "published": entry.get("published", ""),
        "summary": entry.get("summary", ""),
        "full_text": full_text
    }

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    saved_hashes.add(entry_hash)
    save_hashes(saved_hashes)

    return True

def check_and_save_new_entries():
    feed = fetch_feed()
    new_count = 0

    for entry in feed.entries:
        saved = save_entry(entry)
        if saved:
            new_count += 1

    print(f"Saved {new_count} new entries.")

if __name__ == '__main__':
    import sys

    if '--once' in sys.argv:
        check_and_save_new_entries()
    else:
        while True:
            check_and_save_new_entries()
            time.sleep(INTERVAL)

32 changes: 32 additions & 0 deletions src/ingest/nbc_ingestor.py
@@ -0,0 +1,32 @@
from bs4 import BeautifulSoup
import requests
from ingest.base_ingestor import BaseIngestor

class NBCIngestor(BaseIngestor):
    RSS_URL = "http://feeds.nbcnews.com/feeds/worldnews"

    def fetch_full_text(self, article_url):
        try:
            headers = {
                "User-Agent": "Mozilla/5.0"
            }
            response = requests.get(article_url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            # NBC usually puts article content in divs with the class 'article-body__content'
            content_div = soup.find('div', class_='article-body__content')

            if not content_div:
                # Fallback: some older articles use this container
                content_div = soup.find('div', {'data-testid': 'article-body'})

            if not content_div:
                return ""

            paragraphs = content_div.find_all('p')
            full_text = "\n".join(p.get_text(strip=True) for p in paragraphs)
            return full_text.strip()

        except Exception as e:
            print(f"Error fetching NBC article {article_url}: {e}")
            return ""