122 changes: 122 additions & 0 deletions src/ingest/ap_rss.py
@@ -0,0 +1,122 @@
import feedparser
import json
import os
import re
import time
import datetime
from playwright.sync_api import sync_playwright
import hashlib

INTERVAL = 3600 # seconds (1 hour)
HASHES = './data/raw/feed_saved_hashes.json'

os.makedirs('./data/raw/ap/', exist_ok=True)

def fetch_feed():
# Download and parse the feed
return feedparser.parse('https://news.google.com/rss/search?q=when:24h+allinurl:apnews.com&hl=en-US&gl=US&ceid=US:en')

def slugify(text):
# Convert title to a filesystem-friendly slug
text = text.lower()
text = re.sub(r'[^a-z0-9]+', '-', text)
return text.strip('-')

def format_date(entry):
# Extract and format the published date
try:
dt = datetime.datetime(*entry.published_parsed[:6])
return dt.strftime("%Y-%m-%d")
    except (TypeError, AttributeError):
        # entry.published_parsed is missing or not a valid time tuple
        return "unknown-date"

def fetch_full_article(url):
try:
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
page = browser.new_page()
page.goto(url, timeout=15000)

# Wait for the main article body to load
page.wait_for_selector('div.RichTextStoryBody', timeout=5000)

# Extract the text content from the paragraphs inside the body
content = page.query_selector_all('div.RichTextStoryBody p')
full_text = "\n".join(p.inner_text() for p in content)

browser.close()
return full_text.strip()

except Exception as e:
print(f"Playwright error fetching {url}: {e}")
return ""

def load_saved_hashes():
if os.path.exists(HASHES):
with open(HASHES, 'r', encoding='utf-8') as f:
return set(json.load(f))
return set()

def save_hashes(hashes):
with open(HASHES, 'w', encoding='utf-8') as f:
json.dump(list(hashes), f, indent=2)

def generate_entry_hash(entry):
hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()

def save_entry(entry):
saved_hashes = load_saved_hashes()
entry_hash = generate_entry_hash(entry)

if entry_hash in saved_hashes:
return False # Already saved

# Save the entry as a JSON file
title_slug = slugify(entry.title)
date_str = format_date(entry)
filename = f"feed_{date_str}_{title_slug}.json"
filepath = os.path.join('./data/raw/ap/', filename)
full_text = fetch_full_article(entry.link)

# Avoid overwriting if file already exists
if os.path.exists(filepath):
return False

data = {
"title": entry.title,
"link": entry.link,
"published": entry.get("published", ""),
"summary": entry.get("summary", ""),
"full_text": full_text
}

with open(filepath, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)

saved_hashes.add(entry_hash)
save_hashes(saved_hashes)

return True

def check_and_save_new_entries():
feed = fetch_feed()
new_count = 0

for entry in feed.entries:
saved = save_entry(entry)
if saved:
new_count += 1

print(f"Saved {new_count} new entries.")

if __name__ == '__main__':
import sys

if '--once' in sys.argv:
check_and_save_new_entries()
else:
while True:
check_and_save_new_entries()
time.sleep(INTERVAL)

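Reviewer note on ap_rss.py: fetch_full_article launches and tears down a Chromium instance for every entry, which is likely the slowest part of each feed pass. Below is a minimal sketch of reusing one browser across all entries of a pass; it assumes the same 'div.RichTextStoryBody' selector as the PR, and the function name fetch_articles and its return shape are illustrative only, not part of this change.

# Sketch: reuse one Playwright browser per feed pass (illustrative, not part of this PR).
from playwright.sync_api import sync_playwright


def fetch_articles(urls):
    texts = {}
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        for url in urls:
            try:
                page.goto(url, timeout=15000)
                # Same selector the PR waits on for the AP story body
                page.wait_for_selector('div.RichTextStoryBody', timeout=5000)
                paragraphs = page.query_selector_all('div.RichTextStoryBody p')
                texts[url] = "\n".join(par.inner_text() for par in paragraphs).strip()
            except Exception as e:
                print(f"Playwright error fetching {url}: {e}")
                texts[url] = ""
        browser.close()
    return texts

With one browser per pass, a feed of ~50 entries pays the Chromium startup cost once instead of 50 times; error handling per URL stays the same as in the PR.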
29 changes: 25 additions & 4 deletions src/ingest/bbc_rss.py
@@ -7,8 +7,10 @@
import datetime
import requests
from bs4 import BeautifulSoup
import hashlib

INTERVAL = 3600 # seconds (1 hour)
HASHES = './data/raw/feed_saved_hashes.json'

os.makedirs('./data/raw/bbc/', exist_ok=True)

@@ -39,18 +41,34 @@ def fetch_full_article(url):
if article:
        return article.get_text()

def load_saved_hashes():
if os.path.exists(HASHES):
with open(HASHES, 'r', encoding='utf-8') as f:
return set(json.load(f))
return set()

def save_hashes(hashes):
with open(HASHES, 'w', encoding='utf-8') as f:
json.dump(list(hashes), f, indent=2)

def generate_entry_hash(entry):
hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()

def save_entry(entry):
saved_hashes = load_saved_hashes()
entry_hash = generate_entry_hash(entry)

if entry_hash in saved_hashes:
return False # Already saved

# Save the entry as a JSON file
title_slug = slugify(entry.title)
date_str = format_date(entry)
filename = f"feed_{date_str}_{title_slug}.json"
filepath = os.path.join('./data/raw/bbc/', filename)
full_text = fetch_full_article(entry.link)

# Avoid overwriting if file already exists
if os.path.exists(filepath):
return False

data = {
"title": entry.title,
"link": entry.link,
@@ -62,6 +80,9 @@ def save_entry(entry):
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)

saved_hashes.add(entry_hash)
save_hashes(saved_hashes)

return True

def check_and_save_new_entries():
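Reviewer note: load_saved_hashes, save_hashes, and generate_entry_hash are duplicated verbatim across ap_rss.py, bbc_rss.py, and cnn_rss.py, and all three point at the same './data/raw/feed_saved_hashes.json'. A minimal sketch of factoring them into a shared helper is below; the module name src/ingest/dedup.py and the per-source hash path are assumptions, not part of this PR.

# Hypothetical shared helper, e.g. src/ingest/dedup.py (a sketch, not part of this PR).
import hashlib
import json
import os


def hash_path(source: str) -> str:
    # One hash file per source ('ap', 'bbc', 'cnn') instead of a single
    # shared './data/raw/feed_saved_hashes.json' -- assumed layout.
    return f'./data/raw/{source}_saved_hashes.json'


def load_saved_hashes(source: str) -> set:
    path = hash_path(source)
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            return set(json.load(f))
    return set()


def save_hashes(source: str, hashes: set) -> None:
    with open(hash_path(source), 'w', encoding='utf-8') as f:
        json.dump(sorted(hashes), f, indent=2)


def generate_entry_hash(entry) -> str:
    # Same fingerprint the PR uses: title + link + published date.
    hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
    return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()

Sharing one hash file across sources does work as written, since the link keeps fingerprints distinct, but per-source files keep the stores independent and make it easier to reset one feed without touching the others.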
129 changes: 129 additions & 0 deletions src/ingest/cnn_rss.py
@@ -0,0 +1,129 @@
import feedparser
import json
import os
import re
import time
import datetime
import requests
from bs4 import BeautifulSoup
import hashlib

INTERVAL = 3600 # seconds (1 hour)
HASHES = './data/raw/feed_saved_hashes.json'

os.makedirs('./data/raw/cnn/', exist_ok=True)

def fetch_feed():
# Download and parse the feed
return feedparser.parse('http://rss.cnn.com/rss/cnn_world.rss')

def slugify(text):
# Convert title to a filesystem-friendly slug
text = text.lower()
text = re.sub(r'[^a-z0-9]+', '-', text)
return text.strip('-')

def format_date(entry):
# Extract and format the published date
try:
dt = datetime.datetime(*entry.published_parsed[:6])
return dt.strftime("%Y-%m-%d")
    except (TypeError, AttributeError):
        # entry.published_parsed is missing or not a valid time tuple
        return "unknown-date"

def fetch_full_article(url):
try:
headers = {
"User-Agent": "Mozilla/5.0"
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

# CNN article content is usually within <div class="article__content"> or <section id="body-text">
article_section = soup.find('section', id='body-text') or soup.find('div', class_='article__content')

if not article_section:
print("No CNN article body found.")
return ""

paragraphs = article_section.find_all('div', class_='paragraph') or article_section.find_all('p')

full_text = "\n".join(p.get_text(strip=True) for p in paragraphs)

return full_text.strip()

except Exception as e:
print(f"Error fetching CNN article: {e}")
return ""

def load_saved_hashes():
if os.path.exists(HASHES):
with open(HASHES, 'r', encoding='utf-8') as f:
return set(json.load(f))
return set()

def save_hashes(hashes):
with open(HASHES, 'w', encoding='utf-8') as f:
json.dump(list(hashes), f, indent=2)

def generate_entry_hash(entry):
hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()

def save_entry(entry):
saved_hashes = load_saved_hashes()
entry_hash = generate_entry_hash(entry)

if entry_hash in saved_hashes:
return False # Already saved

# Save the entry as a JSON file
title_slug = slugify(entry.title)
date_str = format_date(entry)
filename = f"feed_{date_str}_{title_slug}.json"
filepath = os.path.join('./data/raw/cnn/', filename)
full_text = fetch_full_article(entry.link)

# Avoid overwriting if file already exists
if os.path.exists(filepath):
return False

data = {
"title": entry.title,
"link": entry.link,
"published": entry.get("published", ""),
"summary": entry.get("summary", ""),
"full_text": full_text
}

with open(filepath, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)

saved_hashes.add(entry_hash)
save_hashes(saved_hashes)

return True

def check_and_save_new_entries():
feed = fetch_feed()
new_count = 0

for entry in feed.entries:
saved = save_entry(entry)
if saved:
new_count += 1

print(f"Saved {new_count} new entries.")

if __name__ == '__main__':
import sys

if '--once' in sys.argv:
check_and_save_new_entries()
else:
while True:
check_and_save_new_entries()
time.sleep(INTERVAL)

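Reviewer note: the dedup fingerprint used by all three scripts is easy to sanity-check in isolation. The snippet below is purely illustrative; the FakeEntry class is a hypothetical stand-in that mirrors the feedparser fields the hash draws on (title, link, published).

# Illustrative check of the dedup fingerprint (not part of this PR).
import hashlib


class FakeEntry(dict):
    # Minimal stand-in for a feedparser entry: attribute access for
    # title/link, dict-style .get() for optional fields.
    def __init__(self, title, link, published=""):
        super().__init__(title=title, link=link, published=published)
        self.title = title
        self.link = link


def generate_entry_hash(entry):
    hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
    return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()


a = FakeEntry("Example headline", "https://apnews.com/article/example", "2024-01-01")
b = FakeEntry("Example headline", "https://apnews.com/article/example", "2024-01-01")
c = FakeEntry("Example headline", "https://apnews.com/article/example", "2024-01-02")

assert generate_entry_hash(a) == generate_entry_hash(b)  # identical entries collapse to one hash
assert generate_entry_hash(a) != generate_entry_hash(c)  # a changed published date yields a new hash

Because the published date is part of the hash, a republished entry with the same link but a new timestamp is treated as new and saved again; it may be worth confirming that this is the intended behaviour.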