diff --git a/src/ingest/ap_rss.py b/src/ingest/ap_rss.py
new file mode 100644
index 0000000..ba19391
--- /dev/null
+++ b/src/ingest/ap_rss.py
@@ -0,0 +1,122 @@
+import feedparser
+import json
+import os
+import re
+import time
+import datetime
+from playwright.sync_api import sync_playwright
+import hashlib
+
+INTERVAL = 3600 # seconds (1 hour)
+HASHES = './data/raw/feed_saved_hashes.json'
+
+os.makedirs('./data/raw/ap/', exist_ok=True)
+
+def fetch_feed():
+    # Download and parse the feed
+    return feedparser.parse('https://news.google.com/rss/search?q=when:24h+allinurl:apnews.com&hl=en-US&gl=US&ceid=US:en')
+
+def slugify(text):
+    # Convert title to a filesystem-friendly slug
+    text = text.lower()
+    text = re.sub(r'[^a-z0-9]+', '-', text)
+    return text.strip('-')
+
+def format_date(entry):
+    # Extract and format the published date
+    try:
+        dt = datetime.datetime(*entry.published_parsed[:6])
+        return dt.strftime("%Y-%m-%d")
+    except:
+        return "unknown-date"
+
+def fetch_full_article(url):
+    try:
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=True)
+            page = browser.new_page()
+            page.goto(url, timeout=15000)
+
+            # Wait for the main article body to load
+            page.wait_for_selector('div.RichTextStoryBody', timeout=5000)
+
+            # Extract the text content from the paragraphs inside the body
+            content = page.query_selector_all('div.RichTextStoryBody p')
+            full_text = "\n".join(p.inner_text() for p in content)
+
+            browser.close()
+            return full_text.strip()
+
+    except Exception as e:
+        print(f"Playwright error fetching {url}: {e}")
+        return ""
+
+def load_saved_hashes():
+    if os.path.exists(HASHES):
+        with open(HASHES, 'r', encoding='utf-8') as f:
+            return set(json.load(f))
+    return set()
+
+def save_hashes(hashes):
+    with open(HASHES, 'w', encoding='utf-8') as f:
+        json.dump(list(hashes), f, indent=2)
+
+def generate_entry_hash(entry):
+    hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
+    return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()
+
+def save_entry(entry):
+    saved_hashes = load_saved_hashes()
+    entry_hash = generate_entry_hash(entry)
+
+    if entry_hash in saved_hashes:
+        return False # Already saved
+
+    # Save the entry as a JSON file
+    title_slug = slugify(entry.title)
+    date_str = format_date(entry)
+    filename = f"feed_{date_str}_{title_slug}.json"
+    filepath = os.path.join('./data/raw/ap/', filename)
+    full_text = fetch_full_article(entry.link)
+
+    # Avoid overwriting if file already exists
+    if os.path.exists(filepath):
+        return False
+
+    data = {
+        "title": entry.title,
+        "link": entry.link,
+        "published": entry.get("published", ""),
+        "summary": entry.get("summary", ""),
+        "full_text": full_text
+    }
+
+    with open(filepath, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+    saved_hashes.add(entry_hash)
+    save_hashes(saved_hashes)
+
+    return True
+
+def check_and_save_new_entries():
+    feed = fetch_feed()
+    new_count = 0
+
+    for entry in feed.entries:
+        saved = save_entry(entry)
+        if saved:
+            new_count += 1
+
+    print(f"Saved {new_count} new entries.")
+
+if __name__ == '__main__':
+    import sys
+
+    if '--once' in sys.argv:
+        check_and_save_new_entries()
+    else:
+        while True:
+            check_and_save_new_entries()
+            time.sleep(INTERVAL)
+
diff --git a/src/ingest/bbc_rss.py b/src/ingest/bbc_rss.py
index cc58320..de36208 100644
--- a/src/ingest/bbc_rss.py
+++ b/src/ingest/bbc_rss.py
@@ -7,8 +7,10 @@
 import datetime
 import requests
 from bs4 import BeautifulSoup
+import hashlib

 INTERVAL = 3600 # seconds (1 hour)
+HASHES = './data/raw/feed_saved_hashes.json'

 os.makedirs('./data/raw/bbc/', exist_ok=True)

@@ -39,7 +41,27 @@ def fetch_full_article(url):
     if article:
         return(article.get_text())

+def load_saved_hashes():
+    if os.path.exists(HASHES):
+        with open(HASHES, 'r', encoding='utf-8') as f:
+            return set(json.load(f))
+    return set()
+
+def save_hashes(hashes):
+    with open(HASHES, 'w', encoding='utf-8') as f:
+        json.dump(list(hashes), f, indent=2)
+
+def generate_entry_hash(entry):
+    hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
+    return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()
+
 def save_entry(entry):
+    saved_hashes = load_saved_hashes()
+    entry_hash = generate_entry_hash(entry)
+
+    if entry_hash in saved_hashes:
+        return False # Already saved
+
     # Save the entry as a JSON file
     title_slug = slugify(entry.title)
     date_str = format_date(entry)
@@ -47,10 +69,6 @@ def save_entry(entry):
     filepath = os.path.join('./data/raw/bbc/', filename)
     full_text = fetch_full_article(entry.link)

-    # Avoid overwriting if file already exists
-    if os.path.exists(filepath):
-        return False
-
     data = {
         "title": entry.title,
         "link": entry.link,
@@ -62,6 +80,9 @@ def save_entry(entry):
     with open(filepath, 'w', encoding='utf-8') as f:
         json.dump(data, f, ensure_ascii=False, indent=4)

+    saved_hashes.add(entry_hash)
+    save_hashes(saved_hashes)
+
     return True

 def check_and_save_new_entries():
diff --git a/src/ingest/cnn_rss.py b/src/ingest/cnn_rss.py
new file mode 100644
index 0000000..59762d1
--- /dev/null
+++ b/src/ingest/cnn_rss.py
@@ -0,0 +1,129 @@
+import feedparser
+import json
+import os
+import re
+import time
+import datetime
+import requests
+from bs4 import BeautifulSoup
+import hashlib
+
+INTERVAL = 3600 # seconds (1 hour)
+HASHES = './data/raw/feed_saved_hashes.json'
+
+os.makedirs('./data/raw/cnn/', exist_ok=True)
+
+def fetch_feed():
+    # Download and parse the feed
+    return feedparser.parse('http://rss.cnn.com/rss/cnn_world.rss')
+
+def slugify(text):
+    # Convert title to a filesystem-friendly slug
+    text = text.lower()
+    text = re.sub(r'[^a-z0-9]+', '-', text)
+    return text.strip('-')
+
+def format_date(entry):
+    # Extract and format the published date
+    try:
+        dt = datetime.datetime(*entry.published_parsed[:6])
+        return dt.strftime("%Y-%m-%d")
+    except:
+        return "unknown-date"
+
+def fetch_full_article(url):
+    try:
+        headers = {
+            "User-Agent": "Mozilla/5.0"
+        }
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # CNN article content is usually within <section id="body-text"> or <div class="article__content">
+        article_section = soup.find('section', id='body-text') or soup.find('div', class_='article__content')
+
+        if not article_section:
+            print("No CNN article body found.")
+            return ""
+
+        paragraphs = article_section.find_all('div', class_='paragraph') or article_section.find_all('p')
+
+        full_text = "\n".join(p.get_text(strip=True) for p in paragraphs)
+
+        return full_text.strip()
+
+    except Exception as e:
+        print(f"Error fetching CNN article: {e}")
+        return ""
+
+def load_saved_hashes():
+    if os.path.exists(HASHES):
+        with open(HASHES, 'r', encoding='utf-8') as f:
+            return set(json.load(f))
+    return set()
+
+def save_hashes(hashes):
+    with open(HASHES, 'w', encoding='utf-8') as f:
+        json.dump(list(hashes), f, indent=2)
+
+def generate_entry_hash(entry):
+    hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
+    return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()
+
+def save_entry(entry):
+    saved_hashes = load_saved_hashes()
+    entry_hash = generate_entry_hash(entry)
+
+    if entry_hash in saved_hashes:
+        return False # Already saved
+
+    # Save the entry as a JSON file
+    title_slug = slugify(entry.title)
+    date_str = format_date(entry)
+    filename = f"feed_{date_str}_{title_slug}.json"
+    filepath = os.path.join('./data/raw/cnn/', filename)
+    full_text = fetch_full_article(entry.link)
+
+    # Avoid overwriting if file already exists
+    if os.path.exists(filepath):
+        return False
+
+    data = {
+        "title": entry.title,
+        "link": entry.link,
+        "published": entry.get("published", ""),
+        "summary": entry.get("summary", ""),
+        "full_text": full_text
+    }
+
+    with open(filepath, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+    saved_hashes.add(entry_hash)
+    save_hashes(saved_hashes)
+
+    return True
+
+def check_and_save_new_entries():
+    feed = fetch_feed()
+    new_count = 0
+
+    for entry in feed.entries:
+        saved = save_entry(entry)
+        if saved:
+            new_count += 1
+
+    print(f"Saved {new_count} new entries.")
+
+if __name__ == '__main__':
+    import sys
+
+    if '--once' in sys.argv:
+        check_and_save_new_entries()
+    else:
+        while True:
+            check_and_save_new_entries()
+            time.sleep(INTERVAL)
+
diff --git a/src/ingest/npr_rss.py b/src/ingest/npr_rss.py
new file mode 100644
index 0000000..1dd343f
--- /dev/null
+++ b/src/ingest/npr_rss.py
@@ -0,0 +1,127 @@
+
+
+import feedparser
+import json
+import os
+import re
+import time
+import datetime
+import requests
+from bs4 import BeautifulSoup
+import hashlib
+
+INTERVAL = 3600 # seconds (1 hour)
+HASHES = './data/raw/feed_saved_hashes.json'
+
+os.makedirs('./data/raw/npr/', exist_ok=True)
+
+def fetch_feed():
+    # Download and parse the feed
+    return feedparser.parse('https://feeds.npr.org/1004/rss.xml')
+
+def slugify(text):
+    # Convert title to a filesystem-friendly slug
+    text = text.lower()
+    text = re.sub(r'[^a-z0-9]+', '-', text)
+    return text.strip('-')
+
+def format_date(entry):
+    # Extract and format the published date
+    try:
+        dt = datetime.datetime(*entry.published_parsed[:6])
+        return dt.strftime("%Y-%m-%d")
+    except:
+        return "unknown-date"
+
+def fetch_full_article(url):
+    try:
+        headers = {
+            "User-Agent": "Mozilla/5.0"
+        }
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # NPR article body is typically in <div class="storytext"> or <article> blocks
+        article_body = soup.find('div', class_='storytext') or soup.find('article')
+
+        if not article_body:
+            print("No main article content found.")
+            return ""
+
+        paragraphs = article_body.find_all('p')
+        full_text = "\n".join(p.get_text(strip=True) for p in paragraphs)
+
+        return full_text.strip()
+
+    except Exception as e:
+        print(f"Error fetching NPR article: {e}")
+        return ""
+
+def load_saved_hashes():
+    if os.path.exists(HASHES):
+        with open(HASHES, 'r', encoding='utf-8') as f:
+            return set(json.load(f))
+    return set()
+
+def save_hashes(hashes):
+    with open(HASHES, 'w', encoding='utf-8') as f:
+        json.dump(list(hashes), f, indent=2)
+
+def generate_entry_hash(entry):
+    hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
+    return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()
+
+def save_entry(entry):
+    saved_hashes = load_saved_hashes()
+    entry_hash = generate_entry_hash(entry)
+
+    if entry_hash in saved_hashes:
+        return False # Already saved
+
+    # Save the entry as a JSON file
+    title_slug = slugify(entry.title)
+    date_str = format_date(entry)
+    filename = f"feed_{date_str}_{title_slug}.json"
+    filepath = os.path.join('./data/raw/npr/', filename)
+    full_text = fetch_full_article(entry.link)
+
+    # Avoid overwriting if file already exists
+    if os.path.exists(filepath):
+        return False
+
+    data = {
+        "title": entry.title,
+        "link": entry.link,
+        "published": entry.get("published", ""),
+        "summary": entry.get("summary", ""),
+        "full_text": full_text
+    }
+
+    with open(filepath, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+    return True
+
+def check_and_save_new_entries():
+    feed = fetch_feed()
+    new_count = 0
+
+    for entry in feed.entries:
+        saved = save_entry(entry)
+        if saved:
+            new_count += 1
+
+    print(f"Saved {new_count} new entries.")
+
+if __name__ == '__main__':
+    import sys
+
+    if '--once' in sys.argv:
+        check_and_save_new_entries()
+    else:
+        while True:
+            check_and_save_new_entries()
+            time.sleep(INTERVAL)
+
diff --git a/src/ingest/nyt_rss.py b/src/ingest/nyt_rss.py
index e17093a..0484d67 100644
--- a/src/ingest/nyt_rss.py
+++ b/src/ingest/nyt_rss.py
@@ -6,8 +6,10 @@
 import datetime
 import requests
 from bs4 import BeautifulSoup
+import hashlib

 INTERVAL = 3600 # seconds (1 hour)
+HASHES = './data/raw/feed_saved_hashes.json'

 os.makedirs('./data/raw/nyt/', exist_ok=True)

@@ -50,12 +52,31 @@ def fetch_full_article(url):
         print(f"Error fetching full article: {e}")
         return ""

+def load_saved_hashes():
+    if os.path.exists(HASHES):
+        with open(HASHES, 'r', encoding='utf-8') as f:
+            return set(json.load(f))
+    return set()
+
+def save_hashes(hashes):
+    with open(HASHES, 'w', encoding='utf-8') as f:
+        json.dump(list(hashes), f, indent=2)
+
+def generate_entry_hash(entry):
+    hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
+    return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()

 def save_entry(entry):
+    saved_hashes = load_saved_hashes()
+    entry_hash = generate_entry_hash(entry)
+
+    if entry_hash in saved_hashes:
+        return False # Already saved
+
     # Save the entry as a JSON file
     title_slug = slugify(entry.title)
     date_str = format_date(entry)
-    filename = f"{date_str}_{title_slug}.json"
+    filename = f"feed_{date_str}_{title_slug}.json"
     filepath = os.path.join('./data/raw/nyt/', filename)
     full_text = fetch_full_article(entry.link)

@@ -74,6 +95,9 @@ def save_entry(entry):
     with open(filepath, 'w', encoding='utf-8') as f:
         json.dump(data, f, ensure_ascii=False, indent=4)

+    saved_hashes.add(entry_hash)
+    save_hashes(saved_hashes)
+
     return True

 def check_and_save_new_entries():
@@ -96,4 +120,3 @@ def check_and_save_new_entries():
         while True:
             check_and_save_new_entries()
             time.sleep(INTERVAL)
-
\ No newline at end of file
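
Note (reviewer sketch, not part of the diff): all five ingesters now carry identical copies of load_saved_hashes, save_hashes, and generate_entry_hash, and they all read and rewrite the same ./data/raw/feed_saved_hashes.json once per entry. Below is a minimal sketch of how that logic could be pulled into one shared module; the module path (src/ingest/dedupe.py) and function names are hypothetical, and the read-modify-write is still not safe if several ingesters run concurrently.

# Hypothetical shared helper: src/ingest/dedupe.py (not part of this diff)
import hashlib
import json
import os

HASHES = './data/raw/feed_saved_hashes.json'

def entry_hash(entry):
    # Same fingerprint the ingesters already build: title + link + published
    raw = f"{entry.title}{entry.link}{entry.get('published', '')}"
    return hashlib.sha256(raw.encode('utf-8')).hexdigest()

def load_hashes():
    # Set of hashes for entries that have already been written to disk
    if os.path.exists(HASHES):
        with open(HASHES, 'r', encoding='utf-8') as f:
            return set(json.load(f))
    return set()

def mark_saved(hash_value):
    # Read-modify-write of the shared hash file (single-process use only)
    hashes = load_hashes()
    hashes.add(hash_value)
    with open(HASHES, 'w', encoding='utf-8') as f:
        json.dump(list(hashes), f, indent=2)

Each *_rss.py script could then call entry_hash() and load_hashes() before writing and mark_saved() afterwards, instead of defining its own copies of the three helpers.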