From 0c86117256487d565241b1c28710a594d670df6c Mon Sep 17 00:00:00 2001 From: Chloe Date: Wed, 25 Jun 2025 13:22:44 -0400 Subject: [PATCH 1/3] more feeds --- src/ingest/ap_rss.py | 97 ++++++++++++++++++++++++++++++++++++++ src/ingest/cnn_rss.py | 104 +++++++++++++++++++++++++++++++++++++++++ src/ingest/npr_rss.py | 105 ++++++++++++++++++++++++++++++++++++++++++ src/ingest/nyt_rss.py | 3 +- 4 files changed, 307 insertions(+), 2 deletions(-) create mode 100644 src/ingest/ap_rss.py create mode 100644 src/ingest/cnn_rss.py create mode 100644 src/ingest/npr_rss.py diff --git a/src/ingest/ap_rss.py b/src/ingest/ap_rss.py new file mode 100644 index 0000000..661b9af --- /dev/null +++ b/src/ingest/ap_rss.py @@ -0,0 +1,97 @@ +import feedparser +import json +import os +import re +import time +import datetime +from playwright.sync_api import sync_playwright + +INTERVAL = 3600 # seconds (1 hour) + +os.makedirs('./data/raw/ap/', exist_ok=True) + +def fetch_feed(): + # Download and parse the feed + return feedparser.parse('https://news.google.com/rss/search?q=when:24h+allinurl:apnews.com&hl=en-US&gl=US&ceid=US:en') + +def slugify(text): + # Convert title to a filesystem-friendly slug + text = text.lower() + text = re.sub(r'[^a-z0-9]+', '-', text) + return text.strip('-') + +def format_date(entry): + # Extract and format the published date + try: + dt = datetime.datetime(*entry.published_parsed[:6]) + return dt.strftime("%Y-%m-%d") + except: + return "unknown-date" + +def fetch_full_article(url): + try: + with sync_playwright() as p: + browser = p.chromium.launch(headless=True) + page = browser.new_page() + page.goto(url, timeout=15000) + + # Wait for the main article body to load + page.wait_for_selector('div.RichTextStoryBody', timeout=5000) + + # Extract the text content from the paragraphs inside the body + content = page.query_selector_all('div.RichTextStoryBody p') + full_text = "\n".join(p.inner_text() for p in content) + + browser.close() + return full_text.strip() + + 
except Exception as e: + print(f"Playwright error fetching {url}: {e}") + return "" + +def save_entry(entry): + # Save the entry as a JSON file + title_slug = slugify(entry.title) + date_str = format_date(entry) + filename = f"feed_{date_str}_{title_slug}.json" + filepath = os.path.join('./data/raw/ap/', filename) + full_text = fetch_full_article(entry.link) + + # Avoid overwriting if file already exists + if os.path.exists(filepath): + return False + + data = { + "title": entry.title, + "link": entry.link, + "published": entry.get("published", ""), + "summary": entry.get("summary", ""), + "full_text": full_text + } + + with open(filepath, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) + + return True + +def check_and_save_new_entries(): + feed = fetch_feed() + new_count = 0 + + for entry in feed.entries: + saved = save_entry(entry) + if saved: + new_count += 1 + + print(f"Saved {new_count} new entries.") + +if __name__ == '__main__': + import sys + + if '--once' in sys.argv: + check_and_save_new_entries() + else: + while True: + check_and_save_new_entries() + time.sleep(INTERVAL) + diff --git a/src/ingest/cnn_rss.py b/src/ingest/cnn_rss.py new file mode 100644 index 0000000..9b02aa0 --- /dev/null +++ b/src/ingest/cnn_rss.py @@ -0,0 +1,104 @@ +import feedparser +import json +import os +import re +import time +import datetime +import requests +from bs4 import BeautifulSoup + +INTERVAL = 3600 # seconds (1 hour) + +os.makedirs('./data/raw/cnn/', exist_ok=True) + +def fetch_feed(): + # Download and parse the feed + return feedparser.parse('http://rss.cnn.com/rss/cnn_world.rss') + +def slugify(text): + # Convert title to a filesystem-friendly slug + text = text.lower() + text = re.sub(r'[^a-z0-9]+', '-', text) + return text.strip('-') + +def format_date(entry): + # Extract and format the published date + try: + dt = datetime.datetime(*entry.published_parsed[:6]) + return dt.strftime("%Y-%m-%d") + except: + return "unknown-date" + +def 
fetch_full_article(url): + try: + headers = { + "User-Agent": "Mozilla/5.0" + } + response = requests.get(url, headers=headers, timeout=10) + response.raise_for_status() + + soup = BeautifulSoup(response.content, 'html.parser') + + # CNN article content is usually within
<section id="body-text"> or <div class="article__content">
+ article_section = soup.find('section', id='body-text') or soup.find('div', class_='article__content') + + if not article_section: + print("No CNN article body found.") + return "" + + paragraphs = article_section.find_all('div', class_='paragraph') or article_section.find_all('p') + + full_text = "\n".join(p.get_text(strip=True) for p in paragraphs) + + return full_text.strip() + + except Exception as e: + print(f"Error fetching CNN article: {e}") + return "" + +def save_entry(entry): + # Save the entry as a JSON file + title_slug = slugify(entry.title) + date_str = format_date(entry) + filename = f"feed_{date_str}_{title_slug}.json" + filepath = os.path.join('./data/raw/cnn/', filename) + full_text = fetch_full_article(entry.link) + + # Avoid overwriting if file already exists + if os.path.exists(filepath): + return False + + data = { + "title": entry.title, + "link": entry.link, + "published": entry.get("published", ""), + "summary": entry.get("summary", ""), + "full_text": full_text + } + + with open(filepath, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) + + return True + +def check_and_save_new_entries(): + feed = fetch_feed() + new_count = 0 + + for entry in feed.entries: + saved = save_entry(entry) + if saved: + new_count += 1 + + print(f"Saved {new_count} new entries.") + +if __name__ == '__main__': + import sys + + if '--once' in sys.argv: + check_and_save_new_entries() + else: + while True: + check_and_save_new_entries() + time.sleep(INTERVAL) + diff --git a/src/ingest/npr_rss.py b/src/ingest/npr_rss.py new file mode 100644 index 0000000..b67da50 --- /dev/null +++ b/src/ingest/npr_rss.py @@ -0,0 +1,105 @@ + + +import feedparser +import json +import os +import re +import time +import datetime +import requests +from bs4 import BeautifulSoup + +INTERVAL = 3600 # seconds (1 hour) + +os.makedirs('./data/raw/npr/', exist_ok=True) + +def fetch_feed(): + # Download and parse the feed + return 
feedparser.parse('https://feeds.npr.org/1004/rss.xml') + +def slugify(text): + # Convert title to a filesystem-friendly slug + text = text.lower() + text = re.sub(r'[^a-z0-9]+', '-', text) + return text.strip('-') + +def format_date(entry): + # Extract and format the published date + try: + dt = datetime.datetime(*entry.published_parsed[:6]) + return dt.strftime("%Y-%m-%d") + except: + return "unknown-date" + +def fetch_full_article(url): + try: + headers = { + "User-Agent": "Mozilla/5.0" + } + response = requests.get(url, headers=headers, timeout=10) + response.raise_for_status() + + soup = BeautifulSoup(response.content, 'html.parser') + + # NPR article body is typically in
<div class="storytext"> or <article>
blocks + article_body = soup.find('div', class_='storytext') or soup.find('article') + + if not article_body: + print("No main article content found.") + return "" + + paragraphs = article_body.find_all('p') + full_text = "\n".join(p.get_text(strip=True) for p in paragraphs) + + return full_text.strip() + + except Exception as e: + print(f"Error fetching NPR article: {e}") + return "" + +def save_entry(entry): + # Save the entry as a JSON file + title_slug = slugify(entry.title) + date_str = format_date(entry) + filename = f"feed_{date_str}_{title_slug}.json" + filepath = os.path.join('./data/raw/npr/', filename) + full_text = fetch_full_article(entry.link) + + # Avoid overwriting if file already exists + if os.path.exists(filepath): + return False + + data = { + "title": entry.title, + "link": entry.link, + "published": entry.get("published", ""), + "summary": entry.get("summary", ""), + "full_text": full_text + } + + with open(filepath, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) + + return True + +def check_and_save_new_entries(): + feed = fetch_feed() + new_count = 0 + + for entry in feed.entries: + saved = save_entry(entry) + if saved: + new_count += 1 + + print(f"Saved {new_count} new entries.") + +if __name__ == '__main__': + import sys + + if '--once' in sys.argv: + check_and_save_new_entries() + else: + while True: + check_and_save_new_entries() + time.sleep(INTERVAL) + diff --git a/src/ingest/nyt_rss.py b/src/ingest/nyt_rss.py index e17093a..cd9de9f 100644 --- a/src/ingest/nyt_rss.py +++ b/src/ingest/nyt_rss.py @@ -55,7 +55,7 @@ def save_entry(entry): # Save the entry as a JSON file title_slug = slugify(entry.title) date_str = format_date(entry) - filename = f"{date_str}_{title_slug}.json" + filename = f"feed_{date_str}_{title_slug}.json" filepath = os.path.join('./data/raw/nyt/', filename) full_text = fetch_full_article(entry.link) @@ -96,4 +96,3 @@ def check_and_save_new_entries(): while True: 
check_and_save_new_entries() time.sleep(INTERVAL) - \ No newline at end of file From 853c259c479aa2f1410ecbf3c8dff91009501fe4 Mon Sep 17 00:00:00 2001 From: Chloe Date: Wed, 25 Jun 2025 15:01:31 -0400 Subject: [PATCH 2/3] tracking with hashes --- data/raw/saved_hashes.json | 234 +++++++++++++++++++++++++++++++++++++ src/ingest/ap_rss.py | 25 ++++ src/ingest/bbc_rss.py | 29 ++++- src/ingest/cnn_rss.py | 25 ++++ src/ingest/npr_rss.py | 22 ++++ src/ingest/nyt_rss.py | 24 ++++ 6 files changed, 355 insertions(+), 4 deletions(-) create mode 100644 data/raw/saved_hashes.json diff --git a/data/raw/saved_hashes.json b/data/raw/saved_hashes.json new file mode 100644 index 0000000..aee63a5 --- /dev/null +++ b/data/raw/saved_hashes.json @@ -0,0 +1,234 @@ +[ + "fbf294207d946c79ab93552e686de28d40ccb5deaee72ab4f235a21c7cad76b2", + "5eac907551c7a5592f129012a5b059bc5d70a091debd5f08694f2795de657cba", + "c63223ae28e40d0b535a5319f0249fe9777d3ff87dc40348e95acb986e6ae6e9", + "1930956de4778ede37b6e1d25fa3984ef572ac5201800e3df4b3c42545eca3b5", + "116433f42ee6a02668b797cbb5c5f543329ca2370e53327b933a2a16455734e9", + "a24a981047bd63d5b89e39ced51c9b1f0153f0d50d605c3d5732ea8929868bfa", + "403d7b0b6ebab4726bd81a036cf514af57f933cf8b9fa78e0225ec990c479bd3", + "550ff585e956b4763e91ff6c9d73c85998cd6f45d3e492825840dc38e1f8b5bf", + "095b5a6f59d52b53623c3bd8ca44f830a7c3a1b8ca4293467876a070d1affa22", + "101e1ca5c5a89871d9c4a8d520ce2da9b0d4c27e8a52155e0a59c7265d1a27b8", + "544f3f228544442a727b7892bff2dbc4da0e913031620fe84fe26098538bc30e", + "91c904112341bf3481b97e147be6d2a0e9f82d5bdd8fbfe28599db97f755222d", + "aa5f1d3dddc46f0248136bbb054207ddc4dc86970d80fd70e4f16d7355381eb1", + "11edccff611dca376574677a40b358912ac399436402d39e3cb66bec96af07a6", + "06f36c673e8f03c8cae1582ddd3cdf64ba542a1e823f9325623a4af61b0df985", + "0672cdb6f3569d7528f5943aa67306d91c8509fe996bbaa7bfdc7265c6f2bc38", + "32c48c8836e1fdac9ced02b67f0442449793ee0817b91cdd898c2637cc60588b", + 
"cabcf8db458851d3b9f9134fd2087d979d1d5d537bf06f2c41177b8e28aedcce", + "347c0ff0b7442bf20d0e4f8018f760b45c7d9f3ae9a74872c6d5047c8193918d", + "5085ced7800e156cc2780390539c41efc8717b07fae4c9c248c3c1d051db6123", + "5ae9851eac255a6315acf4a221078757d90a4d79a2f0ec850c49d3b7300a4fe4", + "631a8667134a6456021a42d1caa2b25b89f0fefc4361537108d75ad9772622e7", + "3ea50d9ebb59e454c27d2b49b7acbad2cf44067aa2c13df87850ec9fca8982dc", + "f7bcfab30a36ad327f8ce9dc3822e41d5f9d5a702ed043ebc8618ba08ffa5eff", + "5637af231ee52c3bced4ccf46e678dabc6b85564f867baf8ff04da084c15c909", + "506a669eca52d69a2f85c2e223f0ee9bd2017c2824517ae7a6ded95e61ff8206", + "5581629471197a5a754cffd0ff7b181efc8eb9520dea563e32bc2e47ac20fed1", + "d5861be197cfbf19cc128c394c5c0fc6282a52cfb0985c3e876a0d7049e5170c", + "3805f6d596587b3b33c8e22eb7b4ea85b2c0838d09429f6648ffad155baf5984", + "0848d1ae87caef87730f1753f3a19d3ca9f73f4559a56e29882519b16e534eb4", + "535078b9347123c2fa1d38d0174bf7f2822d37bbcf3d13241213700d2d08ac0a", + "c1ff34100209b4a6f92d0ddd1914fe68230746cde2389f12143c9d3f59c9a767", + "53ccebbcfdad359fed3f3b569546f9007dbaed0ee51ca0b9b28653612b0babff", + "9901f33b016404c6d1692927866eb8b62423dd28fce1d3f503d696f53d6a6e2c", + "bcc5353969a650358313ed93e31c7674c879273fd8c0b14c8744863b27be1dc7", + "ef9f217867372b80d45972b95238058214153085a54faa2eb588a953d39b892f", + "ae6fe6385e6028ebedd9bdc7603fda3801aa2c032c38d8c88ff8cf1fc16f8e55", + "aa99b80cba0c9e4e4c8228dfdfa56e8e09232f8f719a9e6e4a918e74f5e81cf9", + "4faaeb49c66de82ea75ed77b92c0098be506d483a43e99074ca183a6adf87e70", + "4a05ea364d4de6a50b8affbf854a11d4c22d16445990ec4f06937bddfca502ad", + "aa52ac0ce4f8eeeb0375777946732dfc9fe3b197fc2d41277c07e8f30439b2a5", + "3ee9bf8e5e7987297aa36849fe2c3e641c3db5081bc21a00939522ca052be143", + "5a6c3b25702b452dfbd1adb9610ba5d50d3ec26f83023f84c399ea630425979f", + "11e3738f2bb42ad8ec14ec586c9319fb7ffad64fb7054ce66ddcbf866c41d6c9", + "a662e561124073bbeb99000b3848c977a7cb5aa15d3736a1b746621590dc1b96", + 
"50c983808cd037e50752a1c916fcd1844c87740fb99663798a7eab33beee6370", + "87bf4bf42a97a6806821d7ea35a04c56df6f2df4160a84afd8626678d4c3b079", + "67d5200eb51e9226e85e9eb4d78c8155b627c59e0eb170d7f8ca99cc68f8b193", + "91ec708865a5f9ad938234a0bd93e8cbf52f32460f68fdd57ba8b53890d7a6bc", + "f28d1106938cdab46ee6e4e0ea9474afb6fdea2f22ab252f41020d273d46565d", + "9ef4a49aef2fb09026f04120a6453b04c246d1dfa39da9441ffb4da31ee5e990", + "0322814e355f5cf7236a02aecfdf4feabcabb836605dea4a08ef225802b83278", + "993ab81f7ead34764e0002533479f8426dd4fa2ef85050689d677b409cb0a711", + "0ac6006f97c942d14f8e0385600504a3ee44413fe6a8d6ae6c1a6325e77a9880", + "98f347c27cae3c497543864597046b13957ef98981fb0a53247123e7ebbca280", + "10588e0103bd2abaa52c0c2ae784bc11367afe6c1339c1778aaafa5f6e1040d7", + "79e297decb96e9edc1d90f657d6a5a77109fd493238d7c14ac6e6b713a9589c2", + "5af2c67999f9950d15e204eb1ceb76fc72033e5747036a89de14ba71f1d1ebef", + "aec587954e3687c6d7a11329360851917d6eac6b59562e4adb88b82745da0283", + "f84c63a8c3068b3c1df467f8ee047f2186f1f294752ff323ee4dab4d5550e3fd", + "7e912964d29ee7cd9309bedb76f344b03d486ef84e70f1385a9cffc100ade6ea", + "a43dadaf685b9957c4f84ae301b8429c6e20bb1f3347f01ba7c2e1d977cfded9", + "7f67d5036c803db231e980c9101471a6b072f59d188a4f6ff37cf12125509e0e", + "7ddabdf615e2e7cc9de58a8a1f3380039368250b125da724f6224e95a02c501a", + "322bd30ea92983b330137f772ee69449ca6f9e1efb8b94eb17bd6d0570c51199", + "daf5435245307e0e0da4e92f6b99e092c666f02b85896c019ea79ba5640f0fa0", + "368dc7ed02fa1a0b5ea9755882285c9f60845c0a519fbeacc53a2197929c5b79", + "93fcfbbd74d3921725927ccb31c90e54aac02a50740a332fe2e7ac3e49cef8c0", + "1e80bd955527fbb3774a22d89581939320cbf61425b1e58a8bf0fd30b7ce1dce", + "72b3a2b289104d83d67df08c40df67d8eed6a56e5b702c6d2e167b40d39cf7d2", + "a8b929213e57ebaddb0097bd23a4a88c4818524829d220acd7988b8dd065d5cc", + "7aa15c47af1de82ee933229a9a662fb979faa7cd381b540b9c7c961cd2b6fb9c", + "2c8f0eb2a68139a7a04bdac3137488f6e2fd1d4055fa792aac49054f0e7001d1", + 
"c8dfbe3fbbe10b598369d768840a80cc24f160768e71c7bf7a04511cb67793fa", + "ce02ee070143d9f797e72719e9bb67eb6dbe11c3e82aa6861ba613fe47bd132f", + "01ed3d923d5a6fc43703d11cb07d6004510eab1193ff6ba5a3f90ec93540b37f", + "b991758a0e5c7c3ed0b3989838cc26658b54823c8e8c39877d402bc2244937c7", + "d59153a64f97f7b0a728edeaac76aeeb9d49f2e6fc54a0f0f31617f0a096d7b2", + "beb1a858ff58fb471904bc249d195c2bea101a343281af77c98b72717af9df2d", + "af67f6961f562440cc3e5a35e22172bd1678e3e0a32c3ac3774c93e3499b7725", + "dacb56173ffb86bb7e9567d8e156dd80467995cb1e0e74d7e951b1ed55fb361e", + "abe850e06fa525bc60998b2dfb003578a17075d2489e0ab510783079bdef7c74", + "ff5a02da19c9849f3784a2e9f37f245dedb913cad4dbefdd2c4c481022479f65", + "6d1f773d784b8e6cbd301d91015815b2f72ec48bfd6ae721706287dee87aa574", + "de428ecde2380716c4396a1a337f03d78e8ae4bdf54fe1838d1d66644b972bfd", + "d7c05c3b7e02c4f9bd1afe217af1678c39e945c69e0a6263a19678a3031a852a", + "19bf88505a839614d383d4bf0fd9c4ed66e07d25490f8c3d4604d778e5761165", + "65ddf8744d2202c4f0c76830d7a909d9eec05b880013a08f6143a925bdbf79e8", + "088a192829d46abc6300e6ab169412c7e5f8b09c87c70cfa5fcfe5a851fcba34", + "f7b5e89cea363319e1d3f01ac2a2cf27a60a0cff16405d23d8d2e322242998e7", + "c095de0094bee3ec60b4c5a73384628d0f1c14a4cf2da69efa74d92687000392", + "e17113aea9dea7d2eca2a8a7da20c3d52a3aae63cde4671aa5cf07ca680d6d03", + "97fcff8453f3fc6c889295748b0908e6cc1c8efb140b08760cdf86e498cd21b6", + "076eeb71a397e736337d13823be856b8b0f6e834ef4f3107104d1ddb134f47df", + "e61626bb903a5da8d5a708d47c95d5d66c260d7f1ddd0ebbefb315d3e6c6571c", + "e5414c0d018c8d92d2e60f6c27826c0d55a70bc9641aa8a89dd041feb819af0c", + "3672c59097d6cefa430667fb4cbab52f37a968b2a27716d8173edd1350fe7994", + "2298d88f4428ce91bb288c2302e3808f36a4cac5f3a527069562e9cb3771b22c", + "edc209551eaf9969aabec69d337648b79927305b84bf9cb867ea615d0f7af461", + "0f1e1c431f6e2dfb413c59ff10dfff00b4b0ea7aa11fe8bbf147084d8479d7f9", + "9aa3ba2be2139b55c03889b72e08de93707a54e77874092a63b7ccd231ad190c", + 
"72ea5b0704babbca852bbc0be79b361214d3d904c6d69fdcb9976a9fe392c5dc", + "0ef50bed7f01953e07fc26b0b13151082c25cafe608c82eb77920d21203eab00", + "b99bcb2e86a507ae3cafac8600750693dd16f02b86a7b343d7a84a1522a08202", + "32bd17cd9619e3e52f19c1b2921512c3744f8968db370bb69f0f78320f991733", + "23356c46c7a0112ccff8a25b4999abd37c6f6e415e30d91c928373748c04161d", + "c9d4b505577d0362e27aaf7ac6a801dbfa162da064eb268bf4625f593973493b", + "a8dc51525352f0bfa93b62c27685612131e5e77bd747917cc07ae70c96dd1352", + "289e3e9707e43c1c8c29318e72810521ba5158a1729891a186929c525f22b9d0", + "3550e722b5bb4a03907db2ce22cd3a50c7aa05a922649903b0ce7d0013146a5a", + "8bb05cf6909e948e34453508580fab743793cebeed758809a743dc77e3b46bec", + "97afd0a15c1bdaedb9ae6319cb95b9c4c89f0026e3ea19199a7bcb72f8e34274", + "58d9b17a3bff5f34f4c68ac4e027343f03febb53ed167f7af0b6836661bd7b45", + "361eba61a9141544b2f80703e210a0ffe8824476c963b45a14b91382ff409e86", + "0426387acb64da318d1d1e642c5ffcae04b59bd3f0d648a407f43fd54f962f3d", + "7459e2d423225f0884e5cbab8180ee5b89dd2380470cfd5cf1312c14c1b120bd", + "526e2ddee57354ea79533b7e2435a530d04a6eb38de4a0492bb546edf0370be2", + "cf6c2dcd7816069f63f3c0fe3901038db9cd256f93ef0c99b96885cac6a32c2a", + "33b7e06107a7d3495eccf5900aa9e91940a344fde4a031a2806b934325cd33e2", + "354314baf7cb63ca9159f553914b89461bd7ec7a5f79b0320e052f996651587b", + "06820d488e6bd4b11c4d6e6b9056d307c79fea61bff4d14be5132d37233c2326", + "61939f900a060fe103e40c0a3a6e4890aa715af705d2441dcc1ddaf0bc7bd996", + "e188d42e2aaa69a31070e2345edf1252ccef0ad82c9b914b5780f5d2fb216c12", + "f4e5b3f355ce7463a2676c017e57409b00a77f210b2d26a5410a8ace2521caf1", + "8f257e36324219c85575bfdd79f74f720bc13d5428b43445a1092820829a7623", + "d0832954b41aed400ecd7b4856fc76a5f428189ea1b65a645cedb88ea78ab9eb", + "00f8b31ccb57ac0a4a77b37b9713bcd1f3230ceaa20ccbca4671ed12dbdb3b72", + "4ddc0aa91cc641800bfc2a7152478817cbd1d477c1ca3871857c5ac4d36a01d8", + "e6311ee34a08523732a83f753a0e5768fe0f776973b9a7cc8a7278c21746ec4e", + 
"e070dbb6b8e0d4076dba0237fc5ee477c697c937fd4075c4ecd04f4aaa74e483", + "3e81d97ebd2551ca222899bac1a6eba99bc3cabb0e7143d02d7d3ce7617c3b57", + "457483321a108b1288173f88e69dea1ba37bc2baa4bad79274704921f0bad0cd", + "85eac9e9f9cfec389a7008a62a69795a03db7f7b65f837ba6fab788f3dd57f39", + "8584232355cee897b7776f75994760c3b971183a7794d0624e33fb0d2d0e2d0e", + "092b6e60ea63a6ddf64895ca930a4665b27441fae844efa16c5a15a6c74440c9", + "e8653750f300787b4ba3af22db2988daffc3ce203fb9067d54d2157e23bc8a38", + "5943a46b916091b205253c208161ec0d987f54b883f9b15b39aa9dab43474915", + "84d653e29477b8ede9b4959b26bcdc63ccccf73884f65fc27fbb4a9e115b8ded", + "3794967106edfe5131ff6681c18315b06296f93a5d52f6ef71a1654f2d8248df", + "cfa6322f27aebc43bde9908fb8224c4a83e64b567ee0a493f1e09e2e07c0e891", + "7d211d12f098a0fac3672b800a205d789b4adff916ea39c2eb62b9212e080127", + "9302979097bb97c469487e50c56b1660516350d7e7f6d07a6c64a15404e36b5c", + "fc8d148c9e87fc7674fb13623e111dd010e7a1a559a0753b293e5c087ca6916d", + "2689f818f0d4ffa78b87ea441496a203aa4c87839395beb23d723735d76d52a6", + "4e28cc5aac1b339c6e7a3862e033114143b4ec6b349a49b95f4eb6feb64a4944", + "0ee8d30ca4426fc405bd32108e5e1a489cba4ff6863062030c7ea655edcc158d", + "b6600bb1b58b945997676c52d48853708e3d9a75f797f036d9d1583da5fed30b", + "c99b550e31d5767f9f63fd484bd2b7b33f6f2e67f9b84204e70d0080994a32f0", + "255506bc0c8f4ee4abc4880245b99cd0d1782fb60ba92c197b4cf0bb67df8dde", + "031ed14e4befe7a560c007037dbc64d621e2c02755cfa6646f73a2a6419a8baf", + "94b5e9994f7abeaa6fadae8f85a9f337e1ef503e3d9b03bdc15512899bb2a33e", + "ab33a44fe87e661bf7a1483aa6ef50574ef81c9e1f7768c9acc3d4250adc93fa", + "6986caaee9e941e191d6108cfba8ce14c5a7c8c3fbccb790ea21082a035559cc", + "ee8d7c248f3c912f20549c53213d2892360c6881bd2942e9fba6c910229450f7", + "3a99d2ea91c37908d43ed77367fe050a123d1335668bf83ea080eca836a6ff88", + "d1de77f13fa71b17998e63cccb55dae345b96f56ffcd8fe504fa2ba3bbfdf594", + "2281bb152eb70be33d533b493731e03fc182ad693fea21d030c8d3378b43c5de", + 
"0151e75829cd2a82ce5909bf37f3dd6621752a37a2ae05a5b740e3493251d251", + "ffeee150a723f63ab1602c08b1744bab5a840a6db94c83864c288a9374c21345", + "50c8dca9b18d977cbe9d2b7696f830973c3cf39a8298bc22df4cb4a3bf881559", + "7f7150a8f52b2ebce05c018a42b818730ccca517987b544f6aa3a094d351d15a", + "625bf686604f22af1583885186fe3fd6061c6b037a7faa9ff0f42ee9009852ca", + "a76f3bb1ba2b0c311302f140409f64ee6c40bca2c697fb93d33105081e04c65f", + "09c8092cc9afa49b1aa9b15c2862a6ad9ef1f49d2f8c55c58dbb11c1f6d4f231", + "7af0416110fcfaafd4708a8baecc4de3ca49f67f710976c0576d062e7629631b", + "3131b09f42c1f4b54a5725970e541a93dd3cfde10aa89e3b0512b7ad23d865aa", + "5eaffe16ce3f45fc1c72777b2d4ad371235671e20cb92509cf2b81d3094f86de", + "f9d9bb6553b244c604f8dc5183ec9fec0d926c2ebb92f97411b52451094457a8", + "6ba984ba37ae7a433d79ee3d6694e469adcbb2c842b6bd9a9a2dfe7ccfd5f01d", + "b8d7e678244e0a1a6b514d6afa030c20f2db2266a47c1c0c4832b1dc854e321d", + "096ef03fdcb0bcf913b531e7f83baeda387e54b414e23e6c48b21ae476f64940", + "f37fcabfca8dfb7301a81cc1bf0197818d9ce5385db2f7b8d66d45e0bde146b1", + "e63e8e91836cc5b1b5bbdb049c2ef4989512e45bc70c1a062ce3e505c7378fa8", + "fbb88fb74c221809c4254ab1d8cb5939fa6d5cd6713d2254d4245be1dcb74c83", + "677bad8e2df848826b0a3ff186ba04dc77254c6a5c2e8d147f3a9640963b7eaa", + "9e163c9ebbe666c4edb5d3177b69dff0bec15e0b1424c6b3e655471aaf27bef6", + "83552c79ccdb2dfc6b434e569453c321e0e8e46244217616865a529194752671", + "9a427cf07ace06b3c4cc2de056bf265f22c971b70f4b6d4fa1534a44fa5abbfa", + "a75134573b09c02031344157b0174b5214b749889bb697cd0b3cf4ccd8ba7029", + "ea70959b34cc3965eb0b6c3c1938da8043b67920e2c4c7aa967e317d5f8a5027", + "d85d0e135e4a4a4e12d649318e1b185d38c42f2328ee28477d90c420cd163b5a", + "0be75c22edafe0dbb86fa075708ba0109bcd705d7d18eb49ff049f63668037f6", + "7509f15f33c0ff5766d090a255490ce3152ba24ca85f25a4669ef84b2540fc6e", + "cc5d8d232a420326b5b18afebdde49d1198475cae361ac928d77308c532bd252", + "a017e22938e92d392396d5f81c740dccae452e42b27b5c5c70eeef0a2babdfe4", + 
"d248e70a13a18fbc7299ea50a0a64ff201c5b2171c7f62d377c2f7bf4ef114f0", + "92079a64edc2212d5186879e9ecaaeb8c9da499188dd0075a0903f21ef8eef87", + "cd17320b3e9c11c931e68895403c2d963055a3e4ce9278298eb8bd090c79878b", + "eb9efd37db47717fc8027fcb207115b4d028c263ca3cfb4a07650aabeda378e2", + "aa7f6393a89a12b14584698e8c47703120f0762fb6c4f99fc0f1bad2ac454318", + "e48aba9874d6f9ce2c2fa8b637972ef89b8aafa7ebfbde5ec8d7e3510b6238f5", + "2d8c369d1f490838b534e311e53020663bef7cd6267c4b8f00441710bbe5ba34", + "37e2e153d3a9b9fe9a22e0ada0badbf9cdce409b5b4231d9ee06c508a9866dcb", + "e77c0a9eac3f18a199e2f1f328e4f8a327ca248cb96adead7f3df35c81c328ac", + "9191c0848fd9c965fca5736350d679c31f6caa2a9196eec175eb6672f12e9c7a", + "a060043a21fdccdc9c7750b5d10cc2ab7993cb4a32d1226b28180212f07f30de", + "4d5db4c54ffd81fa497f031469cf722d2669e30fc279dd55cd09e9ad08a87b0a", + "705e7a0a447e20c806c949a120f9d815254dd8f7d7fdfddb855f783c52c2fe34", + "facc99c3ddc9da5be21b4eb2332df6b7dc3b44085a93d586de024b3e5ae18cf4", + "668388bc8c3fbb9f12c3e126ce6a7f3ed6e12648bc50af76de06aaca0a16b425", + "de8a1bfd6328e3502f4b558de82dd40f138aa54ae01d94500f7552f27b31044e", + "8df15b1d0b41d04ec91afa96f615922b0185ed33865e33e14da3c04cf7321583", + "8bd6437a006e89d125b2dc656eaa4bbf9cbd8c5841f7ee013af121d937db9cef", + "e120b94c9cbb8efbc58f8a189e929ff85d61bd44b013002881900a3263785c48", + "97a20716ebd63890fe79a419559689a41d155988a14934c7e727fdaf1ac91af1", + "868099f579d1616f4507a92e6e2677a3f7360a3887215b7a27eb0eadbace6fc5", + "9a5df771c074ff2ecd894cf31ca516a0698d9e23b74cbd601e255b53b3c669e9", + "83d8e1168a2d0bd2fbdf39c4a97848cc05dbab0fc0338bd646a66ce7f9124cc0", + "93aaed9d615c4ef6c340fbf0f1d65c00152cb5d165b5b6d8857ef632fe55e360", + "eca0c03922dfcd653c59f35bab0f6b5518e5ad77098276d0b247152732b783a3", + "60090f3a566d2df9187bb547c03e8b453b356f392b525d9eed8707ebf59575e1", + "215f0981c75342dbe611423b245091ded69f610c923644149221e98270a44fc6", + "ec93a12fe0ba3253bc49dd9626310cf78b824902bdba18b9cae92ca607cf36f8", + 
"3e7fb7b82dac408d807a1a714789ec220c769f2abe490a914291d63ea612606e", + "cb7007d1828a9c6ed0b3af9c86f1bf5286148559c48ed1b09c000f9f00afe707", + "78fe22905b15b3acd627068b7a6c0f1aa7c150af9ea18eb1ef4dac31113e4b6f", + "7e8c752a2b97a8aa19f2e612437c7cb425eb18f73ddf3f8f55f1e7ce42186016", + "153a5f9cce35b336de3f5ef2e9af367fbeda302842241dcd3cf7e7a020cafab1", + "fd3446bfee5795e151406cfab7ce1a788b5e2cb4dd7351771e6342e1aabf83b9", + "42562da1c7a422db4603418a529b0ad2ce13fc0787172b13891e56cec0c0dda9", + "333b35e2d4792942685144ac4bcda16a8adee92ff4044118510e18fe1f804bef", + "4223bc6c358e935b97c237f3830b7a7e6d8dfc0606c2d482afc483ca9f19f133", + "8d3a054254c8c76f3a668d8b904c3396fbc103c8b8d1aed59d0584e11a244756", + "2310e10082d77250eb7a34fc64c6a4a7da1794910fef744c16de706a680c841f", + "0877a3616f6adcb8d5b79089218e449fc987be81ea2e275596b2be9bc8b9a1a4", + "352e44235a3fb04c70b11541e8b9a4bcd7623fd4bc8f0a98de740d2583066b50", + "a931638a4467e3386dd988dae5b13498c6c6479d215a2a13179e30c8a33a22b2", + "f850e3e295e3a0f67841d02eca1672f9ba581b34e33cfee407ecb4d283f3023a", + "8a64085f1df0c729e3a07b46aa50491667a064aa7e9e184840eca14c8375abf0", + "20982688c9f9b96e51017be32bf36e22fa96d5aa0fdf38e4f2c19f8f1094b25f", + "f194bd7a98a669ec0a1e8cd243372d6936a494447c898e40b26e7eae3f0d5143", + "498aa4ef4f007276772ded48952f7d85571efe35a06006c4d62785dcad2f3c5c" +] \ No newline at end of file diff --git a/src/ingest/ap_rss.py b/src/ingest/ap_rss.py index 661b9af..259d4ff 100644 --- a/src/ingest/ap_rss.py +++ b/src/ingest/ap_rss.py @@ -5,8 +5,10 @@ import time import datetime from playwright.sync_api import sync_playwright +import hashlib INTERVAL = 3600 # seconds (1 hour) +HASHES = './data/raw/saved_hashes.json' os.makedirs('./data/raw/ap/', exist_ok=True) @@ -49,7 +51,27 @@ def fetch_full_article(url): print(f"Playwright error fetching {url}: {e}") return "" +def load_saved_hashes(): + if os.path.exists(HASHES): + with open(HASHES, 'r', encoding='utf-8') as f: + return set(json.load(f)) + return set() + +def 
save_hashes(hashes): + with open(HASHES, 'w', encoding='utf-8') as f: + json.dump(list(hashes), f, indent=2) + +def generate_entry_hash(entry): + hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}" + return hashlib.sha256(hash_input.encode('utf-8')).hexdigest() + def save_entry(entry): + saved_hashes = load_saved_hashes() + entry_hash = generate_entry_hash(entry) + + if entry_hash in saved_hashes: + return False # Already saved + # Save the entry as a JSON file title_slug = slugify(entry.title) date_str = format_date(entry) @@ -72,6 +94,9 @@ def save_entry(entry): with open(filepath, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=4) + saved_hashes.add(entry_hash) + save_hashes(saved_hashes) + return True def check_and_save_new_entries(): diff --git a/src/ingest/bbc_rss.py b/src/ingest/bbc_rss.py index cc58320..a7d64f7 100644 --- a/src/ingest/bbc_rss.py +++ b/src/ingest/bbc_rss.py @@ -7,8 +7,10 @@ import datetime import requests from bs4 import BeautifulSoup +import hashlib INTERVAL = 3600 # seconds (1 hour) +HASHES = './data/raw/saved_hashes.json' os.makedirs('./data/raw/bbc/', exist_ok=True) @@ -39,7 +41,27 @@ def fetch_full_article(url): if article: return(article.get_text()) +def load_saved_hashes(): + if os.path.exists(HASHES): + with open(HASHES, 'r', encoding='utf-8') as f: + return set(json.load(f)) + return set() + +def save_hashes(hashes): + with open(HASHES, 'w', encoding='utf-8') as f: + json.dump(list(hashes), f, indent=2) + +def generate_entry_hash(entry): + hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}" + return hashlib.sha256(hash_input.encode('utf-8')).hexdigest() + def save_entry(entry): + saved_hashes = load_saved_hashes() + entry_hash = generate_entry_hash(entry) + + if entry_hash in saved_hashes: + return False # Already saved + # Save the entry as a JSON file title_slug = slugify(entry.title) date_str = format_date(entry) @@ -47,10 +69,6 @@ def save_entry(entry): filepath = 
os.path.join('./data/raw/bbc/', filename) full_text = fetch_full_article(entry.link) - # Avoid overwriting if file already exists - if os.path.exists(filepath): - return False - data = { "title": entry.title, "link": entry.link, @@ -62,6 +80,9 @@ def save_entry(entry): with open(filepath, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=4) + saved_hashes.add(entry_hash) + save_hashes(saved_hashes) + return True def check_and_save_new_entries(): diff --git a/src/ingest/cnn_rss.py b/src/ingest/cnn_rss.py index 9b02aa0..0049bfe 100644 --- a/src/ingest/cnn_rss.py +++ b/src/ingest/cnn_rss.py @@ -6,8 +6,10 @@ import datetime import requests from bs4 import BeautifulSoup +import hashlib INTERVAL = 3600 # seconds (1 hour) +HASHES = './data/raw/saved_hashes.json' os.makedirs('./data/raw/cnn/', exist_ok=True) @@ -55,8 +57,28 @@ def fetch_full_article(url): except Exception as e: print(f"Error fetching CNN article: {e}") return "" + +def load_saved_hashes(): + if os.path.exists(HASHES): + with open(HASHES, 'r', encoding='utf-8') as f: + return set(json.load(f)) + return set() + +def save_hashes(hashes): + with open(HASHES, 'w', encoding='utf-8') as f: + json.dump(list(hashes), f, indent=2) + +def generate_entry_hash(entry): + hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}" + return hashlib.sha256(hash_input.encode('utf-8')).hexdigest() def save_entry(entry): + saved_hashes = load_saved_hashes() + entry_hash = generate_entry_hash(entry) + + if entry_hash in saved_hashes: + return False # Already saved + # Save the entry as a JSON file title_slug = slugify(entry.title) date_str = format_date(entry) @@ -79,6 +101,9 @@ def save_entry(entry): with open(filepath, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=4) + saved_hashes.add(entry_hash) + save_hashes(saved_hashes) + return True def check_and_save_new_entries(): diff --git a/src/ingest/npr_rss.py b/src/ingest/npr_rss.py index b67da50..58f9a7f 100644 
--- a/src/ingest/npr_rss.py +++ b/src/ingest/npr_rss.py @@ -8,8 +8,10 @@ import datetime import requests from bs4 import BeautifulSoup +import hashlib INTERVAL = 3600 # seconds (1 hour) +HASHES = './data/raw/saved_hashes.json' os.makedirs('./data/raw/npr/', exist_ok=True) @@ -57,7 +59,27 @@ def fetch_full_article(url): print(f"Error fetching NPR article: {e}") return "" +def load_saved_hashes(): + if os.path.exists(HASHES): + with open(HASHES, 'r', encoding='utf-8') as f: + return set(json.load(f)) + return set() + +def save_hashes(hashes): + with open(HASHES, 'w', encoding='utf-8') as f: + json.dump(list(hashes), f, indent=2) + +def generate_entry_hash(entry): + hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}" + return hashlib.sha256(hash_input.encode('utf-8')).hexdigest() + def save_entry(entry): + saved_hashes = load_saved_hashes() + entry_hash = generate_entry_hash(entry) + + if entry_hash in saved_hashes: + return False # Already saved + # Save the entry as a JSON file title_slug = slugify(entry.title) date_str = format_date(entry) diff --git a/src/ingest/nyt_rss.py b/src/ingest/nyt_rss.py index cd9de9f..0b0895e 100644 --- a/src/ingest/nyt_rss.py +++ b/src/ingest/nyt_rss.py @@ -6,8 +6,10 @@ import datetime import requests from bs4 import BeautifulSoup +import hashlib INTERVAL = 3600 # seconds (1 hour) +HASHES = './data/raw/saved_hashes.json' os.makedirs('./data/raw/nyt/', exist_ok=True) @@ -50,8 +52,27 @@ def fetch_full_article(url): print(f"Error fetching full article: {e}") return "" +def load_saved_hashes(): + if os.path.exists(HASHES): + with open(HASHES, 'r', encoding='utf-8') as f: + return set(json.load(f)) + return set() + +def save_hashes(hashes): + with open(HASHES, 'w', encoding='utf-8') as f: + json.dump(list(hashes), f, indent=2) + +def generate_entry_hash(entry): + hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}" + return hashlib.sha256(hash_input.encode('utf-8')).hexdigest() def save_entry(entry): + 
saved_hashes = load_saved_hashes() + entry_hash = generate_entry_hash(entry) + + if entry_hash in saved_hashes: + return False # Already saved + # Save the entry as a JSON file title_slug = slugify(entry.title) date_str = format_date(entry) @@ -74,6 +95,9 @@ def save_entry(entry): with open(filepath, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=4) + saved_hashes.add(entry_hash) + save_hashes(saved_hashes) + return True def check_and_save_new_entries(): From b130c4e916fcdf35fb845eef1295ca1b2bb8b140 Mon Sep 17 00:00:00 2001 From: Chloe Date: Thu, 26 Jun 2025 11:24:41 -0400 Subject: [PATCH 3/3] Update to gitignore saved hashes --- data/raw/saved_hashes.json | 234 ------------------------------------- src/ingest/ap_rss.py | 4 +- src/ingest/bbc_rss.py | 2 +- src/ingest/cnn_rss.py | 2 +- src/ingest/npr_rss.py | 2 +- src/ingest/nyt_rss.py | 2 +- 6 files changed, 6 insertions(+), 240 deletions(-) delete mode 100644 data/raw/saved_hashes.json diff --git a/data/raw/saved_hashes.json b/data/raw/saved_hashes.json deleted file mode 100644 index aee63a5..0000000 --- a/data/raw/saved_hashes.json +++ /dev/null @@ -1,234 +0,0 @@ -[ - "fbf294207d946c79ab93552e686de28d40ccb5deaee72ab4f235a21c7cad76b2", - "5eac907551c7a5592f129012a5b059bc5d70a091debd5f08694f2795de657cba", - "c63223ae28e40d0b535a5319f0249fe9777d3ff87dc40348e95acb986e6ae6e9", - "1930956de4778ede37b6e1d25fa3984ef572ac5201800e3df4b3c42545eca3b5", - "116433f42ee6a02668b797cbb5c5f543329ca2370e53327b933a2a16455734e9", - "a24a981047bd63d5b89e39ced51c9b1f0153f0d50d605c3d5732ea8929868bfa", - "403d7b0b6ebab4726bd81a036cf514af57f933cf8b9fa78e0225ec990c479bd3", - "550ff585e956b4763e91ff6c9d73c85998cd6f45d3e492825840dc38e1f8b5bf", - "095b5a6f59d52b53623c3bd8ca44f830a7c3a1b8ca4293467876a070d1affa22", - "101e1ca5c5a89871d9c4a8d520ce2da9b0d4c27e8a52155e0a59c7265d1a27b8", - "544f3f228544442a727b7892bff2dbc4da0e913031620fe84fe26098538bc30e", - 
"91c904112341bf3481b97e147be6d2a0e9f82d5bdd8fbfe28599db97f755222d", - "aa5f1d3dddc46f0248136bbb054207ddc4dc86970d80fd70e4f16d7355381eb1", - "11edccff611dca376574677a40b358912ac399436402d39e3cb66bec96af07a6", - "06f36c673e8f03c8cae1582ddd3cdf64ba542a1e823f9325623a4af61b0df985", - "0672cdb6f3569d7528f5943aa67306d91c8509fe996bbaa7bfdc7265c6f2bc38", - "32c48c8836e1fdac9ced02b67f0442449793ee0817b91cdd898c2637cc60588b", - "cabcf8db458851d3b9f9134fd2087d979d1d5d537bf06f2c41177b8e28aedcce", - "347c0ff0b7442bf20d0e4f8018f760b45c7d9f3ae9a74872c6d5047c8193918d", - "5085ced7800e156cc2780390539c41efc8717b07fae4c9c248c3c1d051db6123", - "5ae9851eac255a6315acf4a221078757d90a4d79a2f0ec850c49d3b7300a4fe4", - "631a8667134a6456021a42d1caa2b25b89f0fefc4361537108d75ad9772622e7", - "3ea50d9ebb59e454c27d2b49b7acbad2cf44067aa2c13df87850ec9fca8982dc", - "f7bcfab30a36ad327f8ce9dc3822e41d5f9d5a702ed043ebc8618ba08ffa5eff", - "5637af231ee52c3bced4ccf46e678dabc6b85564f867baf8ff04da084c15c909", - "506a669eca52d69a2f85c2e223f0ee9bd2017c2824517ae7a6ded95e61ff8206", - "5581629471197a5a754cffd0ff7b181efc8eb9520dea563e32bc2e47ac20fed1", - "d5861be197cfbf19cc128c394c5c0fc6282a52cfb0985c3e876a0d7049e5170c", - "3805f6d596587b3b33c8e22eb7b4ea85b2c0838d09429f6648ffad155baf5984", - "0848d1ae87caef87730f1753f3a19d3ca9f73f4559a56e29882519b16e534eb4", - "535078b9347123c2fa1d38d0174bf7f2822d37bbcf3d13241213700d2d08ac0a", - "c1ff34100209b4a6f92d0ddd1914fe68230746cde2389f12143c9d3f59c9a767", - "53ccebbcfdad359fed3f3b569546f9007dbaed0ee51ca0b9b28653612b0babff", - "9901f33b016404c6d1692927866eb8b62423dd28fce1d3f503d696f53d6a6e2c", - "bcc5353969a650358313ed93e31c7674c879273fd8c0b14c8744863b27be1dc7", - "ef9f217867372b80d45972b95238058214153085a54faa2eb588a953d39b892f", - "ae6fe6385e6028ebedd9bdc7603fda3801aa2c032c38d8c88ff8cf1fc16f8e55", - "aa99b80cba0c9e4e4c8228dfdfa56e8e09232f8f719a9e6e4a918e74f5e81cf9", - "4faaeb49c66de82ea75ed77b92c0098be506d483a43e99074ca183a6adf87e70", - 
"4a05ea364d4de6a50b8affbf854a11d4c22d16445990ec4f06937bddfca502ad", - "aa52ac0ce4f8eeeb0375777946732dfc9fe3b197fc2d41277c07e8f30439b2a5", - "3ee9bf8e5e7987297aa36849fe2c3e641c3db5081bc21a00939522ca052be143", - "5a6c3b25702b452dfbd1adb9610ba5d50d3ec26f83023f84c399ea630425979f", - "11e3738f2bb42ad8ec14ec586c9319fb7ffad64fb7054ce66ddcbf866c41d6c9", - "a662e561124073bbeb99000b3848c977a7cb5aa15d3736a1b746621590dc1b96", - "50c983808cd037e50752a1c916fcd1844c87740fb99663798a7eab33beee6370", - "87bf4bf42a97a6806821d7ea35a04c56df6f2df4160a84afd8626678d4c3b079", - "67d5200eb51e9226e85e9eb4d78c8155b627c59e0eb170d7f8ca99cc68f8b193", - "91ec708865a5f9ad938234a0bd93e8cbf52f32460f68fdd57ba8b53890d7a6bc", - "f28d1106938cdab46ee6e4e0ea9474afb6fdea2f22ab252f41020d273d46565d", - "9ef4a49aef2fb09026f04120a6453b04c246d1dfa39da9441ffb4da31ee5e990", - "0322814e355f5cf7236a02aecfdf4feabcabb836605dea4a08ef225802b83278", - "993ab81f7ead34764e0002533479f8426dd4fa2ef85050689d677b409cb0a711", - "0ac6006f97c942d14f8e0385600504a3ee44413fe6a8d6ae6c1a6325e77a9880", - "98f347c27cae3c497543864597046b13957ef98981fb0a53247123e7ebbca280", - "10588e0103bd2abaa52c0c2ae784bc11367afe6c1339c1778aaafa5f6e1040d7", - "79e297decb96e9edc1d90f657d6a5a77109fd493238d7c14ac6e6b713a9589c2", - "5af2c67999f9950d15e204eb1ceb76fc72033e5747036a89de14ba71f1d1ebef", - "aec587954e3687c6d7a11329360851917d6eac6b59562e4adb88b82745da0283", - "f84c63a8c3068b3c1df467f8ee047f2186f1f294752ff323ee4dab4d5550e3fd", - "7e912964d29ee7cd9309bedb76f344b03d486ef84e70f1385a9cffc100ade6ea", - "a43dadaf685b9957c4f84ae301b8429c6e20bb1f3347f01ba7c2e1d977cfded9", - "7f67d5036c803db231e980c9101471a6b072f59d188a4f6ff37cf12125509e0e", - "7ddabdf615e2e7cc9de58a8a1f3380039368250b125da724f6224e95a02c501a", - "322bd30ea92983b330137f772ee69449ca6f9e1efb8b94eb17bd6d0570c51199", - "daf5435245307e0e0da4e92f6b99e092c666f02b85896c019ea79ba5640f0fa0", - "368dc7ed02fa1a0b5ea9755882285c9f60845c0a519fbeacc53a2197929c5b79", - 
"93fcfbbd74d3921725927ccb31c90e54aac02a50740a332fe2e7ac3e49cef8c0", - "1e80bd955527fbb3774a22d89581939320cbf61425b1e58a8bf0fd30b7ce1dce", - "72b3a2b289104d83d67df08c40df67d8eed6a56e5b702c6d2e167b40d39cf7d2", - "a8b929213e57ebaddb0097bd23a4a88c4818524829d220acd7988b8dd065d5cc", - "7aa15c47af1de82ee933229a9a662fb979faa7cd381b540b9c7c961cd2b6fb9c", - "2c8f0eb2a68139a7a04bdac3137488f6e2fd1d4055fa792aac49054f0e7001d1", - "c8dfbe3fbbe10b598369d768840a80cc24f160768e71c7bf7a04511cb67793fa", - "ce02ee070143d9f797e72719e9bb67eb6dbe11c3e82aa6861ba613fe47bd132f", - "01ed3d923d5a6fc43703d11cb07d6004510eab1193ff6ba5a3f90ec93540b37f", - "b991758a0e5c7c3ed0b3989838cc26658b54823c8e8c39877d402bc2244937c7", - "d59153a64f97f7b0a728edeaac76aeeb9d49f2e6fc54a0f0f31617f0a096d7b2", - "beb1a858ff58fb471904bc249d195c2bea101a343281af77c98b72717af9df2d", - "af67f6961f562440cc3e5a35e22172bd1678e3e0a32c3ac3774c93e3499b7725", - "dacb56173ffb86bb7e9567d8e156dd80467995cb1e0e74d7e951b1ed55fb361e", - "abe850e06fa525bc60998b2dfb003578a17075d2489e0ab510783079bdef7c74", - "ff5a02da19c9849f3784a2e9f37f245dedb913cad4dbefdd2c4c481022479f65", - "6d1f773d784b8e6cbd301d91015815b2f72ec48bfd6ae721706287dee87aa574", - "de428ecde2380716c4396a1a337f03d78e8ae4bdf54fe1838d1d66644b972bfd", - "d7c05c3b7e02c4f9bd1afe217af1678c39e945c69e0a6263a19678a3031a852a", - "19bf88505a839614d383d4bf0fd9c4ed66e07d25490f8c3d4604d778e5761165", - "65ddf8744d2202c4f0c76830d7a909d9eec05b880013a08f6143a925bdbf79e8", - "088a192829d46abc6300e6ab169412c7e5f8b09c87c70cfa5fcfe5a851fcba34", - "f7b5e89cea363319e1d3f01ac2a2cf27a60a0cff16405d23d8d2e322242998e7", - "c095de0094bee3ec60b4c5a73384628d0f1c14a4cf2da69efa74d92687000392", - "e17113aea9dea7d2eca2a8a7da20c3d52a3aae63cde4671aa5cf07ca680d6d03", - "97fcff8453f3fc6c889295748b0908e6cc1c8efb140b08760cdf86e498cd21b6", - "076eeb71a397e736337d13823be856b8b0f6e834ef4f3107104d1ddb134f47df", - "e61626bb903a5da8d5a708d47c95d5d66c260d7f1ddd0ebbefb315d3e6c6571c", - 
"e5414c0d018c8d92d2e60f6c27826c0d55a70bc9641aa8a89dd041feb819af0c", - "3672c59097d6cefa430667fb4cbab52f37a968b2a27716d8173edd1350fe7994", - "2298d88f4428ce91bb288c2302e3808f36a4cac5f3a527069562e9cb3771b22c", - "edc209551eaf9969aabec69d337648b79927305b84bf9cb867ea615d0f7af461", - "0f1e1c431f6e2dfb413c59ff10dfff00b4b0ea7aa11fe8bbf147084d8479d7f9", - "9aa3ba2be2139b55c03889b72e08de93707a54e77874092a63b7ccd231ad190c", - "72ea5b0704babbca852bbc0be79b361214d3d904c6d69fdcb9976a9fe392c5dc", - "0ef50bed7f01953e07fc26b0b13151082c25cafe608c82eb77920d21203eab00", - "b99bcb2e86a507ae3cafac8600750693dd16f02b86a7b343d7a84a1522a08202", - "32bd17cd9619e3e52f19c1b2921512c3744f8968db370bb69f0f78320f991733", - "23356c46c7a0112ccff8a25b4999abd37c6f6e415e30d91c928373748c04161d", - "c9d4b505577d0362e27aaf7ac6a801dbfa162da064eb268bf4625f593973493b", - "a8dc51525352f0bfa93b62c27685612131e5e77bd747917cc07ae70c96dd1352", - "289e3e9707e43c1c8c29318e72810521ba5158a1729891a186929c525f22b9d0", - "3550e722b5bb4a03907db2ce22cd3a50c7aa05a922649903b0ce7d0013146a5a", - "8bb05cf6909e948e34453508580fab743793cebeed758809a743dc77e3b46bec", - "97afd0a15c1bdaedb9ae6319cb95b9c4c89f0026e3ea19199a7bcb72f8e34274", - "58d9b17a3bff5f34f4c68ac4e027343f03febb53ed167f7af0b6836661bd7b45", - "361eba61a9141544b2f80703e210a0ffe8824476c963b45a14b91382ff409e86", - "0426387acb64da318d1d1e642c5ffcae04b59bd3f0d648a407f43fd54f962f3d", - "7459e2d423225f0884e5cbab8180ee5b89dd2380470cfd5cf1312c14c1b120bd", - "526e2ddee57354ea79533b7e2435a530d04a6eb38de4a0492bb546edf0370be2", - "cf6c2dcd7816069f63f3c0fe3901038db9cd256f93ef0c99b96885cac6a32c2a", - "33b7e06107a7d3495eccf5900aa9e91940a344fde4a031a2806b934325cd33e2", - "354314baf7cb63ca9159f553914b89461bd7ec7a5f79b0320e052f996651587b", - "06820d488e6bd4b11c4d6e6b9056d307c79fea61bff4d14be5132d37233c2326", - "61939f900a060fe103e40c0a3a6e4890aa715af705d2441dcc1ddaf0bc7bd996", - "e188d42e2aaa69a31070e2345edf1252ccef0ad82c9b914b5780f5d2fb216c12", - 
"f4e5b3f355ce7463a2676c017e57409b00a77f210b2d26a5410a8ace2521caf1", - "8f257e36324219c85575bfdd79f74f720bc13d5428b43445a1092820829a7623", - "d0832954b41aed400ecd7b4856fc76a5f428189ea1b65a645cedb88ea78ab9eb", - "00f8b31ccb57ac0a4a77b37b9713bcd1f3230ceaa20ccbca4671ed12dbdb3b72", - "4ddc0aa91cc641800bfc2a7152478817cbd1d477c1ca3871857c5ac4d36a01d8", - "e6311ee34a08523732a83f753a0e5768fe0f776973b9a7cc8a7278c21746ec4e", - "e070dbb6b8e0d4076dba0237fc5ee477c697c937fd4075c4ecd04f4aaa74e483", - "3e81d97ebd2551ca222899bac1a6eba99bc3cabb0e7143d02d7d3ce7617c3b57", - "457483321a108b1288173f88e69dea1ba37bc2baa4bad79274704921f0bad0cd", - "85eac9e9f9cfec389a7008a62a69795a03db7f7b65f837ba6fab788f3dd57f39", - "8584232355cee897b7776f75994760c3b971183a7794d0624e33fb0d2d0e2d0e", - "092b6e60ea63a6ddf64895ca930a4665b27441fae844efa16c5a15a6c74440c9", - "e8653750f300787b4ba3af22db2988daffc3ce203fb9067d54d2157e23bc8a38", - "5943a46b916091b205253c208161ec0d987f54b883f9b15b39aa9dab43474915", - "84d653e29477b8ede9b4959b26bcdc63ccccf73884f65fc27fbb4a9e115b8ded", - "3794967106edfe5131ff6681c18315b06296f93a5d52f6ef71a1654f2d8248df", - "cfa6322f27aebc43bde9908fb8224c4a83e64b567ee0a493f1e09e2e07c0e891", - "7d211d12f098a0fac3672b800a205d789b4adff916ea39c2eb62b9212e080127", - "9302979097bb97c469487e50c56b1660516350d7e7f6d07a6c64a15404e36b5c", - "fc8d148c9e87fc7674fb13623e111dd010e7a1a559a0753b293e5c087ca6916d", - "2689f818f0d4ffa78b87ea441496a203aa4c87839395beb23d723735d76d52a6", - "4e28cc5aac1b339c6e7a3862e033114143b4ec6b349a49b95f4eb6feb64a4944", - "0ee8d30ca4426fc405bd32108e5e1a489cba4ff6863062030c7ea655edcc158d", - "b6600bb1b58b945997676c52d48853708e3d9a75f797f036d9d1583da5fed30b", - "c99b550e31d5767f9f63fd484bd2b7b33f6f2e67f9b84204e70d0080994a32f0", - "255506bc0c8f4ee4abc4880245b99cd0d1782fb60ba92c197b4cf0bb67df8dde", - "031ed14e4befe7a560c007037dbc64d621e2c02755cfa6646f73a2a6419a8baf", - "94b5e9994f7abeaa6fadae8f85a9f337e1ef503e3d9b03bdc15512899bb2a33e", - 
"ab33a44fe87e661bf7a1483aa6ef50574ef81c9e1f7768c9acc3d4250adc93fa", - "6986caaee9e941e191d6108cfba8ce14c5a7c8c3fbccb790ea21082a035559cc", - "ee8d7c248f3c912f20549c53213d2892360c6881bd2942e9fba6c910229450f7", - "3a99d2ea91c37908d43ed77367fe050a123d1335668bf83ea080eca836a6ff88", - "d1de77f13fa71b17998e63cccb55dae345b96f56ffcd8fe504fa2ba3bbfdf594", - "2281bb152eb70be33d533b493731e03fc182ad693fea21d030c8d3378b43c5de", - "0151e75829cd2a82ce5909bf37f3dd6621752a37a2ae05a5b740e3493251d251", - "ffeee150a723f63ab1602c08b1744bab5a840a6db94c83864c288a9374c21345", - "50c8dca9b18d977cbe9d2b7696f830973c3cf39a8298bc22df4cb4a3bf881559", - "7f7150a8f52b2ebce05c018a42b818730ccca517987b544f6aa3a094d351d15a", - "625bf686604f22af1583885186fe3fd6061c6b037a7faa9ff0f42ee9009852ca", - "a76f3bb1ba2b0c311302f140409f64ee6c40bca2c697fb93d33105081e04c65f", - "09c8092cc9afa49b1aa9b15c2862a6ad9ef1f49d2f8c55c58dbb11c1f6d4f231", - "7af0416110fcfaafd4708a8baecc4de3ca49f67f710976c0576d062e7629631b", - "3131b09f42c1f4b54a5725970e541a93dd3cfde10aa89e3b0512b7ad23d865aa", - "5eaffe16ce3f45fc1c72777b2d4ad371235671e20cb92509cf2b81d3094f86de", - "f9d9bb6553b244c604f8dc5183ec9fec0d926c2ebb92f97411b52451094457a8", - "6ba984ba37ae7a433d79ee3d6694e469adcbb2c842b6bd9a9a2dfe7ccfd5f01d", - "b8d7e678244e0a1a6b514d6afa030c20f2db2266a47c1c0c4832b1dc854e321d", - "096ef03fdcb0bcf913b531e7f83baeda387e54b414e23e6c48b21ae476f64940", - "f37fcabfca8dfb7301a81cc1bf0197818d9ce5385db2f7b8d66d45e0bde146b1", - "e63e8e91836cc5b1b5bbdb049c2ef4989512e45bc70c1a062ce3e505c7378fa8", - "fbb88fb74c221809c4254ab1d8cb5939fa6d5cd6713d2254d4245be1dcb74c83", - "677bad8e2df848826b0a3ff186ba04dc77254c6a5c2e8d147f3a9640963b7eaa", - "9e163c9ebbe666c4edb5d3177b69dff0bec15e0b1424c6b3e655471aaf27bef6", - "83552c79ccdb2dfc6b434e569453c321e0e8e46244217616865a529194752671", - "9a427cf07ace06b3c4cc2de056bf265f22c971b70f4b6d4fa1534a44fa5abbfa", - "a75134573b09c02031344157b0174b5214b749889bb697cd0b3cf4ccd8ba7029", - 
"ea70959b34cc3965eb0b6c3c1938da8043b67920e2c4c7aa967e317d5f8a5027", - "d85d0e135e4a4a4e12d649318e1b185d38c42f2328ee28477d90c420cd163b5a", - "0be75c22edafe0dbb86fa075708ba0109bcd705d7d18eb49ff049f63668037f6", - "7509f15f33c0ff5766d090a255490ce3152ba24ca85f25a4669ef84b2540fc6e", - "cc5d8d232a420326b5b18afebdde49d1198475cae361ac928d77308c532bd252", - "a017e22938e92d392396d5f81c740dccae452e42b27b5c5c70eeef0a2babdfe4", - "d248e70a13a18fbc7299ea50a0a64ff201c5b2171c7f62d377c2f7bf4ef114f0", - "92079a64edc2212d5186879e9ecaaeb8c9da499188dd0075a0903f21ef8eef87", - "cd17320b3e9c11c931e68895403c2d963055a3e4ce9278298eb8bd090c79878b", - "eb9efd37db47717fc8027fcb207115b4d028c263ca3cfb4a07650aabeda378e2", - "aa7f6393a89a12b14584698e8c47703120f0762fb6c4f99fc0f1bad2ac454318", - "e48aba9874d6f9ce2c2fa8b637972ef89b8aafa7ebfbde5ec8d7e3510b6238f5", - "2d8c369d1f490838b534e311e53020663bef7cd6267c4b8f00441710bbe5ba34", - "37e2e153d3a9b9fe9a22e0ada0badbf9cdce409b5b4231d9ee06c508a9866dcb", - "e77c0a9eac3f18a199e2f1f328e4f8a327ca248cb96adead7f3df35c81c328ac", - "9191c0848fd9c965fca5736350d679c31f6caa2a9196eec175eb6672f12e9c7a", - "a060043a21fdccdc9c7750b5d10cc2ab7993cb4a32d1226b28180212f07f30de", - "4d5db4c54ffd81fa497f031469cf722d2669e30fc279dd55cd09e9ad08a87b0a", - "705e7a0a447e20c806c949a120f9d815254dd8f7d7fdfddb855f783c52c2fe34", - "facc99c3ddc9da5be21b4eb2332df6b7dc3b44085a93d586de024b3e5ae18cf4", - "668388bc8c3fbb9f12c3e126ce6a7f3ed6e12648bc50af76de06aaca0a16b425", - "de8a1bfd6328e3502f4b558de82dd40f138aa54ae01d94500f7552f27b31044e", - "8df15b1d0b41d04ec91afa96f615922b0185ed33865e33e14da3c04cf7321583", - "8bd6437a006e89d125b2dc656eaa4bbf9cbd8c5841f7ee013af121d937db9cef", - "e120b94c9cbb8efbc58f8a189e929ff85d61bd44b013002881900a3263785c48", - "97a20716ebd63890fe79a419559689a41d155988a14934c7e727fdaf1ac91af1", - "868099f579d1616f4507a92e6e2677a3f7360a3887215b7a27eb0eadbace6fc5", - "9a5df771c074ff2ecd894cf31ca516a0698d9e23b74cbd601e255b53b3c669e9", - 
"83d8e1168a2d0bd2fbdf39c4a97848cc05dbab0fc0338bd646a66ce7f9124cc0", - "93aaed9d615c4ef6c340fbf0f1d65c00152cb5d165b5b6d8857ef632fe55e360", - "eca0c03922dfcd653c59f35bab0f6b5518e5ad77098276d0b247152732b783a3", - "60090f3a566d2df9187bb547c03e8b453b356f392b525d9eed8707ebf59575e1", - "215f0981c75342dbe611423b245091ded69f610c923644149221e98270a44fc6", - "ec93a12fe0ba3253bc49dd9626310cf78b824902bdba18b9cae92ca607cf36f8", - "3e7fb7b82dac408d807a1a714789ec220c769f2abe490a914291d63ea612606e", - "cb7007d1828a9c6ed0b3af9c86f1bf5286148559c48ed1b09c000f9f00afe707", - "78fe22905b15b3acd627068b7a6c0f1aa7c150af9ea18eb1ef4dac31113e4b6f", - "7e8c752a2b97a8aa19f2e612437c7cb425eb18f73ddf3f8f55f1e7ce42186016", - "153a5f9cce35b336de3f5ef2e9af367fbeda302842241dcd3cf7e7a020cafab1", - "fd3446bfee5795e151406cfab7ce1a788b5e2cb4dd7351771e6342e1aabf83b9", - "42562da1c7a422db4603418a529b0ad2ce13fc0787172b13891e56cec0c0dda9", - "333b35e2d4792942685144ac4bcda16a8adee92ff4044118510e18fe1f804bef", - "4223bc6c358e935b97c237f3830b7a7e6d8dfc0606c2d482afc483ca9f19f133", - "8d3a054254c8c76f3a668d8b904c3396fbc103c8b8d1aed59d0584e11a244756", - "2310e10082d77250eb7a34fc64c6a4a7da1794910fef744c16de706a680c841f", - "0877a3616f6adcb8d5b79089218e449fc987be81ea2e275596b2be9bc8b9a1a4", - "352e44235a3fb04c70b11541e8b9a4bcd7623fd4bc8f0a98de740d2583066b50", - "a931638a4467e3386dd988dae5b13498c6c6479d215a2a13179e30c8a33a22b2", - "f850e3e295e3a0f67841d02eca1672f9ba581b34e33cfee407ecb4d283f3023a", - "8a64085f1df0c729e3a07b46aa50491667a064aa7e9e184840eca14c8375abf0", - "20982688c9f9b96e51017be32bf36e22fa96d5aa0fdf38e4f2c19f8f1094b25f", - "f194bd7a98a669ec0a1e8cd243372d6936a494447c898e40b26e7eae3f0d5143", - "498aa4ef4f007276772ded48952f7d85571efe35a06006c4d62785dcad2f3c5c" -] \ No newline at end of file diff --git a/src/ingest/ap_rss.py b/src/ingest/ap_rss.py index 259d4ff..ba19391 100644 --- a/src/ingest/ap_rss.py +++ b/src/ingest/ap_rss.py @@ -8,7 +8,7 @@ import hashlib INTERVAL = 3600 # seconds (1 hour) -HASHES = 
'./data/raw/saved_hashes.json' +HASHES = './data/raw/feed_saved_hashes.json' os.makedirs('./data/raw/ap/', exist_ok=True) @@ -96,7 +96,7 @@ def save_entry(entry): saved_hashes.add(entry_hash) save_hashes(saved_hashes) - + return True def check_and_save_new_entries(): diff --git a/src/ingest/bbc_rss.py b/src/ingest/bbc_rss.py index a7d64f7..de36208 100644 --- a/src/ingest/bbc_rss.py +++ b/src/ingest/bbc_rss.py @@ -10,7 +10,7 @@ import hashlib INTERVAL = 3600 # seconds (1 hour) -HASHES = './data/raw/saved_hashes.json' +HASHES = './data/raw/feed_saved_hashes.json' os.makedirs('./data/raw/bbc/', exist_ok=True) diff --git a/src/ingest/cnn_rss.py b/src/ingest/cnn_rss.py index 0049bfe..59762d1 100644 --- a/src/ingest/cnn_rss.py +++ b/src/ingest/cnn_rss.py @@ -9,7 +9,7 @@ import hashlib INTERVAL = 3600 # seconds (1 hour) -HASHES = './data/raw/saved_hashes.json' +HASHES = './data/raw/feed_saved_hashes.json' os.makedirs('./data/raw/cnn/', exist_ok=True) diff --git a/src/ingest/npr_rss.py b/src/ingest/npr_rss.py index 58f9a7f..1dd343f 100644 --- a/src/ingest/npr_rss.py +++ b/src/ingest/npr_rss.py @@ -11,7 +11,7 @@ import hashlib INTERVAL = 3600 # seconds (1 hour) -HASHES = './data/raw/saved_hashes.json' +HASHES = './data/raw/feed_saved_hashes.json' os.makedirs('./data/raw/npr/', exist_ok=True) diff --git a/src/ingest/nyt_rss.py b/src/ingest/nyt_rss.py index 0b0895e..0484d67 100644 --- a/src/ingest/nyt_rss.py +++ b/src/ingest/nyt_rss.py @@ -9,7 +9,7 @@ import hashlib INTERVAL = 3600 # seconds (1 hour) -HASHES = './data/raw/saved_hashes.json' +HASHES = './data/raw/feed_saved_hashes.json' os.makedirs('./data/raw/nyt/', exist_ok=True)