Execute the scraper to fetch latest news:
python scraper.py

This will:
- Connect to configured news sources
- Parse RSS feeds
- Extract article content
- Perform sentiment analysis
- Extract financial entities
- Save to database
Check the database:
sqlite3 financial_news.db "SELECT COUNT(*) FROM financial_news;"

from scraper import NewsScraper
import asyncio
async def scrape_single_source():
    """Scrape one configured news source and print how many articles were fetched.

    Uses NewsScraper as an async context manager so its network resources are
    released on exit.  `scrape_source` returns a (articles, errors) pair; this
    example reports only the article count.

    NOTE(review): `Config` must be in scope (e.g. `from config import Config`)
    for this snippet to run — confirm against config.py.
    """
    async with NewsScraper() as scraper:
        articles, errors = await scraper.scrape_source(
            'yahoo_finance',
            Config.NEWS_SOURCES['yahoo_finance']
        )
        print(f"Scraped {len(articles)} articles")
asyncio.run(scrape_single_source())

from datetime import datetime, timedelta
from scraper import NewsScraper
# Scrape articles from last 7 days
start_date = datetime.now() - timedelta(days=7)
# Configure in scraper logic

from data_export import export_daily_news
# Export in JSON format
export_daily_news(format='json', output_dir='exports')
# Export in CSV format
export_daily_news(format='csv', output_dir='exports')
# Export in XML format
export_daily_news(format='xml', output_dir='exports')
# Export in Parquet format
export_daily_news(format='parquet', output_dir='exports')

from data_export import DataExporter
from datetime import datetime, timedelta
start = datetime.now() - timedelta(days=30)
end = datetime.now()
DataExporter.export_date_range(
start_date=start,
end_date=end,
format='json',
filename='exports/monthly_news.json'
)

from data_export import DataExporter
from datetime import datetime, timedelta
start = datetime.now() - timedelta(days=7)
end = datetime.now()
DataExporter.export_date_range(
start_date=start,
end_date=end,
format='csv',
filename='exports/yahoo_weekly.csv',
source='yahoo_finance'
)

from data_export import DataExporter
from datetime import datetime, timedelta
start = datetime.now() - timedelta(days=7)
end = datetime.now()
# Export only top 100 articles
DataExporter.export_date_range(
start_date=start,
end_date=end,
format='json',
filename='exports/top_100.json',
limit=100
)

from database import SessionLocal
from models import FinancialNews
from datetime import datetime, timedelta

db = SessionLocal()

# Get all articles from last 24 hours
yesterday = datetime.now() - timedelta(days=1)
articles = db.query(FinancialNews).filter(
    FinancialNews.published_date >= yesterday
).all()

# Get articles by source
yahoo_articles = db.query(FinancialNews).filter(
    FinancialNews.source == 'yahoo_finance'
).all()

# Get positive sentiment articles
positive_articles = db.query(FinancialNews).filter(
    FinancialNews.sentiment_label == 'positive'
).all()

# Get articles mentioning a specific stock.
# mentioned_stocks is stored as a JSON-encoded list, so the filtering is done
# in Python after decoding rather than in SQL.
import json

articles_with_aapl = []
for article in db.query(FinancialNews).all():
    if article.mentioned_stocks:
        stocks = json.loads(article.mentioned_stocks)
        if 'AAPL' in stocks:
            articles_with_aapl.append(article)
db.close()

from database import SessionLocal
from models import FinancialNews
from sqlalchemy import func

db = SessionLocal()

# Number of articles per source.
source_counts = (
    db.query(FinancialNews.source, func.count(FinancialNews.id))
    .group_by(FinancialNews.source)
    .all()
)

# Mean sentiment score per source.
avg_sentiment = (
    db.query(FinancialNews.source, func.avg(FinancialNews.sentiment_score))
    .group_by(FinancialNews.source)
    .all()
)

# Article counts per calendar day (published_date truncated to a Date).
from sqlalchemy import cast, Date

daily_counts = (
    db.query(cast(FinancialNews.published_date, Date), func.count(FinancialNews.id))
    .group_by(cast(FinancialNews.published_date, Date))
    .all()
)
db.close()

python api.py

The API will be available at http://localhost:8000
curl http://localhost:8000/articles?limit=10
curl http://localhost:8000/articles?source=yahoo_finance
curl "http://localhost:8000/articles?start_date=2026-02-01&end_date=2026-02-03"
curl http://localhost:8000/articles/1
curl "http://localhost:8000/search?q=Tesla"
curl http://localhost:8000/stats

The scraper runs automatically via GitHub Actions:
- Schedule: Daily at 2:00 AM UTC
- Manual Trigger: Via GitHub Actions UI
- Outputs: Database, exports, artifacts
gh workflow run daily-scraping.yml

crontab -e

Add:
0 2 * * * cd /path/to/FinDB && /path/to/python scraper.py
- Open Task Scheduler
- Create Basic Task
- Set trigger: Daily at 2:00 AM
- Action: Start a program
- Program: python.exe
- Arguments: scraper.py
- Start in: C:\path\to\FinDB
import schedule
import time
from scraper import main_scraping
import asyncio
def job():
asyncio.run(main_scraping())
schedule.every().day.at("02:00").do(job)
while True:
schedule.run_pending()
time.sleep(60)Add to config.py:
NEWS_SOURCES = {
"custom_source": {
"rss_url": "https://example.com/rss",
"base_url": "https://example.com",
"content_selector": "div.article-content p",
"title_selector": "h1.article-title",
"date_selector": "time.published"
}
}Modify scraper.py:
def extract_custom_entities(text: str) -> Dict:
    """Extract custom financial entities (major cryptocurrency tickers) from *text*.

    Returns a dict with a 'cryptocurrencies' key holding every BTC/ETH/XRP
    mention found (duplicates preserved, in order of appearance).

    NOTE(review): requires `import re` and `from typing import Dict` in
    scraper.py — confirm they are already present there.
    """
    # Word boundaries prevent matching tickers embedded in longer words.
    crypto_pattern = r'\b(BTC|ETH|XRP)\b'
    cryptos = re.findall(crypto_pattern, text)
    return {'cryptocurrencies': cryptos}

from textblob import TextBlob
def custom_sentiment(text: str) -> float:
blob = TextBlob(text)
# Custom weighting or model
return blob.sentiment.polarity * 1.5- Rate Limiting: Add delays between requests
- Error Handling: Implement retry logic
- Data Validation: Verify extracted data
- Logging: Monitor scraping activity
- Backups: Regular database backups
- Monitoring: Track success rates
See Troubleshooting Guide for common issues and solutions.