#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import os

sys.stdout.reconfigure(encoding='utf-8', errors='replace')

project_root = os.path.dirname(os.path.abspath(__file__))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from datetime import datetime
from typing import List, Dict, Tuple

from config.settings import settings
from utils.logger import get_logger
from database.connection import get_supabase_client

logger = get_logger(__name__)
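
# Pipeline overview (as implemented in run_processing_pipeline below):
#   1. fetch RSS articles -> 2. deduplicate -> 3. load keywords -> 4. filter and
#   rank by keyword matches -> 5. take the top 10 per region -> 6. Gemini
#   importance/sentiment scoring -> 7. one-line summaries -> 8. save to Supabase.
# config.settings, utils.logger and database.connection are project-local modules
# (hence the sys.path insertion above).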

def get_keywords_from_database() -> List[str]:
    supabase = get_supabase_client()
    try:
        result = supabase.table("admin_keywords").select("keyword").eq("is_active", True).execute()
        return [row["keyword"] for row in result.data or []]
    except Exception as e:
        logger.error(f"Error fetching keywords: {e}")
        return [settings.target_company, settings.target_company_ticker, "mobile gaming", "gacha", "Japan games"]
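
# Keyword matching is a plain case-insensitive substring check over title plus
# content/summary. A hypothetical example of what it yields:
#   count_keyword_matches({"title": "Gacha revenue up"}, ["gacha", "Japan games"])
#   -> (1, ["gacha"])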

def count_keyword_matches(article: Dict, keywords: List[str]) -> Tuple[int, List[str]]:
    title = (article.get("title") or "").lower()
    content = (article.get("content") or article.get("summary") or "").lower()
    text = f"{title} {content}"
    matched = [kw for kw in keywords if kw.lower() in text]
    return len(matched), matched

def filter_and_rank_articles(articles: List[Dict], keywords: List[str]) -> List[Dict]:
    ranked = []
    for article in articles:
        match_count, matched = count_keyword_matches(article, keywords)
        if match_count > 0:
            article["keyword_match_count"] = match_count
            article["matched_keywords"] = matched
            ranked.append(article)
    ranked.sort(key=lambda x: x["keyword_match_count"], reverse=True)
    return ranked
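
# Region routing: an article counts as "japan" if its category or source name
# mentions Japan or a known Japanese outlet (Famitsu, 4Gamer, Dengeki, Automaton,
# Gematsu, GameSpark, IGN Japan); everything else is treated as "global".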

def categorize_article(article: Dict) -> str:
    category = (article.get("category") or "").lower()
    source = (article.get("source") or "").lower()
    japan_indicators = ["japan_games", "japan", "famitsu", "4gamer", "dengeki", "automaton", "gematsu", "gamespark", "ign japan"]
    if any(indicator in category or indicator in source for indicator in japan_indicators):
        return "japan"
    return "global"
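
# One-sentence summaries come from utils.gemini_client.generate_text, which is
# assumed here to return a plain string (or a falsy value on failure). If the
# call fails or returns nothing, the function falls back to truncating the raw
# content (or the title) to max_length characters.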

def generate_short_summary(title: str, content: str, max_length: int = 100) -> str:
    from utils.gemini_client import generate_text
    prompt = f"Summarize in ONE sentence (max {max_length} chars):\n\nTitle: {title}\nContent: {content[:500]}\n\nSummary:"
    try:
        summary = generate_text(prompt)
        if summary:
            summary = summary.strip()
            return summary[:max_length-3] + "..." if len(summary) > max_length else summary
    except Exception as e:
        logger.warning(f"Error generating summary: {e}")
    return content[:max_length-3].rsplit(' ', 1)[0] + "..." if content and len(content) > max_length else (content or title[:max_length])

def run_processing_pipeline():
    print("=" * 60)
    print(" ARTICLE PROCESSING PIPELINE v3")
    print(f" {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 60)
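
    # [1/8] Feed ingestion. load_feed_config() is assumed (from how it is used
    # below) to return a mapping shaped like
    # {"feeds": {<category>: [{"url": ..., "name": ..., "enabled": ...}, ...]}};
    # only enabled feeds are flattened into feed_configs.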

    print("\n[1/8] FETCHING ARTICLES FROM RSS FEEDS")
    print("-" * 40)
    from ingestion.news_ingestion import ingest_news_feeds, load_feed_config
    feed_config = load_feed_config()
    feed_configs = []
    for category, feeds in feed_config.get("feeds", {}).items():
        for feed in feeds:
            if feed.get("enabled", True):
                feed_configs.append({"url": feed["url"], "category": category, "name": feed.get("name", "Unknown")})
    print(f" Loading {len(feed_configs)} RSS feeds...")
    raw_articles = ingest_news_feeds(feed_configs, days_lookback=7)
    print(f" [OK] Fetched {len(raw_articles)} articles (last 7 days)")
    if not raw_articles:
        return {"success": False, "message": "No articles fetched"}

    print("\n[2/8] REMOVING DUPLICATES")
    print("-" * 40)
    from processing.deduplication import remove_duplicates
    unique_articles = remove_duplicates(raw_articles)
    print(f" [OK] {len(unique_articles)} unique articles")

    print("\n[3/8] LOADING KEYWORDS")
    print("-" * 40)
    keywords = get_keywords_from_database()
    print(f" [OK] Loaded {len(keywords)} keywords")

    print("\n[4/8] FILTERING & RANKING")
    print("-" * 40)
    ranked_articles = filter_and_rank_articles(unique_articles, keywords)
    print(f" [OK] {len(ranked_articles)} articles matched")
    if not ranked_articles:
        return {"success": False, "message": "No articles matched keywords"}
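
    # [5/8] Keep only the 10 highest keyword-ranked articles per region before
    # AI scoring, presumably to bound the number of Gemini calls per run.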

    print("\n[5/8] SELECTING TOP 10 PER CATEGORY")
    print("-" * 40)
    all_japan = [a for a in ranked_articles if categorize_article(a) == "japan"]
    all_global = [a for a in ranked_articles if categorize_article(a) == "global"]
    top_10_japan_candidates = all_japan[:10]
    top_10_global_candidates = all_global[:10]
    candidates_for_ai = top_10_japan_candidates + top_10_global_candidates
    print(f" [OK] Japan: {len(top_10_japan_candidates)}, Global: {len(top_10_global_candidates)}")
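
    # [6/8] Gemini-based scoring via the project's utils.gemini_client.
    # calculate_importance_score appears to return a value on a 0-10 scale,
    # given the /10.0 normalization applied when saving to processed_articles.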

    print("\n[6/8] AI SCORING")
    print("-" * 40)
    from utils.gemini_client import calculate_importance_score, analyze_sentiment
    supabase = get_supabase_client()
    processed_japan, processed_global = [], []
    for i, article in enumerate(candidates_for_ai, 1):
        try:
            title = article.get("title", "")
            content = article.get("content") or article.get("summary") or ""
            category = categorize_article(article)
            importance = calculate_importance_score(title, content, company_name=settings.target_company)
            sentiment = analyze_sentiment(title, content)
            processed = {
                "title": title, "content": content, "source": article.get("source", "Unknown"),
                "source_url": article.get("url", ""), "published_at": article.get("published_at"),
                "importance_score": importance, "sentiment": sentiment,
                "source_category": article.get("category", "general"),
                "keyword_match_count": article.get("keyword_match_count", 0),
                "matched_keywords": article.get("matched_keywords", []), "news_type": category
            }
            if category == "japan":
                processed_japan.append(processed)
            else:
                processed_global.append(processed)
        except Exception as e:
            logger.error(f"Error: {e}")
    processed_japan.sort(key=lambda x: x["importance_score"], reverse=True)
    processed_global.sort(key=lambda x: x["importance_score"], reverse=True)

    print("\n[7/8] GENERATING SUMMARIES")
    print("-" * 40)
    top_10_japan = processed_japan[:10]
    top_10_global = processed_global[:10]
    for article in top_10_japan + top_10_global:
        article["short_summary"] = generate_short_summary(article["title"], article.get("content", ""))
    print(f" [OK] Generated {len(top_10_japan) + len(top_10_global)} summaries")
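
    # [8/8] Persist results. Each article is deduplicated against raw_articles
    # by source_url, then a matching row is written to processed_articles unless
    # one already exists for that raw_article_id.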

    print("\n[8/8] SAVING TO DATABASE")
    print("-" * 40)
    saved_count = 0
    for article in top_10_japan + top_10_global:
        try:
            source_url = article.get("source_url", "")
            existing = supabase.table("raw_articles").select("id").eq("source_url", source_url).limit(1).execute()
            if existing.data:
                raw_id = existing.data[0]["id"]
            else:
                raw_result = supabase.table("raw_articles").insert({
                    "title": article["title"], "content": article.get("short_summary", ""),
                    "source_url": source_url, "source_name": article.get("source", "Unknown"),
                    "published_at": article.get("published_at").isoformat() if article.get("published_at") else None,
                    "category": article.get("source_category", "general"), "article_type": "company", "processed": True
                }).execute()
                raw_id = raw_result.data[0]["id"] if raw_result.data else None
            if not raw_id:
                continue
            existing_proc = supabase.table("processed_articles").select("id").eq("raw_article_id", raw_id).limit(1).execute()
            if existing_proc.data:
                continue
            db_category = "japan_local" if article["news_type"] == "japan" else "global"
            supabase.table("processed_articles").insert({
                "raw_article_id": raw_id, "summary": article.get("short_summary", ""),
                "importance_score": article["importance_score"] / 10.0,
                "relevance_tags": [db_category], "key_points": article.get("matched_keywords", [])[:5],
                "sentiment": article.get("sentiment", "neutral"), "article_type": "company",
                "is_competitor_news": False,
                "ai_metadata": {"model": "gemini-1.5-flash", "news_type": db_category, "processed_at": datetime.now().isoformat()}
            }).execute()
            saved_count += 1
        except Exception as e:
            logger.warning(f"Error saving: {e}")
    print(f" [OK] Saved {saved_count} articles")

    print("\n" + "=" * 60)
    print(" PIPELINE COMPLETE")
    print("=" * 60)
    return {
        "success": True,
        "stats": {"saved": saved_count, "top_10_japan": len(top_10_japan), "top_10_global": len(top_10_global)},
        "top_10_japan": top_10_japan, "top_10_global": top_10_global
    }
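
# Running the module directly executes the full pipeline once, e.g.:
#   python process_articles.py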

if __name__ == "__main__":
    result = run_processing_pipeline()
    print("\n✅ Done!" if result.get("success") else "\n❌ Failed")