-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_engine.py
More file actions
184 lines (149 loc) · 7.9 KB
/
data_engine.py
File metadata and controls
184 lines (149 loc) · 7.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import json
import os
import time
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional

import pandas as pd
import requests
import urllib3

# Disable SSL warnings globally as per the original script's approach
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
class DailyCashCrawler:
    """Crawler for Taiwan "Daily Cash" (今彩539) lottery results.

    Fetches monthly draw results from the Taiwan Lottery public API,
    normalizes them into flat records, and incrementally persists them
    to ``lottery_data/lottery_data.csv`` (deduplicated by draw number,
    sorted by draw date).
    """

    def __init__(self):
        self.base_url = "https://api.taiwanlottery.com/TLCAPIWeB/Lottery"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'application/json',
            'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        # NOTE(review): certificate verification is disabled on purpose
        # (matches the global urllib3 warning suppression at module level);
        # re-enable it if the API endpoint presents a valid certificate.
        self.session.verify = False

    def crawl_daily_cash(self, year_month: str, page_num: int = 1, page_size: int = 50) -> Dict[str, Any]:
        """Fetch one page of Daily Cash results for a given month.

        Args:
            year_month: Month to query, formatted ``"YYYY-MM"``.
            page_num: 1-based page index.
            page_size: Number of records per page.

        Returns:
            The API response's ``content`` payload, or ``{}`` on any
            request, HTTP, API-level, or JSON-decoding failure.
        """
        url = f"{self.base_url}/Daily539Result"
        params = {
            'month': year_month,
            'pageNum': page_num,
            'pageSize': page_size
        }
        try:
            print(f"正在爬取今彩539資料: {year_month}, 頁數: {page_num}")
            response = self.session.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
            # rtCode == 0 signals success per the API contract used here.
            if data.get('rtCode') == 0:
                return data.get('content', {})
            print(f"API 錯誤: {data.get('rtMsg', '未知錯誤')}")
            return {}
        except requests.exceptions.RequestException as e:
            print(f"請求今彩539資料失敗: {e}")
            return {}
        except json.JSONDecodeError as e:
            print(f"解析今彩539 JSON 失敗: {e}")
            return {}

    def process_daily_cash_data(self, raw_data: List[Dict]) -> List[Dict]:
        """Normalize raw API draw records into flat dicts for a DataFrame.

        Malformed items are reported and skipped so one bad record does
        not abort the whole batch.

        Args:
            raw_data: List of raw draw dicts; each is expected to carry
                ``lotteryDate`` (ISO string), ``period``, and
                ``drawNumberAppear`` (list of drawn numbers).

        Returns:
            List of normalized record dicts (``draw``, ``date``,
            ``ad_date``, ``numbers``, ``price``, ``lottery_type``).
        """
        processed_list = []
        for item in raw_data:
            try:
                lottery_date_str = item['lotteryDate'].replace('T00:00:00', '')
                lottery_date = datetime.fromisoformat(lottery_date_str)
                # Republic of China (Minguo) calendar year = AD year - 1911.
                tw_year = lottery_date.year - 1911
                period = str(item['period'])
                draw_numbers = item['drawNumberAppear']
                # Daily Cash draws five numbers; sort for a canonical order.
                numbers = sorted(draw_numbers[:5])
                processed_list.append({
                    'draw': period,
                    'date': f"{tw_year}/{lottery_date.month:02d}/{lottery_date.day:02d}",
                    'ad_date': lottery_date.strftime('%Y-%m-%d'),  # AD date for sorting and comparison
                    'numbers': ','.join(f"{n:02d}" for n in numbers),  # e.g. "01,05,12,23,39"
                    'price': 8000000,  # fixed top-prize amount recorded with every row
                    'lottery_type': 'daily_cash'
                })
            # TypeError added: covers e.g. drawNumberAppear being None/non-sliceable.
            except (KeyError, ValueError, IndexError, TypeError) as e:
                print(f"處理今彩539資料錯誤: {e}, 項目: {item}")
                continue
        return processed_list

    def get_existing_data_df(self, filepath: str) -> pd.DataFrame:
        """Load previously saved results from CSV.

        Returns:
            DataFrame with ``draw`` kept as string and ``ad_date`` parsed
            to datetime; an empty DataFrame if the file is missing or
            unreadable.
        """
        if os.path.exists(filepath):
            try:
                # 'draw' must stay a string; leading zeros would be lost as int.
                df = pd.read_csv(filepath, dtype={'draw': str})
                # Restore 'ad_date' to datetime for proper comparison/sorting.
                df['ad_date'] = pd.to_datetime(df['ad_date'])
                return df
            except Exception as e:
                print(f"讀取現有CSV資料失敗: {e}")
                return pd.DataFrame()
        return pd.DataFrame()

    def get_latest_ad_date(self, df: pd.DataFrame) -> Optional[datetime]:
        """Return the most recent draw date (AD calendar) in *df*.

        Returns:
            The max of the ``ad_date`` column as ``datetime.datetime``,
            or ``None`` when the DataFrame is empty / lacks the column.
            (Annotation fixed: the original claimed a non-optional
            ``datetime`` while returning ``None`` here.)
        """
        if not df.empty and 'ad_date' in df.columns:
            # Convert pandas Timestamp to a plain datetime.datetime.
            return df['ad_date'].max().to_pydatetime()
        return None

    def crawl_and_save_daily_cash(self, start_year: int = 2014, start_month: int = 1):
        """Incrementally crawl Daily Cash results and save them to CSV.

        When existing data is found, crawling resumes from the month of
        the latest saved draw (re-fetching that month to avoid gaps);
        otherwise it starts from ``start_year``/``start_month``.

        Args:
            start_year: First year to crawl when no data exists yet.
            start_month: First month to crawl when no data exists yet.
        """
        output_dir = 'lottery_data'
        os.makedirs(output_dir, exist_ok=True)
        filepath = os.path.join(output_dir, 'lottery_data.csv')
        existing_df = self.get_existing_data_df(filepath)
        latest_date_ad = self.get_latest_ad_date(existing_df)
        current_date = datetime.now()
        if latest_date_ad:
            # Restart from the 1st of the latest month so no draw is missed.
            start_crawl_date = latest_date_ad.replace(day=1)
            print(f"📊 檢測到現有資料,最新日期: {latest_date_ad.strftime('%Y-%m-%d')}")
            print(f"🎯 增量更新:從 {start_crawl_date.strftime('%Y-%m')} 開始爬取新資料")
        else:
            start_crawl_date = datetime(start_year, start_month, 1)
            print(f"🆕 首次爬取今彩539資料,從 {start_year}-{start_month:02d} 開始")
        all_new_records = []
        new_count = 0
        current_month_iter = start_crawl_date
        while current_month_iter <= current_date:
            year_month = f"{current_month_iter.year}-{current_month_iter.month:02d}"
            try:
                raw_content = self.crawl_daily_cash(year_month)
                if raw_content and 'daily539Res' in raw_content:
                    processed_list = self.process_daily_cash_data(raw_content['daily539Res'])
                    for record in processed_list:
                        # Keep only records strictly newer than what is on disk
                        # (or everything on a first run).
                        if latest_date_ad is None or datetime.fromisoformat(record['ad_date']) > latest_date_ad:
                            all_new_records.append(record)
                            new_count += 1
                    if processed_list:  # Only report months that had data.
                        print(f"✅ {year_month}: 處理了 {len(processed_list)} 筆資料")
                time.sleep(1)  # Be polite to the API: throttle requests.
            except Exception as e:
                print(f"❌ 爬取 {year_month} 失敗: {e}")
            # Advance to the first day of the next month.
            if current_month_iter.month == 12:
                current_month_iter = current_month_iter.replace(year=current_month_iter.year + 1, month=1)
            else:
                current_month_iter = current_month_iter.replace(month=current_month_iter.month + 1)
        if all_new_records:
            new_df = pd.DataFrame(all_new_records)
            # Merge old and new, dedup by draw number ('draw' is unique per draw).
            combined_df = pd.concat([existing_df, new_df]).drop_duplicates(subset=['draw'])
            # 'ad_date' may mix str (new rows) and datetime (loaded rows);
            # normalize to datetime before sorting.
            combined_df['ad_date'] = pd.to_datetime(combined_df['ad_date'])
            combined_df = combined_df.sort_values(by='ad_date').reset_index(drop=True)
            combined_df.to_csv(filepath, index=False, encoding='utf-8')
            print(f"\n🎉 今彩539資料更新完成!")
            print(f"📈 本次新增 {new_count} 筆記錄,總計 {len(combined_df)} 筆")
            print(f"✅ 資料已儲存到: {filepath}")
        elif not existing_df.empty:
            print(f"\n🎉 今彩539資料已是最新,無需更新。總計 {len(existing_df)} 筆記錄")
        else:
            print(f"\n⚠️ 未能獲取任何今彩539資料。")
def main():
    """Entry point: run one incremental Daily Cash crawl-and-save cycle."""
    DailyCashCrawler().crawl_and_save_daily_cash()
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()