avhub/main.py at main · levywang/avhub · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
# -*- encoding: utf-8 -*-
import os
import requests
import json
from bs4 import BeautifulSoup
from typing import Union
from fastapi import FastAPI
from fastapi.responses import JSONResponse, FileResponse
from fastapi.middleware.cors import CORSMiddleware
from fastapi import FastAPI, HTTPException
from fastapi.staticfiles import StaticFiles
import random
from utils.spider import *
import hydra
from utils.logger import setup_logger
import schedule
import time
from contextlib import asynccontextmanager
import pathlib
import re
from concurrent.futures import ThreadPoolExecutor
import asyncio
from collections import Counter

@hydra.main(config_path='data/', config_name='config', version_base=None)
def main(cfg: DictConfig):
    """Build the FastAPI application from the Hydra config and serve it.

    Routes, middleware and the background scheduler are all defined inside
    this function as closures over ``cfg``.
    """
    # Initialise the logger; it is stored as a module-level global
    global logger
    logger = setup_logger(cfg)

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        """Log application startup and shutdown."""
        # Runs before the application starts serving
        logger.info("Application startup")
        yield
        # Runs when the application shuts down
        logger.info("Application shutdown")

    app = FastAPI(lifespan=lifespan)

    # CORS policy is taken entirely from the ``app`` section of the config
    app.add_middleware(
        CORSMiddleware,
        allow_origins=cfg.app.cors_origins,
        allow_credentials=cfg.app.cors_credentials,
        allow_methods=cfg.app.cors_methods,
        allow_headers=cfg.app.cors_headers,
    )

    from fastapi import APIRouter, Request
    # Every API endpoint below is registered on this router under /api
    api_router = APIRouter(prefix="/api")

    # API-key authentication middleware (installed only when enabled in config)
    auth_enabled = str(cfg.app.auth_enabled).lower() == 'true'
    if auth_enabled:
        import secrets
        from starlette.middleware.base import BaseHTTPMiddleware
        from starlette.responses import JSONResponse as StarletteJSON

        # Normalise once: a key written as a bare number in YAML arrives as an
        # int, and an unset key must never match a request that sent no key.
        expected_key = "" if cfg.app.api_key is None else str(cfg.app.api_key)
        if not expected_key:
            logger.warning("auth_enabled is set but app.api_key is empty; all API requests will be rejected")

        class APIKeyMiddleware(BaseHTTPMiddleware):
            """Reject ``/api/`` requests that do not carry the configured key.

            The key is read from the ``X-API-Key`` header or, failing that,
            from the ``api_key`` query parameter.
            """

            async def dispatch(self, request: Request, call_next):
                path = request.url.path
                # Static files, index.html and the image proxy stay public
                if not path.startswith("/api/") or path.startswith("/api/v1/img_proxy"):
                    return await call_next(request)
                # Browsers send CORS preflights without custom headers. This
                # middleware is added after CORSMiddleware and therefore wraps
                # it, so preflights must pass through to be answered there.
                if request.method == "OPTIONS":
                    return await call_next(request)
                supplied = request.headers.get("X-API-Key") or request.query_params.get("api_key")
                # compare_digest avoids leaking the key through response timing
                authorized = (
                    bool(supplied)
                    and bool(expected_key)
                    and secrets.compare_digest(supplied.encode("utf-8"), expected_key.encode("utf-8"))
                )
                if not authorized:
                    return StarletteJSON({"detail": "Unauthorized"}, status_code=401)
                return await call_next(request)

        app.add_middleware(APIKeyMiddleware)
        logger.info("API Key authentication enabled")

    # Thread pool used to run blocking work from the async handlers
    executor = ThreadPoolExecutor(max_workers=10)

    def _fetch_url(url: str) -> str:
        """Download ``url`` and return the response text, or "" on any failure."""
        try:
            resp = requests.get(url, timeout=10)  # request timeout of 10 seconds
            resp.raise_for_status()
            body = resp.text
        except Exception as exc:
            logger.error(f"Failed to fetch URL {url}: {str(exc)}")
            body = ""
        return body

    def _parse_html(html_content: str, image_dir_url: str) -> list:
        """Extract links from an HTML page.

        Every ``<a href>`` except the parent entry ``../`` is prefixed with
        ``image_dir_url``. When at least one resulting link ends in ``.webp``
        only those are returned, otherwise all links are returned. Returns an
        empty list if parsing fails.
        """
        try:
            document = BeautifulSoup(html_content, 'html.parser')
            all_links = []
            for anchor in document.find_all('a', href=True):
                href = anchor['href']
                if href == '../':
                    continue
                all_links.append(image_dir_url + href)
            webp_links = [candidate for candidate in all_links if candidate.endswith('.webp')]
            return webp_links if webp_links else all_links
        except Exception as exc:
            logger.error(f"Failed to parse HTML: {str(exc)}")
            return []

    async def get_image_url(video_url: str) -> Union[str, None]:
        """Pick a random preview image URL for a video.

        The image directory URL is derived by replacing ``index.m3u8`` in
        ``video_url`` with ``image/``; that page is fetched and parsed in the
        thread pool and one of the links found is chosen at random.

        Args:
            video_url: Playlist URL expected to contain ``index.m3u8``.

        Returns:
            An image URL, or None when the URL has no ``index.m3u8`` part, the
            page cannot be fetched, no links are found, or a step times out.
        """
        try:
            # Without the playlist suffix the "image directory" would be the
            # video URL itself, producing meaningless links.
            if 'index.m3u8' not in video_url:
                logger.warning("No image links found.")
                return None

            # Build the image directory URL
            image_dir_url = video_url.replace('index.m3u8', 'image/')

            # get_running_loop() is the supported way to get the loop in a coroutine
            loop = asyncio.get_running_loop()

            # Fetching is bounded to 15 seconds
            html_content = await asyncio.wait_for(
                loop.run_in_executor(executor, _fetch_url, image_dir_url),
                timeout=15
            )
            if not html_content:
                return None

            # HTML parsing is bounded to 5 seconds
            links = await asyncio.wait_for(
                loop.run_in_executor(executor, _parse_html, html_content, image_dir_url),
                timeout=5
            )
            if not links:
                logger.warning("No image links found.")
                return None

            return random.choice(links)
        except asyncio.TimeoutError:
            logger.error(f"Timeout while processing image URL for {video_url}")
            return None
        except Exception as e:
            logger.error(f"Failed to obtain the image URL: {str(e)}")
            return None

    async def read_random_line(file_path: str) -> tuple[Union[str, None], Union[str, None]]:
        """Pick a random non-blank line from a file and look up its image URL.

        Args:
            file_path: Text file with one video URL per line.

        Returns:
            ``(video_url, img_url)``. ``img_url`` is None when it cannot be
            determined; on timeout the already chosen ``video_url`` (or None if
            the file read itself timed out) is returned with ``img_url`` None.

        Raises:
            HTTPException: 404 if the file does not exist, 400 if it holds no
                usable line, 500 on any other failure.
        """
        if not os.path.isfile(file_path):
            logger.error("File not found")
            raise HTTPException(status_code=404, detail="File not found")

        def _read_lines() -> list:
            # The context manager closes the handle; blank lines are not entries
            with open(file_path, 'r', encoding='utf-8') as fh:
                return [stripped for stripped in (raw.strip() for raw in fh) if stripped]

        # Bound before the try so the timeout handler can always refer to it
        random_line = None
        try:
            loop = asyncio.get_running_loop()
            # Reading the file is bounded to 2 seconds
            lines = await asyncio.wait_for(
                loop.run_in_executor(executor, _read_lines),
                timeout=2
            )

            if not lines:
                logger.error("File is empty")
                raise HTTPException(status_code=400, detail="File is empty")

            random_line = random.choice(lines)
            # Looking up the image URL is bounded to 20 seconds overall
            img_url = await asyncio.wait_for(get_image_url(random_line), timeout=20)

            return random_line, img_url
        except asyncio.TimeoutError:
            logger.error("Timeout while reading random line or fetching image URL")
            # On timeout return the video URL (if one was chosen) without an image
            return random_line, None
        except HTTPException:
            # Keep deliberate status codes (e.g. the 400 above) intact
            raise
        except Exception as e:
            logger.error(f"Error in read_random_line: {str(e)}")
            raise HTTPException(status_code=500, detail=str(e))

    @api_router.get("/v1/hacg/refresh")
    async def refresh_hacg():
        """Trigger an immediate update of the HACG collection JSON file."""
        def _refresh_job():
            spider = HacgSpider(
                url=cfg.hacg_spider.source_url,
                filepath=cfg.files.hacg_json_path,
                cfg=cfg,
            )
            spider.update_json_file()
            logger.info("HacgSpider manual refresh completed.")

        try:
            # The spider is blocking, so it runs in the thread pool
            await asyncio.get_event_loop().run_in_executor(executor, _refresh_job)
        except Exception as exc:
            logger.error(f"Failed to refresh HACG data: {str(exc)}")
            raise HTTPException(status_code=500, detail=str(exc))
        return {"status": "succeed", "message": "HACG data refreshed"}

    @api_router.get("/v1/hacg")
    async def read_hacg():
        """Return the content of the HACG JSON file wrapped as ``{"data": ...}``."""
        try:
            with open(cfg.files.hacg_json_path, 'r', encoding='utf-8') as fh:
                payload = json.load(fh)
            logger.info("HACG data fetched successfully")
            response_headers = {'content-type': 'application/json;charset=utf-8'}
            return JSONResponse({"data": payload}, headers=response_headers)
        except Exception as exc:
            logger.error(f"Failed to fetch HACG data: {str(exc)}")
            raise HTTPException(status_code=500, detail="Internal Server Error")

    @api_router.get("/v1/avcode/{code_str}")
    async def crawl_av(code_str: str):
        """Look up magnet links for a code, using the JSON file cache if enabled.

        The code is normalised to lowercase ASCII letters and digits. With
        caching enabled a cached result is returned when present; otherwise the
        links are crawled with ``AVSpider`` and written to the cache.

        Returns:
            ``{"status": "succeed", "data": [...]}`` with the links as strings.

        Raises:
            HTTPException: 400 if nothing remains of the code after
                normalisation, 404 if no magnet links are found, 500 on any
                other failure.
        """
        # Normalise code_str: keep letters and digits only
        code_str = re.sub(r'[^a-zA-Z0-9]', '', code_str).lower()
        if not code_str:
            # An empty code would otherwise map to the cache file ".json"
            logger.warning("Rejected search request: code is empty after normalization")
            raise HTTPException(status_code=400, detail="Invalid code")

        use_cache = str(cfg.av_spider.use_cache).lower() == 'true'
        cache_path = None
        if use_cache:
            cache_path = os.path.join(cfg.av_spider.cache_dir, f"{code_str}.json")
            try:
                # Make sure the cache directory exists, then try the cache
                pathlib.Path(cfg.av_spider.cache_dir).mkdir(parents=True, exist_ok=True)
                with open(cache_path, 'r', encoding='utf-8') as f:
                    cached_data = json.load(f)
                logger.info(f"Cache hit for AV code: {code_str}")
                return {"status": "succeed", "data": cached_data}
            except FileNotFoundError:
                # Plain cache miss: fall through to the crawler
                pass
            except Exception as e:
                # A broken cache must not prevent a live lookup
                logger.error(f"Error reading cache file: {str(e)}")

        # No cache entry (or cache unreadable): fetch from the network
        crawler = AVSpider(av_code=code_str,
                          source_url=cfg.av_spider.source_url,
                          proxy_url=cfg.av_spider.proxy_url,
                          use_proxy=str(cfg.av_spider.use_proxy).lower() == 'true',
                          cfg=cfg)

        try:
            magnet_links = await crawler.process_av_code()

            if not magnet_links:
                logger.error(f"No magnet links found for AV code: {code_str}")
                raise HTTPException(status_code=404, detail="No magnet links found")

            magnet_data = [str(item) for item in magnet_links]

            # Persist only the data part; a failed write is logged, not fatal
            if use_cache:
                try:
                    with open(cache_path, 'w', encoding='utf-8') as f:
                        json.dump(magnet_data, f, ensure_ascii=False, indent=4)
                    logger.info(f"Cache written for AV code: {code_str}")
                except Exception as e:
                    logger.error(f"Error writing cache file: {str(e)}")

            logger.info(f"Magnet links found for AV code: {code_str}")
            return {"status": "succeed", "data": magnet_data}
        except HTTPException:
            # The 404 raised above keeps its status without string matching
            raise
        except Exception as e:
            logger.error(f"Error processing AV code {code_str}: {str(e)}")
            # Crawler errors that mention a 404 are reported as 404, not 500.
            # NOTE(review): substring matching on the message is fragile.
            if "404" in str(e) or "No magnet links found" in str(e):
                raise HTTPException(status_code=404, detail="No magnet links found")
            raise HTTPException(status_code=500, detail=str(e))
        finally:
            # Drop the local reference to the crawler as soon as we are done
            del crawler

    @api_router.get("/v1/get_video")
    async def get_random_video_url():
        """Return a random video URL and its corresponding image URL.

        Returns:
            ``{"url": ..., "img_url": ...}``; ``img_url`` is "" when no image
            URL could be determined.

        Raises:
            HTTPException: 504 when the whole operation exceeds 25 seconds;
                HTTP errors raised by ``read_random_line`` keep their own
                status code; 500 for any other failure.
        """
        try:
            file_path = cfg.files.video_urls_txt_path
            # The whole operation is bounded to 25 seconds
            video_url, img_url = await asyncio.wait_for(
                read_random_line(file_path),
                timeout=25
            )

            if not video_url:
                raise HTTPException(status_code=500, detail="Failed to get video URL")

            logger.info("Random video URL and image URL fetched successfully")
            return {
                "url": video_url,
                "img_url": img_url or ""
            }
        except asyncio.TimeoutError:
            logger.error("Global timeout in get_random_video_url")
            raise HTTPException(status_code=504, detail="Request timeout")
        except HTTPException:
            # Do not rewrite deliberate 4xx/5xx responses into a generic 500
            raise
        except Exception as e:
            logger.error(f"Failed to fetch random video URL: {str(e)}")
            raise HTTPException(status_code=500, detail=str(e))

    def run_hacg_spider():
        """Run ``HacgSpider.update_json_file()`` with the configured source and log completion."""
        spider = HacgSpider(
            url=cfg.hacg_spider.source_url,
            filepath=cfg.files.hacg_json_path,
            cfg=cfg,
        )
        spider.update_json_file()
        logger.info("HacgSpider task completed.")

    # Schedule the HacgSpider task to run daily at 1 AM
    schedule.every().day.at("01:00").do(run_hacg_spider)

    def run_scheduler():
        """Poll the ``schedule`` registry once a minute, forever.

        ``schedule`` does not catch exceptions raised by jobs, so a failing
        job would propagate out of ``run_pending()``, end this thread and
        silently stop every future run. Failures are therefore logged and
        polling continues.
        """
        while True:
            try:
                schedule.run_pending()
            except Exception as e:
                logger.error(f"Scheduled task failed: {str(e)}")
            time.sleep(60)  # Check every minute

    import threading
    # Daemon thread so the scheduler never keeps the process alive on exit
    scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
    scheduler_thread.start()

    @api_router.get("/v1/hot_searches")
    async def get_hot_searches(top_n: int = 5, last_n_lines: int = 2000):
        """Return popular search terms mined from the tail of the log file.

        Log lines containing "AV code" but not "404" are considered; the text
        after the last ':' of each line, reduced to lowercase letters and
        digits, is counted as one search term.

        Args:
            top_n: Number of terms to return (default 5); values below 1 fall
                back to 5.
            last_n_lines: Number of trailing log lines to scan (default 2000);
                values below 100 fall back to 1000.

        Returns:
            ``{"status": "succeed", "data": [...]}`` with the selected terms.

        Raises:
            HTTPException: 404 if the log file does not exist, 504 if reading
                it times out, 500 on any other failure.
        """
        from collections import deque

        try:
            # Basic parameter validation
            if top_n < 1:
                top_n = 5
            if last_n_lines < 100:
                last_n_lines = 1000

            log_file_path = cfg.logging.log_file
            if not os.path.exists(log_file_path):
                logger.error(f"Log file does not exist: {log_file_path}")
                raise HTTPException(status_code=404, detail="Log file does not exist")

            def read_last_n_lines():
                # Try each encoding in turn; a bounded deque keeps only the tail
                for encoding in ('utf-8', 'gbk', 'iso-8859-1'):
                    try:
                        with open(log_file_path, 'r', encoding=encoding) as f:
                            return deque(f, last_n_lines)
                    except UnicodeDecodeError:
                        continue
                    except Exception as e:
                        logger.error(f"Error reading log file with {encoding}: {str(e)}")
                        continue

                raise HTTPException(status_code=500, detail="Unable to read log file with any encoding")

            # Read the tail of the log in the thread pool, bounded to 10 seconds
            log_content = await asyncio.wait_for(
                asyncio.get_running_loop().run_in_executor(executor, read_last_n_lines),
                timeout=10
            )

            search_terms = []
            for line in log_content:
                # Keep lines mentioning "AV code" and drop any containing "404"
                if "AV code" not in line or "404" in line:
                    continue
                # The term is whatever follows the last ':' on the line
                _, separator, tail = line.rpartition(":")
                if not separator:
                    continue
                # Normalise the term: letters and digits only
                search_term = re.sub(r'[^a-zA-Z0-9]', '', tail.strip().lower())
                if search_term:
                    search_terms.append(search_term)

            if not search_terms:
                return {"status": "succeed", "data": []}

            # Rank all terms by frequency and select a window of the ranking.
            # NOTE(review): the window is ranks top_n .. 2*top_n - 1, i.e. the
            # first top_n terms are skipped. Kept as is because it may be
            # deliberate - confirm against the frontend's expectations.
            most_common_all = Counter(search_terms).most_common()
            selected_terms = most_common_all[top_n:top_n + top_n]
            top_terms = [term for term, _ in selected_terms]

            logger.info(f"Retrieved top {top_n*2} popular search terms from last {last_n_lines} lines")
            return {"status": "succeed", "data": top_terms}
        except asyncio.TimeoutError:
            logger.error("Timeout while reading log file")
            raise HTTPException(status_code=504, detail="Request timeout")
        except HTTPException:
            # Keep the 404 for a missing log file instead of turning it into 500
            raise
        except Exception as e:
            logger.error(f"Failed to obtain popular search terms: {str(e)}")
            raise HTTPException(status_code=500, detail=str(e))

    @api_router.get("/v1/img_proxy")
    async def img_proxy(url: str):
        """Fetch a remote image server-side and relay it (bypasses hotlink protection).

        Args:
            url: Absolute http(s) URL of the image.

        Returns:
            The upstream body, served with the upstream ``Content-Type``
            (``image/jpeg`` when the upstream sends none).

        Raises:
            HTTPException: 400 for a malformed, non-http(s) or disallowed URL;
                the upstream status when it is not 200; 500 on other failure.
        """
        import ipaddress
        from urllib.parse import urlsplit

        import aiohttp
        from fastapi.responses import Response

        # The URL is caller-supplied, so validate it before fetching anything.
        try:
            parsed = urlsplit(url)
            host = parsed.hostname
        except ValueError:
            raise HTTPException(status_code=400, detail="Invalid image URL")
        if parsed.scheme not in ("http", "https") or not host:
            raise HTTPException(status_code=400, detail="Invalid image URL")

        try:
            literal_ip = ipaddress.ip_address(host)
        except ValueError:
            literal_ip = None  # a DNS name rather than an IP literal
        if host.lower() == "localhost" or (literal_ip is not None and not literal_ip.is_global):
            # Refuse loopback/private/link-local targets (basic SSRF guard).
            # NOTE(security): DNS names resolving to internal addresses and
            # redirects to internal hosts are NOT covered by this check.
            raise HTTPException(status_code=400, detail="Image URL host not allowed")

        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers={
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
                }, allow_redirects=True, timeout=aiohttp.ClientTimeout(total=10)) as resp:
                    if resp.status != 200:
                        raise HTTPException(status_code=resp.status, detail="Image fetch failed")
                    content = await resp.read()
                    content_type = resp.headers.get("Content-Type", "image/jpeg")
            return Response(content=content, media_type=content_type)
        except HTTPException:
            raise
        except Exception as e:
            logger.error(f"img_proxy error: {str(e)}")
            raise HTTPException(status_code=500, detail=str(e))

    # Register the API routes
    app.include_router(api_router)

    # Serve the built frontend, if it exists
    web_dist_path = pathlib.Path(__file__).parent / "web" / "dist"
    if web_dist_path.exists():
        # Mount every sub-directory of the build output as static files
        for item in web_dist_path.iterdir():
            if item.is_dir():
                app.mount(f"/{item.name}", StaticFiles(directory=str(item)), name=item.name)

        # Resolved once; used to confine requests to the build directory
        dist_root = web_dist_path.resolve()

        # Explicit Content-Type per extension. Other extensions pass None and
        # leave the choice to FileResponse.
        media_types = {
            '.webmanifest': 'application/manifest+json',
            '.js': 'application/javascript',
            '.css': 'text/css',
            '.json': 'application/json',
            '.png': 'image/png',
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.gif': 'image/gif',
            '.ico': 'image/x-icon',
            '.svg': 'image/svg+xml',
            '.webp': 'image/webp',
        }

        @app.get("/{full_path:path}")
        async def serve_spa(full_path: str):
            """Serve a file from the build directory, else ``index.html`` (SPA fallback).

            Raises:
                HTTPException: 404 if the path points outside the build
                    directory or the frontend has not been built.
            """
            # full_path is attacker-controlled: "../" segments or an absolute
            # path would otherwise escape the build directory.
            try:
                target = (dist_root / full_path).resolve()
            except (OSError, ValueError, RuntimeError):
                raise HTTPException(status_code=404, detail="Not found")
            if target != dist_root and dist_root not in target.parents:
                raise HTTPException(status_code=404, detail="Not found")

            # Return the actual file when there is one
            if target.is_file():
                return FileResponse(str(target), media_type=media_types.get(target.suffix.lower()))
            # SPA fallback
            index_file = web_dist_path / "index.html"
            if index_file.exists():
                return FileResponse(str(index_file), media_type='text/html')
            raise HTTPException(status_code=404, detail="Frontend not built")
    else:
        logger.warning("web/dist directory not found, frontend will not be served")

    # Start the server: listens on all interfaces (0.0.0.0), port 8000
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

if __name__ == "__main__":
    main()