|
| 1 | +import * as fs from 'node:fs'; |
| 2 | +import * as path from 'node:path'; |
| 3 | +import { createHash } from 'node:crypto'; |
| 4 | +import { formatCookieHeader, httpDownload } from '@jackwener/opencli/download'; |
/**
 * Derives a safe, lower-cased file extension (leading dot included) from the
 * path component of a URL.
 *
 * Falls back to `.jpg` when the URL cannot be parsed, when the path has no
 * extension, or when the extension is not a dot followed by 1-5 ASCII
 * letters/digits (this rejects a bare "." as well as percent-encoded suffixes).
 *
 * @param {string} url - Absolute URL of the image.
 * @returns {string} An extension such as `.png`, or `.jpg` as the fallback.
 */
function sanitizeExtname(url) {
  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Invalid URL: use the default extension.
    return '.jpg';
  }
  // URL paths always use forward slashes, so parse with POSIX rules on every OS.
  const ext = path.posix.extname(pathname).toLowerCase();
  // path.extname() yields "." for paths ending in a dot; that and any suffix
  // containing characters other than [a-z0-9] must not reach a file name.
  return /^\.[a-z0-9]{1,5}$/.test(ext) ? ext : '.jpg';
}
/**
 * Computes the SHA-1 digest of a URL, hex-encoded.
 *
 * The digest is used as a file name for the cached download, so it only
 * needs to be stable and unique per URL.
 *
 * @param {string} url - URL to hash.
 * @returns {string} 40-character lowercase hexadecimal SHA-1 digest.
 */
function hashUrl(url) {
  const hasher = createHash('sha1');
  hasher.update(url);
  return hasher.digest('hex');
}
/**
 * Builds a de-duplicated download plan for all images referenced by `rows`.
 *
 * Each distinct URL appears once in the plan with a cache path
 * (`<output>/.cache/<sha1(url)><ext>`) and a list of destinations it must be
 * copied to (`<output>/<courseId>/<chapterId>/<courseId>_<chapterId>_<n><ext>`,
 * where `n` is the 1-based position of the image within its row).
 *
 * @param {Array<object>} rows - Rows that may carry `course_id`, `chapter_id`
 *   and an `images` array of URLs. Rows without an `images` array contribute
 *   nothing to the plan.
 * @param {string} output - Root output directory.
 * @returns {Array<{url: string, cachePath: string,
 *   copies: Array<{rowIndex: number, destPath: string}>}>} One entry per
 *   distinct URL, in first-seen order.
 */
function buildDownloadPlan(rows, output) {
  const cacheDir = path.join(output, '.cache');
  const byUrl = new Map();
  for (const [rowIndex, row] of rows.entries()) {
    // IDs may arrive as numbers; path.join() throws a TypeError on non-string
    // segments, so normalise them to strings up front.
    const courseId = String(row.course_id || 'course');
    const chapterId = String(row.chapter_id || 'root');
    // Only non-empty strings can be parsed and hashed as URLs; drop the rest.
    const imageUrls = Array.isArray(row.images)
      ? row.images.filter((url) => typeof url === 'string' && url !== '')
      : [];
    for (const [imageIndex, url] of imageUrls.entries()) {
      const ext = sanitizeExtname(url);
      const fileName = `${courseId}_${chapterId}_${imageIndex + 1}${ext}`;
      const copy = { rowIndex, destPath: path.join(output, courseId, chapterId, fileName) };
      const existing = byUrl.get(url);
      if (existing) {
        // Same URL seen before: download once, copy to one more destination.
        existing.copies.push(copy);
        continue;
      }
      byUrl.set(url, {
        url,
        cachePath: path.join(cacheDir, `${hashUrl(url)}${ext}`),
        copies: [copy],
      });
    }
  }
  return Array.from(byUrl.values());
}
/**
 * Runs `worker` over every element of `items`, with at most `concurrency`
 * invocations in flight at the same time.
 *
 * Items are handed out in array order from a shared cursor. The returned
 * promise resolves once every item has been processed and rejects as soon as
 * any `worker` call rejects.
 *
 * @param {Array<*>} items - Items to process.
 * @param {number} concurrency - Maximum number of parallel workers. Values
 *   below 1 are raised to 1; a value that is not a number (e.g. `undefined`)
 *   also falls back to 1.
 * @param {(item: *) => Promise<*>} worker - Async callback invoked per item.
 * @returns {Promise<void>}
 */
async function runWithConcurrency(items, concurrency, worker) {
  const requested = Math.floor(concurrency);
  // NaN would flow through Math.max/Math.min into Array.from({ length: NaN }),
  // producing zero consumers and silently skipping every item.
  const limit = Number.isNaN(requested) ? 1 : Math.max(1, requested);
  const consumerCount = Math.min(limit, items.length);
  let cursor = 0;
  async function consume() {
    while (cursor < items.length) {
      // Claim the index synchronously before awaiting so no item is taken twice.
      const index = cursor;
      cursor += 1;
      await worker(items[index]);
    }
  }
  await Promise.all(Array.from({ length: consumerCount }, () => consume()));
}
/**
 * Creates the default dependency bundle used by the image downloader.
 *
 * @returns {{concurrency: number,
 *   downloadToPath: (url: string, destPath: string, cookies: *) => Promise<*>}}
 *   `concurrency` is 8. `downloadToPath` delegates to `httpDownload`, passing
 *   the cookies and `timeout: 60_000` (unit presumably milliseconds — confirm
 *   against the download helper), and resolves to the `success` field of its
 *   result.
 */
function createDefaultDeps() {
  const downloadTimeout = 60_000;

  async function downloadToPath(url, destPath, cookies) {
    const options = { cookies, timeout: downloadTimeout };
    const outcome = await httpDownload(url, destPath, options);
    return outcome.success;
  }

  return { concurrency: 8, downloadToPath };
}
/**
 * Downloads every image referenced by the given row(s) into `output` and
 * returns copies of the rows annotated with `image_count` and `image_dir`.
 *
 * Each distinct URL is fetched once into `<output>/.cache` (skipped when the
 * cache file already exists) and then copied to every destination that
 * references it. A URL whose download reports failure is skipped and does not
 * count towards `image_count`. Rejections from `downloadToPath` or from
 * file-system calls are not caught here and propagate to the caller.
 *
 * @param {object|Array<object>} data - A single row or an array of rows. Rows
 *   may carry `course_id`, `chapter_id` and an `images` array of URLs.
 * @param {string} output - Root output directory.
 * @param {*} cookies - Passed through unchanged to `downloadToPath`.
 * @param {{concurrency?: number, downloadToPath?: Function}} [overrides] -
 *   Replacements for the default dependencies; `undefined` values are ignored.
 * @returns {Promise<object|Array<object>>} Annotated row(s), mirroring the
 *   shape of `data` (array in, array out). `image_dir` is the row's target
 *   directory when the row has a non-empty `images` array, otherwise `''`.
 */
export async function downloadScysCourseImagesInternal(data, output, cookies, overrides = {}) {
  const isBatch = Array.isArray(data);
  const rows = isBatch ? data : [data];

  const defaults = createDefaultDeps();
  const deps = { ...defaults, ...overrides };
  // An override explicitly set to undefined must not clobber the default.
  deps.concurrency ??= defaults.concurrency;
  deps.downloadToPath ??= defaults.downloadToPath;

  const withDownloads = rows.map((row) => ({ ...row, image_count: 0, image_dir: '' }));
  const plan = buildDownloadPlan(withDownloads, output);
  const successCounts = new Array(withDownloads.length).fill(0);

  await fs.promises.mkdir(path.join(output, '.cache'), { recursive: true });

  await runWithConcurrency(plan, deps.concurrency, async (entry) => {
    let available = false;
    try {
      // Reuse a previously cached download when the file is already present.
      await fs.promises.access(entry.cachePath, fs.constants.F_OK);
      available = true;
    } catch {
      // Cache miss: download into the cache first.
      await fs.promises.mkdir(path.dirname(entry.cachePath), { recursive: true });
      available = await deps.downloadToPath(entry.url, entry.cachePath, cookies);
    }
    if (!available) {
      return;
    }
    await Promise.all(entry.copies.map(async (copy) => {
      await fs.promises.mkdir(path.dirname(copy.destPath), { recursive: true });
      await fs.promises.copyFile(entry.cachePath, copy.destPath);
      successCounts[copy.rowIndex] += 1;
    }));
  });

  const result = withDownloads.map((row, index) => {
    // Rows without an images array are valid input; do not dereference blindly.
    const hasImages = Array.isArray(row.images) && row.images.length > 0;
    // IDs may be numeric; path.join() only accepts string segments.
    const imageDir = hasImages
      ? path.join(output, String(row.course_id || 'course'), String(row.chapter_id || 'root'))
      : '';
    return {
      ...row,
      image_count: successCounts[index] ?? 0,
      image_dir: imageDir,
    };
  });
  return isBatch ? result : result[0];
}
/**
 * Downloads course images using the cookies of the given page.
 *
 * Reads the page's cookies for the `scys.com` domain, formats them with
 * `formatCookieHeader`, and delegates to `downloadScysCourseImagesInternal`
 * with the default dependencies.
 *
 * @param {object} page - Object exposing an async `getCookies({ domain })`.
 * @param {object|Array<object>} data - Row or rows to download images for.
 * @param {string} output - Root output directory.
 * @returns {Promise<object|Array<object>>} Result of
 *   `downloadScysCourseImagesInternal`.
 */
export async function downloadScysCourseImages(page, data, output) {
  const scysCookies = await page.getCookies({ domain: 'scys.com' });
  const cookieHeader = formatCookieHeader(scysCookies);
  return downloadScysCourseImagesInternal(data, output, cookieHeader);
}
0 commit comments