Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,321 changes: 1,005 additions & 316 deletions cli-manifest.json

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions clis/scys/activity.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import { cli, Strategy } from '@jackwener/opencli/registry';
import { extractScysActivity } from './extractors.js';
// Registers the `scys activity` command. The handler forwards the positional
// URL (as a string) and the `wait` option (as a number of seconds, default 3)
// to extractScysActivity.
cli({
  site: 'scys',
  name: 'activity',
  description: 'Extract SCYS activity landing page structure (tabs, stages, tasks)',
  domain: 'scys.com',
  strategy: Strategy.COOKIE,
  navigateBefore: false,
  args: [
    {
      name: 'url',
      required: true,
      positional: true,
      help: 'Activity landing URL: /activity/landing/:id',
    },
    {
      name: 'wait',
      type: 'int',
      default: 3,
      help: 'Seconds to wait after page load',
    },
  ],
  columns: ['title', 'subtitle', 'tabs', 'stages', 'url'],
  func: async (page, { url, wait }) => {
    const target = String(url);
    const waitSeconds = Number(wait ?? 3);
    return extractScysActivity(page, target, { waitSeconds });
  },
});
22 changes: 22 additions & 0 deletions clis/scys/article.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import { cli, Strategy } from '@jackwener/opencli/registry';
import { extractScysArticle } from './extractors.js';
// Registers the `scys article` command. The handler forwards the positional
// URL (as a string) plus the numeric `wait` (default 5) and `max-length`
// (default 4000) options to extractScysArticle.
cli({
  site: 'scys',
  name: 'article',
  description: 'Extract SCYS article detail page content and metadata',
  domain: 'scys.com',
  strategy: Strategy.COOKIE,
  navigateBefore: false,
  args: [
    {
      name: 'url',
      required: true,
      positional: true,
      help: 'Article URL or topic id: /articleDetail/<entityType>/<topicId>',
    },
    {
      name: 'wait',
      type: 'int',
      default: 5,
      help: 'Seconds to wait after page load',
    },
    {
      name: 'max-length',
      type: 'int',
      default: 4000,
      help: 'Max content length for long text fields',
    },
  ],
  columns: [
    'topic_id',
    'entity_type',
    'title',
    'author',
    'time',
    'tags',
    'flags',
    'image_count',
    'external_link_count',
    'content',
    'ai_summary',
    'url',
  ],
  func: async (page, { url, wait, 'max-length': maxLength }) => {
    const target = String(url);
    const options = {
      waitSeconds: Number(wait ?? 5),
      maxLength: Number(maxLength ?? 4000),
    };
    return extractScysArticle(page, target, options);
  },
});
99 changes: 99 additions & 0 deletions clis/scys/common.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import { ArgumentError } from '@jackwener/opencli/errors';
const SCYS_ORIGIN = 'https://scys.com';
/**
 * Turns a user-supplied SCYS reference into an absolute URL string.
 *
 * Accepted shapes:
 *  - absolute http(s) URLs, returned unchanged;
 *  - protocol-relative URLs (`//host/path`), which get an `https:` scheme;
 *  - root-relative paths (`/opportunity`), resolved against the SCYS origin;
 *  - a bare `scys.com` host, optionally followed by a port, path, query or
 *    fragment, which gets an `https://` scheme;
 *  - anything else, treated as a path relative to the SCYS origin.
 *
 * @param {unknown} input - URL, path, or bare host; coerced to a trimmed string.
 * @returns {string} Absolute URL string.
 * @throws {ArgumentError} When the input is empty after trimming.
 */
export function normalizeScysUrl(input) {
  const raw = String(input ?? '').trim();
  if (!raw) {
    throw new ArgumentError('SCYS URL is required');
  }
  if (/^https?:\/\//i.test(raw)) {
    return raw;
  }
  // Protocol-relative URLs already carry a host; only the scheme is missing.
  if (raw.startsWith('//')) {
    return `https:${raw}`;
  }
  if (raw.startsWith('/')) {
    return `${SCYS_ORIGIN}${raw}`;
  }
  // Require a host boundary after "scys.com" so look-alike prefixes such as
  // "scys.community" or "scys.com.evil.com" are not mistaken for the SCYS host.
  if (/^scys\.com(?=$|[\/?#:])/i.test(raw)) {
    return `https://${raw}`;
  }
  return `${SCYS_ORIGIN}/${raw}`;
}
/**
 * Resolves a course reference to an absolute course URL.
 *
 * A string made only of digits is treated as a course id and expanded to
 * `/course/detail/<id>`; every other value goes through normalizeScysUrl.
 *
 * @param {unknown} input - Course id or URL/path; coerced to a trimmed string.
 * @returns {string} Absolute URL string.
 * @throws {ArgumentError} When the input is empty after trimming.
 */
export function toScysCourseUrl(input) {
  const candidate = String(input ?? '').trim();
  if (candidate.length === 0) {
    throw new ArgumentError('Course URL or course id is required');
  }
  const isBareId = /^\d+$/.test(candidate);
  return isBareId
    ? `${SCYS_ORIGIN}/course/detail/${candidate}`
    : normalizeScysUrl(candidate);
}
/**
 * Resolves an article reference to an absolute article-detail URL.
 *
 * A string of eight or more digits is treated as a topic id and expanded to
 * `/articleDetail/xq_topic/<id>`. Any other value is normalized and must have
 * a pathname of the form `/articleDetail/<entityType>/<topicId>`.
 *
 * @param {unknown} input - Topic id or article URL/path; coerced to a trimmed string.
 * @returns {string} Absolute URL string.
 * @throws {ArgumentError} When the input is empty, cannot be parsed as a URL,
 *   or does not point at an article detail path.
 */
export function toScysArticleUrl(input) {
  const raw = String(input ?? '').trim();
  if (!raw) {
    throw new ArgumentError('Article URL is required');
  }
  if (/^\d{8,}$/.test(raw)) {
    return `${SCYS_ORIGIN}/articleDetail/xq_topic/${raw}`;
  }
  const url = normalizeScysUrl(raw);
  const unsupported = () => new ArgumentError(
    `Unsupported SCYS article URL: ${input}`,
    'Use /articleDetail/<entityType>/<topicId> or pass a numeric topic id',
  );
  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // `new URL` throws a TypeError on malformed input (e.g. "https://");
    // report it as the same argument error as any other unsupported URL.
    throw unsupported();
  }
  if (!/^\/articleDetail\/[^/]+\/[^/]+$/.test(pathname)) {
    throw unsupported();
  }
  return url;
}
/**
 * Classifies a SCYS URL by its pathname and, for feeds, its query string.
 *
 * @param {unknown} input - URL, path, or bare host accepted by normalizeScysUrl.
 * @returns {'course'|'opportunity'|'activity'|'article'|'feed'|'unknown'}
 *   The detected page type; 'unknown' when no rule matches.
 */
export function detectScysPageType(input) {
  const { pathname, searchParams } = new URL(normalizeScysUrl(input));
  const queryValue = (key) => (searchParams.get(key) || '').toLowerCase();

  // Prefix rules are checked in order; the first match wins.
  const prefixRules = [
    ['/course/detail/', 'course'],
    ['/opportunity', 'opportunity'],
    ['/activity/landing/', 'activity'],
  ];
  for (const [prefix, kind] of prefixRules) {
    if (pathname.startsWith(prefix)) {
      return kind;
    }
  }

  if (/^\/articleDetail\/[^/]+\/[^/]+$/.test(pathname)) {
    return 'article';
  }

  // Feeds are either a profile's "posts" tab or the home page "essence" filter.
  const isProfilePosts = pathname.startsWith('/personal/') && queryValue('tab') === 'posts';
  if (isProfilePosts) {
    return 'feed';
  }
  const isHomePage = pathname === '/' || pathname === '';
  if (isHomePage && queryValue('filter') === 'essence') {
    return 'feed';
  }
  return 'unknown';
}
/**
 * Pulls the numeric course id out of a course reference.
 *
 * @param {unknown} input - Course id or URL accepted by toScysCourseUrl.
 * @returns {string} The digits following `/course/detail/`, or '' when the
 *   resolved pathname contains no such segment.
 */
export function extractScysCourseId(input) {
  const { pathname } = new URL(toScysCourseUrl(input));
  const found = /\/course\/detail\/(\d+)/.exec(pathname);
  return found === null ? '' : found[1];
}
/**
 * Splits an article reference into its entity type and topic id.
 *
 * @param {unknown} input - Topic id or article URL accepted by toScysArticleUrl.
 * @returns {{ entityType: string, topicId: string }} The two path segments
 *   after `/articleDetail/`; each is '' when the pathname does not match.
 */
export function extractScysArticleMeta(input) {
  const { pathname } = new URL(toScysArticleUrl(input));
  const segments = /^\/articleDetail\/([^/]+)\/([^/]+)$/.exec(pathname) ?? [];
  const [, entityType = '', topicId = ''] = segments;
  return { entityType, topicId };
}
/**
 * Collapses every run of whitespace into a single space and strips leading
 * and trailing whitespace.
 *
 * @param {unknown} value - Any value; null/undefined become ''.
 * @returns {string} The normalized text.
 */
export function cleanText(value) {
  const text = String(value ?? '');
  // Splitting on whitespace runs and dropping empty edge pieces trims and
  // collapses in one pass.
  return text.split(/\s+/).filter(Boolean).join(' ');
}
/**
 * Reduces an interaction label to its numeric parts.
 *
 * Each number may have a decimal part and an optional 万/亿 suffix; matches
 * are joined with single spaces. Text without any number is returned in its
 * whitespace-normalized form.
 *
 * @param {unknown} raw - Raw label text.
 * @returns {string} Space-joined numbers, the cleaned text, or '' for empty input.
 */
export function extractInteractions(raw) {
  const normalized = cleanText(raw);
  if (normalized === '') {
    return '';
  }
  const numbers = normalized.match(/[0-9]+(?:\.[0-9]+)?(?:万|亿)?/g) ?? [];
  return numbers.length > 0 ? numbers.join(' ') : normalized;
}
/**
 * Resolves the URL to read for a given input; currently identical to
 * normalizeScysUrl.
 *
 * @param {unknown} input - URL, path, or bare host.
 * @returns {string} Absolute URL string.
 */
export function inferScysReadUrl(input) {
  const readUrl = normalizeScysUrl(input);
  return readUrl;
}
/**
 * Builds the home page URL with the `filter=essence` query applied.
 *
 * @returns {string} Absolute URL string.
 */
export function buildScysHomeEssenceUrl() {
  const query = 'filter=essence';
  return `${SCYS_ORIGIN}/?${query}`;
}
/**
 * Builds the URL of the opportunity page.
 *
 * @returns {string} Absolute URL string.
 */
export function buildScysOpportunityUrl() {
  const pagePath = '/opportunity';
  return `${SCYS_ORIGIN}${pagePath}`;
}
68 changes: 68 additions & 0 deletions clis/scys/common.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import { describe, expect, it } from 'vitest';
import { cleanText, detectScysPageType, extractScysArticleMeta, extractInteractions, normalizeScysUrl, toScysArticleUrl, toScysCourseUrl, } from './common.js';
// URL normalization: bare-host and root-relative inputs become absolute URLs.
describe('normalizeScysUrl', () => {
it('normalizes bare domain and keeps path/query', () => {
expect(normalizeScysUrl('scys.com/course/detail/142?chapterId=9445')).toBe('https://scys.com/course/detail/142?chapterId=9445');
});
it('normalizes root-relative paths', () => {
expect(normalizeScysUrl('/opportunity')).toBe('https://scys.com/opportunity');
});
});
// Course references: digit-only ids expand to /course/detail/<id>; full URLs pass through.
describe('toScysCourseUrl', () => {
it('accepts numeric course id', () => {
expect(toScysCourseUrl('92')).toBe('https://scys.com/course/detail/92');
});
it('keeps full course detail URL unchanged', () => {
expect(toScysCourseUrl('https://scys.com/course/detail/142?chapterId=9445')).toBe('https://scys.com/course/detail/142?chapterId=9445');
});
});
// Article references: long numeric ids expand to /articleDetail/xq_topic/<id>.
describe('toScysArticleUrl', () => {
it('accepts numeric topic id', () => {
expect(toScysArticleUrl('55188458224514554')).toBe('https://scys.com/articleDetail/xq_topic/55188458224514554');
});
it('keeps full article detail url', () => {
expect(toScysArticleUrl('https://scys.com/articleDetail/xq_topic/55188458224514554')).toBe('https://scys.com/articleDetail/xq_topic/55188458224514554');
});
});
// Article metadata: the two path segments after /articleDetail/ are returned.
describe('extractScysArticleMeta', () => {
it('extracts entity type and topic id from url', () => {
expect(extractScysArticleMeta('https://scys.com/articleDetail/xq_topic/55188458224514554')).toEqual({
entityType: 'xq_topic',
topicId: '55188458224514554',
});
});
});
// Page-type detection: one case per supported page type plus the 'unknown' fallback.
describe('detectScysPageType', () => {
it('detects course detail with chapterId', () => {
expect(detectScysPageType('https://scys.com/course/detail/142?chapterId=9445')).toBe('course');
});
it('detects course detail without chapterId', () => {
expect(detectScysPageType('https://scys.com/course/detail/92')).toBe('course');
});
it('detects essence feed on homepage', () => {
expect(detectScysPageType('https://scys.com/?filter=essence')).toBe('feed');
});
it('detects profile posts feed', () => {
expect(detectScysPageType('https://scys.com/personal/421122582111848?number=18563&tab=posts')).toBe('feed');
});
it('detects opportunity page', () => {
expect(detectScysPageType('https://scys.com/opportunity')).toBe('opportunity');
});
it('detects activity landing page', () => {
expect(detectScysPageType('https://scys.com/activity/landing/5505?tabIndex=1')).toBe('activity');
});
it('detects article detail page', () => {
expect(detectScysPageType('https://scys.com/articleDetail/xq_topic/55188458224514554')).toBe('article');
});
it('returns unknown for unsupported pages', () => {
expect(detectScysPageType('https://scys.com/help')).toBe('unknown');
});
});
// Text helpers: whitespace collapsing and numeric extraction (including the 万 suffix).
describe('text helpers', () => {
it('cleanText collapses whitespace', () => {
expect(cleanText(' hello\n\nworld ')).toBe('hello world');
});
it('extractInteractions keeps compact numeric text', () => {
expect(extractInteractions('赞 1.2万 评论 35')).toBe('1.2万 35');
});
});
104 changes: 104 additions & 0 deletions clis/scys/course-download.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import * as fs from 'node:fs';
import * as path from 'node:path';
import { createHash } from 'node:crypto';
import { formatCookieHeader, httpDownload } from '@jackwener/opencli/download';
/**
 * Derives a lowercase file extension from a URL's pathname.
 *
 * @param {string} url - Image URL.
 * @returns {string} The extension including its leading dot when it is
 *   non-empty and at most six characters long; otherwise '.jpg' (also used
 *   when the URL cannot be parsed).
 */
function sanitizeExtname(url) {
  const fallback = '.jpg';
  let pathname;
  try {
    pathname = new URL(url).pathname || '';
  } catch {
    // Unparseable URL: use the default extension.
    return fallback;
  }
  const extension = path.extname(pathname).toLowerCase();
  const isUsable = extension.length > 0 && extension.length <= 6;
  return isUsable ? extension : fallback;
}
/**
 * Computes the hex-encoded SHA-1 digest of a URL string.
 *
 * @param {string} url - Value to hash.
 * @returns {string} 40-character lowercase hex digest.
 */
function hashUrl(url) {
  const hasher = createHash('sha1');
  hasher.update(url);
  return hasher.digest('hex');
}
/**
 * Groups every image referenced by the rows into one plan entry per unique URL.
 *
 * Each entry records the URL, the cache file it is stored under
 * (`<output>/.cache/<sha1><ext>`), and every destination it must be copied to
 * (`<output>/<courseId>/<chapterId>/<courseId>_<chapterId>_<n><ext>`, with `n`
 * starting at 1 within each row).
 *
 * @param {Array<object>} rows - Rows with `course_id`, `chapter_id` and `images`.
 * @param {string} output - Root output directory.
 * @returns {Array<{ url: string, cachePath: string, copies: Array<{ rowIndex: number, destPath: string }> }>}
 *   Plan entries in first-seen URL order.
 */
function buildDownloadPlan(rows, output) {
  const cacheDir = path.join(output, '.cache');
  const planByUrl = new Map();
  rows.forEach((row, rowIndex) => {
    const courseId = row.course_id || 'course';
    const chapterId = row.chapter_id || 'root';
    // Rows without an images array contribute nothing; falsy entries are dropped.
    const urls = Array.isArray(row.images) ? row.images.filter(Boolean) : [];
    for (const [position, url] of urls.entries()) {
      const ext = sanitizeExtname(url);
      const fileName = `${courseId}_${chapterId}_${position + 1}${ext}`;
      const copy = {
        rowIndex,
        destPath: path.join(output, courseId, chapterId, fileName),
      };
      const known = planByUrl.get(url);
      if (known) {
        // Already planned for download: only add another copy target.
        known.copies.push(copy);
        continue;
      }
      planByUrl.set(url, {
        url,
        cachePath: path.join(cacheDir, `${hashUrl(url)}${ext}`),
        copies: [copy],
      });
    }
  });
  return [...planByUrl.values()];
}
/**
 * Runs `worker` over every item with at most `concurrency` calls in flight.
 *
 * Items are claimed from a shared cursor, so each one is processed exactly
 * once. The returned promise rejects as soon as any worker call rejects.
 *
 * @param {Array<*>} items - Items to process.
 * @param {number} concurrency - Maximum parallel workers. Values below 1, and
 *   values that are not a number (undefined, NaN), fall back to 1.
 * @param {(item: *) => Promise<*>} worker - Async handler invoked per item.
 * @returns {Promise<void>} Resolves once every item has been processed.
 */
async function runWithConcurrency(items, concurrency, worker) {
  const requested = Math.floor(Number(concurrency));
  // Math.max(1, NaN) is NaN, which would start zero workers and silently skip
  // every item, so a non-numeric limit is treated as 1.
  const limit = Number.isNaN(requested) ? 1 : Math.max(1, requested);
  const workerCount = Math.min(limit, items.length);
  let cursor = 0;
  const consume = async () => {
    while (cursor < items.length) {
      // Claim the index synchronously before awaiting so no two consumers
      // pick the same item.
      const index = cursor;
      cursor += 1;
      await worker(items[index]);
    }
  };
  await Promise.all(Array.from({ length: workerCount }, () => consume()));
}
/**
 * Builds the default dependencies used for image downloads.
 *
 * @returns {{ concurrency: number, downloadToPath: (url: string, destPath: string, cookies: string) => Promise<*> }}
 *   A concurrency of 8 and a downloader that calls httpDownload with a
 *   60-second timeout and resolves to the `success` field of its result.
 */
function createDefaultDeps() {
  const downloadToPath = async (url, destPath, cookies) => {
    const options = { cookies, timeout: 60_000 };
    const { success } = await httpDownload(url, destPath, options);
    return success;
  };
  return { concurrency: 8, downloadToPath };
}
/**
 * Downloads every image referenced by one or more course rows into `output`.
 *
 * Each unique URL is fetched at most once into `<output>/.cache` (an existing
 * cache file is reused without downloading) and then copied to every
 * per-chapter destination that references it.
 *
 * @param {object|Array<object>} data - A single row or an array of rows.
 * @param {string} output - Root output directory.
 * @param {string} cookies - Cookie header value handed to the downloader.
 * @param {{ concurrency?: number, downloadToPath?: Function }} [overrides]
 *   Replacements for the default dependencies.
 * @returns {Promise<object|Array<object>>} Shallow copies of the input rows
 *   with `image_count` (files copied for that row) and `image_dir` (the
 *   chapter directory, or '' when the row lists no images). The result has the
 *   same shape as `data`: an array in, an array out.
 */
export async function downloadScysCourseImagesInternal(data, output, cookies, overrides = {}) {
  const rows = Array.isArray(data) ? data : [data];
  const deps = { ...createDefaultDeps(), ...overrides };
  // Shallow copies, so the caller's row objects are not modified.
  const withDownloads = rows.map((row) => ({ ...row, image_count: 0, image_dir: '' }));
  const plan = buildDownloadPlan(withDownloads, output);
  const successCounts = new Array(withDownloads.length).fill(0);
  await fs.promises.mkdir(path.join(output, '.cache'), { recursive: true });
  await runWithConcurrency(plan, deps.concurrency, async (entry) => {
    let available = false;
    try {
      // A cache hit means the file was fetched earlier; skip the download.
      await fs.promises.access(entry.cachePath, fs.constants.F_OK);
      available = true;
    } catch {
      await fs.promises.mkdir(path.dirname(entry.cachePath), { recursive: true });
      available = await deps.downloadToPath(entry.url, entry.cachePath, cookies);
    }
    if (!available) {
      return;
    }
    await Promise.all(entry.copies.map(async (copy) => {
      await fs.promises.mkdir(path.dirname(copy.destPath), { recursive: true });
      await fs.promises.copyFile(entry.cachePath, copy.destPath);
      successCounts[copy.rowIndex] += 1;
    }));
  });
  const result = withDownloads.map((row, index) => {
    // Rows may lack an images array (the plan builder tolerates that too), so
    // check the type before reading `.length` instead of throwing here.
    const hasImages = Array.isArray(row.images) && row.images.length > 0;
    return {
      ...row,
      image_count: successCounts[index] ?? 0,
      image_dir: hasImages ? path.join(output, row.course_id || 'course', row.chapter_id || 'root') : '',
    };
  });
  return Array.isArray(data) ? result : result[0];
}
/**
 * Downloads course images using the cookies the page holds for scys.com.
 *
 * @param {object} page - Object exposing `getCookies({ domain })`.
 * @param {object|Array<object>} data - A single row or an array of rows.
 * @param {string} output - Root output directory.
 * @returns {Promise<object|Array<object>>} Result of downloadScysCourseImagesInternal.
 */
export async function downloadScysCourseImages(page, data, output) {
  const scysCookies = await page.getCookies({ domain: 'scys.com' });
  const cookieHeader = formatCookieHeader(scysCookies);
  return downloadScysCourseImagesInternal(data, output, cookieHeader);
}
81 changes: 81 additions & 0 deletions clis/scys/course-download.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import * as fs from 'node:fs';
import * as os from 'node:os';
import * as path from 'node:path';
import { describe, expect, it } from 'vitest';
import { downloadScysCourseImagesInternal } from './course-download.js';
/**
 * Builds a course row fixture with default values for every field.
 *
 * @param {object} [overrides] - Fields replacing the defaults.
 * @returns {object} A new row object; array fields are fresh on every call.
 */
function makeRow(overrides) {
  const chapterUrl = 'https://scys.com/course/detail/92?chapterId=1';
  const defaults = {
    course_title: 'Course',
    chapter_title: 'Chapter',
    breadcrumb: 'A > B > C',
    content: 'body',
    chapter_id: '1',
    course_id: '92',
    toc_summary: '',
    url: chapterUrl,
    raw_url: chapterUrl,
    updated_at_text: '',
    copyright_text: '',
    prev_chapter: '',
    next_chapter: '',
    participant_count: 0,
    discussion_hint: '',
    links: [],
    images: [],
    image_count: 0,
    content_images: [],
    content_image_count: 0,
    image_dir: '',
  };
  return { ...defaults, ...overrides };
}
// Exercises downloadScysCourseImagesInternal with an injected downloader so
// no network access happens. Each test writes into its own temp directory and
// removes it afterwards.
describe('downloadScysCourseImagesInternal', () => {
  it('deduplicates repeated image urls across chapters and copies cached files', async () => {
    const output = fs.mkdtempSync(path.join(os.tmpdir(), 'scys-course-download-'));
    try {
      const rows = [
        makeRow({ chapter_id: '4038', images: ['https://cdn.example.com/shared.png', 'https://cdn.example.com/unique-a.png'] }),
        makeRow({ chapter_id: '4039', images: ['https://cdn.example.com/shared.png'] }),
      ];
      const calls = [];
      const result = await downloadScysCourseImagesInternal(rows, output, 'cookie=a', {
        concurrency: 2,
        downloadToPath: async (url, destPath) => {
          calls.push(url);
          await fs.promises.mkdir(path.dirname(destPath), { recursive: true });
          await fs.promises.writeFile(destPath, `downloaded:${url}`);
          return true;
        },
      });
      // Two workers run in parallel, so the order in which they reach the
      // downloader is not fixed; compare the sorted list. The length check
      // implied by toEqual still proves the shared URL was fetched only once.
      expect([...calls].sort()).toEqual([
        'https://cdn.example.com/shared.png',
        'https://cdn.example.com/unique-a.png',
      ]);
      expect(result[0]?.image_count).toBe(2);
      expect(result[1]?.image_count).toBe(1);
      expect(fs.existsSync(path.join(output, '92', '4038', '92_4038_1.png'))).toBe(true);
      expect(fs.existsSync(path.join(output, '92', '4038', '92_4038_2.png'))).toBe(true);
      expect(fs.existsSync(path.join(output, '92', '4039', '92_4039_1.png'))).toBe(true);
    } finally {
      // Remove the temp directory even when an assertion fails.
      fs.rmSync(output, { recursive: true, force: true });
    }
  });
  it('downloads unique image urls concurrently instead of one-by-one', async () => {
    const output = fs.mkdtempSync(path.join(os.tmpdir(), 'scys-course-download-'));
    try {
      const rows = [
        makeRow({ chapter_id: '4038', images: ['https://cdn.example.com/a.png', 'https://cdn.example.com/b.png'] }),
        makeRow({ chapter_id: '4039', images: ['https://cdn.example.com/c.png', 'https://cdn.example.com/d.png'] }),
      ];
      let active = 0;
      let maxActive = 0;
      await downloadScysCourseImagesInternal(rows, output, 'cookie=a', {
        concurrency: 3,
        downloadToPath: async (_url, destPath) => {
          active += 1;
          maxActive = Math.max(maxActive, active);
          // Hold the slot briefly so overlapping downloads are observable.
          await new Promise((resolve) => setTimeout(resolve, 30));
          await fs.promises.mkdir(path.dirname(destPath), { recursive: true });
          await fs.promises.writeFile(destPath, 'x');
          active -= 1;
          return true;
        },
      });
      expect(maxActive).toBeGreaterThan(1);
    } finally {
      fs.rmSync(output, { recursive: true, force: true });
    }
  });
});
Loading