Skip to content

Commit fb477b7

Browse files
committed
feat(scys): migrate adapters onto latest upstream
1 parent 4a0b805 commit fb477b7

21 files changed

+3411
-316
lines changed

cli-manifest.json

Lines changed: 1005 additions & 316 deletions
Large diffs are not rendered by default.

clis/scys/activity.js

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import { cli, Strategy } from '@jackwener/opencli/registry';
2+
import { extractScysActivity } from './extractors.js';
3+
// Registers the `activity` command for the `scys` site. The handler forwards
// the positional URL and the `wait` option (seconds, default 3) to
// extractScysActivity.
cli({
  site: 'scys',
  name: 'activity',
  description: 'Extract SCYS activity landing page structure (tabs, stages, tasks)',
  domain: 'scys.com',
  strategy: Strategy.COOKIE,
  navigateBefore: false,
  args: [
    { name: 'url', required: true, positional: true, help: 'Activity landing URL: /activity/landing/:id' },
    { name: 'wait', type: 'int', default: 3, help: 'Seconds to wait after page load' },
  ],
  columns: ['title', 'subtitle', 'tabs', 'stages', 'url'],
  func: async (page, kwargs) => {
    return extractScysActivity(page, String(kwargs.url), {
      waitSeconds: Number(kwargs.wait ?? 3),
    });
  },
});

clis/scys/article.js

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import { cli, Strategy } from '@jackwener/opencli/registry';
2+
import { extractScysArticle } from './extractors.js';
3+
// Registers the `article` command for the `scys` site. The handler forwards
// the positional URL/topic id plus the `wait` (seconds, default 5) and
// `max-length` (default 4000) options to extractScysArticle.
cli({
  site: 'scys',
  name: 'article',
  description: 'Extract SCYS article detail page content and metadata',
  domain: 'scys.com',
  strategy: Strategy.COOKIE,
  navigateBefore: false,
  args: [
    { name: 'url', required: true, positional: true, help: 'Article URL or topic id: /articleDetail/<entityType>/<topicId>' },
    { name: 'wait', type: 'int', default: 5, help: 'Seconds to wait after page load' },
    { name: 'max-length', type: 'int', default: 4000, help: 'Max content length for long text fields' },
  ],
  columns: ['topic_id', 'entity_type', 'title', 'author', 'time', 'tags', 'flags', 'image_count', 'external_link_count', 'content', 'ai_summary', 'url'],
  func: async (page, kwargs) => {
    return extractScysArticle(page, String(kwargs.url), {
      waitSeconds: Number(kwargs.wait ?? 5),
      maxLength: Number(kwargs['max-length'] ?? 4000),
    });
  },
});

clis/scys/common.js

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import { ArgumentError } from '@jackwener/opencli/errors';
2+
const SCYS_ORIGIN = 'https://scys.com';

/**
 * Turn a user-supplied SCYS reference into an absolute URL string.
 *
 * Accepted forms:
 *  - absolute http(s) URL: returned unchanged;
 *  - root-relative path (`/opportunity`): prefixed with the SCYS origin;
 *  - host-prefixed input (`scys.com/...`): prefixed with `https://`;
 *  - anything else: treated as a path relative to the SCYS origin.
 *
 * @param {unknown} input - URL, path, or host-prefixed string.
 * @returns {string} Absolute URL string.
 * @throws {ArgumentError} When the input is empty after trimming.
 */
export function normalizeScysUrl(input) {
  const raw = String(input ?? '').trim();
  if (!raw) {
    throw new ArgumentError('SCYS URL is required');
  }
  if (/^https?:\/\//i.test(raw)) {
    return raw;
  }
  if (raw.startsWith('/')) {
    return `${SCYS_ORIGIN}${raw}`;
  }
  // Require "scys.com" to be the complete host (followed by a path, query,
  // fragment, port, or end of input). A bare prefix check would let
  // look-alike hosts such as "scys.com.evil.example" become the URL host.
  if (/^scys\.com(?=[/?#:]|$)/i.test(raw)) {
    return `https://${raw}`;
  }
  // raw cannot start with "/" here (handled above), so no slash stripping is needed.
  return `${SCYS_ORIGIN}/${raw}`;
}
19+
/**
 * Resolve a course reference to an absolute URL.
 *
 * A purely numeric input is treated as a course id and mapped to
 * `/course/detail/<id>`; anything else is passed to normalizeScysUrl.
 *
 * @param {unknown} input - Course id or course URL.
 * @returns {string} Absolute course URL.
 * @throws {ArgumentError} When the input is empty after trimming.
 */
export function toScysCourseUrl(input) {
  const raw = String(input ?? '').trim();
  if (!raw) {
    throw new ArgumentError('Course URL or course id is required');
  }
  if (/^\d+$/.test(raw)) {
    return `${SCYS_ORIGIN}/course/detail/${raw}`;
  }
  return normalizeScysUrl(raw);
}
28+
/**
 * Resolve an article reference to an absolute article-detail URL.
 *
 * An input of eight or more digits is treated as a topic id and mapped to
 * `/articleDetail/xq_topic/<id>`. Any other input is normalized and must have
 * a path of the form `/articleDetail/<entityType>/<topicId>`.
 *
 * @param {unknown} input - Topic id or article URL.
 * @returns {string} Absolute article URL.
 * @throws {ArgumentError} When the input is empty or is not an article-detail URL.
 */
export function toScysArticleUrl(input) {
  const raw = String(input ?? '').trim();
  if (!raw) {
    throw new ArgumentError('Article URL is required');
  }
  if (/^\d{8,}$/.test(raw)) {
    return `${SCYS_ORIGIN}/articleDetail/xq_topic/${raw}`;
  }
  const url = normalizeScysUrl(raw);
  const { pathname } = new URL(url);
  if (!/^\/articleDetail\/[^/]+\/[^/]+$/.test(pathname)) {
    throw new ArgumentError(`Unsupported SCYS article URL: ${input}`, 'Use /articleDetail/<entityType>/<topicId> or pass a numeric topic id');
  }
  return url;
}
43+
/**
 * Classify a SCYS URL by its path and query string.
 *
 * @param {unknown} input - URL or path accepted by normalizeScysUrl.
 * @returns {'course'|'opportunity'|'activity'|'article'|'feed'|'unknown'}
 *   `feed` is returned for `/personal/...?tab=posts` and for the home page
 *   with `?filter=essence`; `unknown` when no rule matches.
 */
export function detectScysPageType(input) {
  const url = new URL(normalizeScysUrl(input));
  const { pathname, searchParams } = url;
  if (pathname.startsWith('/course/detail/')) {
    return 'course';
  }
  if (pathname.startsWith('/opportunity')) {
    return 'opportunity';
  }
  if (pathname.startsWith('/activity/landing/')) {
    return 'activity';
  }
  if (/^\/articleDetail\/[^/]+\/[^/]+$/.test(pathname)) {
    return 'article';
  }
  if (pathname.startsWith('/personal/')) {
    const tab = (searchParams.get('tab') || '').toLowerCase();
    if (tab === 'posts') {
      return 'feed';
    }
  }
  if (pathname === '/' || pathname === '') {
    const filter = (searchParams.get('filter') || '').toLowerCase();
    if (filter === 'essence') {
      return 'feed';
    }
  }
  return 'unknown';
}
66+
/**
 * Extract the numeric course id from a course reference.
 *
 * @param {unknown} input - Course id or URL accepted by toScysCourseUrl.
 * @returns {string} The id from `/course/detail/<id>`, or '' when the path has none.
 */
export function extractScysCourseId(input) {
  const { pathname } = new URL(toScysCourseUrl(input));
  const match = pathname.match(/\/course\/detail\/(\d+)/);
  return match?.[1] ?? '';
}
71+
/**
 * Extract the entity type and topic id from an article reference.
 *
 * @param {unknown} input - Topic id or URL accepted by toScysArticleUrl.
 * @returns {{ entityType: string, topicId: string }} Path segments of
 *   `/articleDetail/<entityType>/<topicId>`; empty strings when the path does not match.
 */
export function extractScysArticleMeta(input) {
  const { pathname } = new URL(toScysArticleUrl(input));
  const match = pathname.match(/^\/articleDetail\/([^/]+)\/([^/]+)$/);
  return {
    entityType: match?.[1] ?? '',
    topicId: match?.[2] ?? '',
  };
}
79+
/**
 * Collapse every run of whitespace to a single space and trim the ends.
 *
 * @param {unknown} value - Any value; null/undefined become ''.
 * @returns {string} The normalized text.
 */
export function cleanText(value) {
  return String(value ?? '').replace(/\s+/g, ' ').trim();
}
82+
/**
 * Reduce an interaction label (e.g. likes/comments text) to its numeric parts.
 *
 * Each number may carry a decimal part and a trailing Chinese magnitude unit
 * (万 or 亿), which is kept with the number.
 *
 * @param {unknown} raw - Raw label text.
 * @returns {string} Space-joined numeric pieces; the cleaned text when it has
 *   no digits; '' for empty input.
 */
export function extractInteractions(raw) {
  const text = cleanText(raw);
  if (!text) {
    return '';
  }
  // The unit group must list both 万 and 亿; an empty alternative would drop
  // the 万 suffix from values such as "1.2万".
  const pieces = text.match(/[0-9]+(?:\.[0-9]+)?(?:万|亿)?/g);
  if (!pieces || pieces.length === 0) {
    return text;
  }
  return pieces.join(' ');
}
91+
/**
 * Resolve the URL to read for a given input; currently a direct alias of
 * normalizeScysUrl.
 *
 * @param {unknown} input - URL or path accepted by normalizeScysUrl.
 * @returns {string} Absolute URL string.
 */
export function inferScysReadUrl(input) {
  return normalizeScysUrl(input);
}
94+
/**
 * Build the home-page URL with the `essence` filter applied.
 *
 * @returns {string} `<origin>/?filter=essence`.
 */
export function buildScysHomeEssenceUrl() {
  return `${SCYS_ORIGIN}/?filter=essence`;
}
97+
/**
 * Build the URL of the opportunity page.
 *
 * @returns {string} `<origin>/opportunity`.
 */
export function buildScysOpportunityUrl() {
  return `${SCYS_ORIGIN}/opportunity`;
}

clis/scys/common.test.js

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import { describe, expect, it } from 'vitest';
2+
import { cleanText, detectScysPageType, extractScysArticleMeta, extractInteractions, normalizeScysUrl, toScysArticleUrl, toScysCourseUrl, } from './common.js';
3+
// URL normalization: host-prefixed and root-relative inputs.
describe('normalizeScysUrl', () => {
  it('normalizes bare domain and keeps path/query', () => {
    expect(normalizeScysUrl('scys.com/course/detail/142?chapterId=9445')).toBe('https://scys.com/course/detail/142?chapterId=9445');
  });
  it('normalizes root-relative paths', () => {
    expect(normalizeScysUrl('/opportunity')).toBe('https://scys.com/opportunity');
  });
});

// Course references: numeric ids and full URLs.
describe('toScysCourseUrl', () => {
  it('accepts numeric course id', () => {
    expect(toScysCourseUrl('92')).toBe('https://scys.com/course/detail/92');
  });
  it('keeps full course detail URL unchanged', () => {
    expect(toScysCourseUrl('https://scys.com/course/detail/142?chapterId=9445')).toBe('https://scys.com/course/detail/142?chapterId=9445');
  });
});

// Article references: numeric topic ids and full URLs.
describe('toScysArticleUrl', () => {
  it('accepts numeric topic id', () => {
    expect(toScysArticleUrl('55188458224514554')).toBe('https://scys.com/articleDetail/xq_topic/55188458224514554');
  });
  it('keeps full article detail url', () => {
    expect(toScysArticleUrl('https://scys.com/articleDetail/xq_topic/55188458224514554')).toBe('https://scys.com/articleDetail/xq_topic/55188458224514554');
  });
});

describe('extractScysArticleMeta', () => {
  it('extracts entity type and topic id from url', () => {
    expect(extractScysArticleMeta('https://scys.com/articleDetail/xq_topic/55188458224514554')).toEqual({
      entityType: 'xq_topic',
      topicId: '55188458224514554',
    });
  });
});

// One case per page type returned by detectScysPageType.
describe('detectScysPageType', () => {
  it('detects course detail with chapterId', () => {
    expect(detectScysPageType('https://scys.com/course/detail/142?chapterId=9445')).toBe('course');
  });
  it('detects course detail without chapterId', () => {
    expect(detectScysPageType('https://scys.com/course/detail/92')).toBe('course');
  });
  it('detects essence feed on homepage', () => {
    expect(detectScysPageType('https://scys.com/?filter=essence')).toBe('feed');
  });
  it('detects profile posts feed', () => {
    expect(detectScysPageType('https://scys.com/personal/421122582111848?number=18563&tab=posts')).toBe('feed');
  });
  it('detects opportunity page', () => {
    expect(detectScysPageType('https://scys.com/opportunity')).toBe('opportunity');
  });
  it('detects activity landing page', () => {
    expect(detectScysPageType('https://scys.com/activity/landing/5505?tabIndex=1')).toBe('activity');
  });
  it('detects article detail page', () => {
    expect(detectScysPageType('https://scys.com/articleDetail/xq_topic/55188458224514554')).toBe('article');
  });
  it('returns unknown for unsupported pages', () => {
    expect(detectScysPageType('https://scys.com/help')).toBe('unknown');
  });
});

describe('text helpers', () => {
  it('cleanText collapses whitespace', () => {
    expect(cleanText(' hello\n\nworld ')).toBe('hello world');
  });
  it('extractInteractions keeps compact numeric text', () => {
    expect(extractInteractions('赞 1.2万 评论 35')).toBe('1.2万 35');
  });
});

clis/scys/course-download.js

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
import * as fs from 'node:fs';
2+
import * as path from 'node:path';
3+
import { createHash } from 'node:crypto';
4+
import { formatCookieHeader, httpDownload } from '@jackwener/opencli/download';
5+
/**
 * Pick a safe, lowercase file extension for an image URL.
 *
 * @param {string} url - Image URL.
 * @returns {string} The URL path's extension when it is a dot followed by 1-5
 *   ASCII letters/digits; otherwise '.jpg' (also used for unparsable URLs).
 */
function sanitizeExtname(url) {
  try {
    const { pathname } = new URL(url);
    const ext = path.extname(pathname).toLowerCase();
    // A length check alone would accept a bare "." (from a path ending in a
    // dot) and percent-encoded junk; require plain alphanumerics instead.
    if (/^\.[a-z0-9]{1,5}$/.test(ext)) {
      return ext;
    }
  } catch {
    // Invalid URL: deliberately fall back to the default extension.
  }
  return '.jpg';
}
17+
/**
 * Compute the hex-encoded SHA-1 digest of a URL; used as the cache file name.
 *
 * @param {string} url - URL to hash.
 * @returns {string} 40-character hex digest.
 */
function hashUrl(url) {
  return createHash('sha1').update(url).digest('hex');
}
20+
/**
 * Group every image referenced by the rows into one plan entry per unique URL.
 *
 * Each entry records where the image is cached (`<output>/.cache/<sha1><ext>`)
 * and every destination it must be copied to
 * (`<output>/<courseId>/<chapterId>/<courseId>_<chapterId>_<n><ext>`).
 *
 * @param {Array<object>} rows - Rows with optional `course_id`, `chapter_id`, `images`.
 * @param {string} output - Output root directory.
 * @returns {Array<{ url: string, cachePath: string, copies: Array<{ rowIndex: number, destPath: string }> }>}
 */
function buildDownloadPlan(rows, output) {
  const cacheDir = path.join(output, '.cache');
  const byUrl = new Map();
  rows.forEach((row, rowIndex) => {
    // path.join only accepts strings, so numeric ids are coerced up front.
    const courseId = String(row.course_id || 'course');
    const chapterId = String(row.chapter_id || 'root');
    // Skip empty and non-string entries: they cannot be hashed or downloaded.
    const imageUrls = Array.isArray(row.images)
      ? row.images.filter((url) => typeof url === 'string' && url !== '')
      : [];
    imageUrls.forEach((url, imageIndex) => {
      const ext = sanitizeExtname(url);
      const destPath = path.join(output, courseId, chapterId, `${courseId}_${chapterId}_${imageIndex + 1}${ext}`);
      const existing = byUrl.get(url);
      if (existing) {
        existing.copies.push({ rowIndex, destPath });
        return;
      }
      byUrl.set(url, {
        url,
        cachePath: path.join(cacheDir, `${hashUrl(url)}${ext}`),
        copies: [{ rowIndex, destPath }],
      });
    });
  });
  return Array.from(byUrl.values());
}
45+
/**
 * Run `worker` over every item with at most `concurrency` calls in flight.
 *
 * Workers pull the next index from a shared cursor, so items are started in
 * order. The returned promise rejects as soon as any worker call rejects.
 *
 * @param {Array<unknown>} items - Items to process.
 * @param {number} concurrency - Maximum parallel workers; values below 1 or
 *   non-numeric values fall back to 1.
 * @param {(item: unknown) => Promise<unknown>} worker - Async handler per item.
 * @returns {Promise<void>}
 */
async function runWithConcurrency(items, concurrency, worker) {
  const requested = Math.floor(Number(concurrency));
  // NaN must not reach Array.from({ length }): it would create zero workers
  // and silently skip every item.
  const limit = Number.isNaN(requested) ? 1 : Math.max(1, requested);
  let cursor = 0;
  const consume = async () => {
    while (cursor < items.length) {
      const index = cursor;
      cursor += 1;
      await worker(items[index]);
    }
  };
  const workerCount = Math.min(limit, items.length);
  await Promise.all(Array.from({ length: workerCount }, () => consume()));
}
57+
/**
 * Build the default dependencies used by downloadScysCourseImagesInternal.
 *
 * @returns {{ concurrency: number, downloadToPath: (url: string, destPath: string, cookies: string) => Promise<boolean> }}
 *   `downloadToPath` calls httpDownload with a 60-second timeout and resolves
 *   to the result's `success` field.
 */
function createDefaultDeps() {
  return {
    concurrency: 8,
    downloadToPath: async (url, destPath, cookies) => {
      const result = await httpDownload(url, destPath, {
        cookies,
        timeout: 60_000,
      });
      return result.success;
    },
  };
}
69+
/**
 * Download every image referenced by the given course row(s) into `output`.
 *
 * Each unique URL is fetched once into `<output>/.cache` (an existing cache
 * file is reused) and then copied to each row's destination path.
 *
 * @param {object|Array<object>} data - One row or an array of rows.
 * @param {string} output - Output root directory.
 * @param {string} cookies - Cookie header value passed to the downloader.
 * @param {object} [overrides] - Replacements for the default deps
 *   (`concurrency`, `downloadToPath`).
 * @returns {Promise<object|Array<object>>} The row(s) with `image_count` set to
 *   the number of files copied and `image_dir` set to the row's directory
 *   ('' when the row lists no images). Shape mirrors `data`.
 */
export async function downloadScysCourseImagesInternal(data, output, cookies, overrides = {}) {
  const rows = Array.isArray(data) ? data : [data];
  const deps = { ...createDefaultDeps(), ...overrides };
  const withDownloads = rows.map((row) => ({ ...row, image_count: 0, image_dir: '' }));
  const plan = buildDownloadPlan(withDownloads, output);
  const successCounts = new Array(withDownloads.length).fill(0);
  await fs.promises.mkdir(path.join(output, '.cache'), { recursive: true });
  await runWithConcurrency(plan, deps.concurrency, async (entry) => {
    let available = false;
    try {
      await fs.promises.access(entry.cachePath, fs.constants.F_OK);
      available = true;
    } catch {
      // Not cached yet: download into the cache.
      await fs.promises.mkdir(path.dirname(entry.cachePath), { recursive: true });
      available = await deps.downloadToPath(entry.url, entry.cachePath, cookies);
    }
    if (!available) {
      return;
    }
    await Promise.all(entry.copies.map(async (copy) => {
      await fs.promises.mkdir(path.dirname(copy.destPath), { recursive: true });
      await fs.promises.copyFile(entry.cachePath, copy.destPath);
      successCounts[copy.rowIndex] += 1;
    }));
  });
  const result = withDownloads.map((row, index) => {
    // Rows may lack `images`; the plan builder tolerates that, so this must too.
    const hasImages = Array.isArray(row.images) && row.images.length > 0;
    return {
      ...row,
      image_count: successCounts[index] ?? 0,
      // path.join only accepts strings, so numeric ids are coerced.
      image_dir: hasImages
        ? path.join(output, String(row.course_id || 'course'), String(row.chapter_id || 'root'))
        : '',
    };
  });
  return Array.isArray(data) ? result : result[0];
}
101+
/**
 * Download course images using the cookies of the given page.
 *
 * Reads the page's cookies for domain `scys.com`, formats them with
 * formatCookieHeader, and delegates to downloadScysCourseImagesInternal.
 *
 * @param {object} page - Object exposing `getCookies({ domain })`.
 * @param {object|Array<object>} data - One row or an array of rows.
 * @param {string} output - Output root directory.
 * @returns {Promise<object|Array<object>>} Result of downloadScysCourseImagesInternal.
 */
export async function downloadScysCourseImages(page, data, output) {
  const pageCookies = await page.getCookies({ domain: 'scys.com' });
  const cookies = formatCookieHeader(pageCookies);
  return downloadScysCourseImagesInternal(data, output, cookies);
}

clis/scys/course-download.test.js

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
import * as fs from 'node:fs';
2+
import * as os from 'node:os';
3+
import * as path from 'node:path';
4+
import { describe, expect, it } from 'vitest';
5+
import { downloadScysCourseImagesInternal } from './course-download.js';
6+
/**
 * Build a course row fixture with default field values.
 *
 * @param {object} [overrides] - Fields that replace the defaults.
 * @returns {object} The fixture row.
 */
function makeRow(overrides) {
  return {
    course_title: 'Course',
    chapter_title: 'Chapter',
    breadcrumb: 'A > B > C',
    content: 'body',
    chapter_id: '1',
    course_id: '92',
    toc_summary: '',
    url: 'https://scys.com/course/detail/92?chapterId=1',
    raw_url: 'https://scys.com/course/detail/92?chapterId=1',
    updated_at_text: '',
    copyright_text: '',
    prev_chapter: '',
    next_chapter: '',
    participant_count: 0,
    discussion_hint: '',
    links: [],
    images: [],
    image_count: 0,
    content_images: [],
    content_image_count: 0,
    image_dir: '',
    ...overrides,
  };
}
32+
describe('downloadScysCourseImagesInternal', () => {
  it('deduplicates repeated image urls across chapters and copies cached files', async () => {
    const output = fs.mkdtempSync(path.join(os.tmpdir(), 'scys-course-download-'));
    try {
      const rows = [
        makeRow({ chapter_id: '4038', images: ['https://cdn.example.com/shared.png', 'https://cdn.example.com/unique-a.png'] }),
        makeRow({ chapter_id: '4039', images: ['https://cdn.example.com/shared.png'] }),
      ];
      const calls = [];
      const result = await downloadScysCourseImagesInternal(rows, output, 'cookie=a', {
        concurrency: 2,
        downloadToPath: async (url, destPath) => {
          calls.push(url);
          await fs.promises.mkdir(path.dirname(destPath), { recursive: true });
          await fs.promises.writeFile(destPath, `downloaded:${url}`);
          return true;
        },
      });
      // The shared URL must be fetched only once.
      expect(calls).toEqual([
        'https://cdn.example.com/shared.png',
        'https://cdn.example.com/unique-a.png',
      ]);
      expect(result[0]?.image_count).toBe(2);
      expect(result[1]?.image_count).toBe(1);
      expect(fs.existsSync(path.join(output, '92', '4038', '92_4038_1.png'))).toBe(true);
      expect(fs.existsSync(path.join(output, '92', '4038', '92_4038_2.png'))).toBe(true);
      expect(fs.existsSync(path.join(output, '92', '4039', '92_4039_1.png'))).toBe(true);
    } finally {
      // Remove the temp directory so repeated runs do not accumulate files.
      fs.rmSync(output, { recursive: true, force: true });
    }
  });
  it('downloads unique image urls concurrently instead of one-by-one', async () => {
    const output = fs.mkdtempSync(path.join(os.tmpdir(), 'scys-course-download-'));
    try {
      const rows = [
        makeRow({ chapter_id: '4038', images: ['https://cdn.example.com/a.png', 'https://cdn.example.com/b.png'] }),
        makeRow({ chapter_id: '4039', images: ['https://cdn.example.com/c.png', 'https://cdn.example.com/d.png'] }),
      ];
      let active = 0;
      let maxActive = 0;
      await downloadScysCourseImagesInternal(rows, output, 'cookie=a', {
        concurrency: 3,
        downloadToPath: async (_url, destPath) => {
          active += 1;
          maxActive = Math.max(maxActive, active);
          await new Promise((resolve) => setTimeout(resolve, 30));
          await fs.promises.mkdir(path.dirname(destPath), { recursive: true });
          await fs.promises.writeFile(destPath, 'x');
          active -= 1;
          return true;
        },
      });
      expect(maxActive).toBeGreaterThan(1);
    } finally {
      // Remove the temp directory so repeated runs do not accumulate files.
      fs.rmSync(output, { recursive: true, force: true });
    }
  });
});

0 commit comments

Comments
 (0)