-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcleanup.js
More file actions
91 lines (76 loc) · 2.94 KB
/
cleanup.js
File metadata and controls
91 lines (76 loc) · 2.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import {JSDOM} from "jsdom";
import {Readability} from "@mozilla/readability";
import {Article} from "./Article.js";
import {generalClean} from "./SpecificCleanup/generalClean.js";
import {cleanAfterArray, individualClean} from "./SpecificCleanup/pageSpecificClean.js";
import DOMPurify from "dompurify";
/**
 * Race a promise against a timer, rejecting if the timer fires first.
 *
 * The timer is cleared as soon as the race settles, so an operation that
 * finishes quickly does not leave a pending timeout behind (which would keep
 * the Node.js event loop alive until it fired). The wrapped operation itself
 * is not cancelled on timeout; only the returned promise rejects.
 *
 * @param {Promise<*>} promise - Operation to wait for.
 * @param {number} timeoutMs - Milliseconds to wait before rejecting.
 * @param {string} errorMsg - Message of the Error used for the timeout rejection.
 * @returns {Promise<*>} Settles like `promise`, or rejects with `new Error(errorMsg)`
 *   if `timeoutMs` elapses first.
 */
function withTimeout(promise, timeoutMs, errorMsg) {
  let timer;
  const timeout = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error(errorMsg)), timeoutMs);
  });
  // Whichever side wins, release the timer so it cannot outlive the race.
  return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
}
/**
 * Remove every `<style>...</style>` element from an HTML string.
 *
 * This is done before JSDOM parsing to speed it up dramatically: CSS is not
 * needed for article text extraction and can be 300KB+ on some sites.
 *
 * The tag name must be followed by whitespace, `/` or `>`, so elements whose
 * names merely start with "style" (e.g. `<styled-box>`) are left untouched,
 * and closing tags with whitespace before `>` (`</style >`) are recognised.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} The markup with all style elements removed.
 */
function stripStyleTags(html) {
  return html.replace(/<style(?:[\s\/][^>]*)?>[\s\S]*?<\/style(?:[\s\/][^>]*)?>/gi, '');
}
/**
 * Run the Readability extraction for a URL and return only the article content.
 *
 * @param {string} url - Forwarded to `get_readability_article`, which fetches
 *   the page because no HTML is supplied here.
 * @returns {Promise<*>} The `content` property of the parsed article.
 */
export async function basic_readability_cleanup(url) {
  const { content } = await get_readability_article(url);
  return content;
}
/**
 * Extract an article, apply the general and page-specific cleanups, and
 * sanitize the resulting HTML with DOMPurify.
 *
 * The whole pipeline is raced against an 8-second timeout.
 *
 * @param {string} url - Page URL; also passed to `individualClean` together
 *   with `cleanAfterArray`.
 * @param {?string} [htmlContent=null] - When truthy, this HTML is parsed via
 *   `get_readability_article` instead of loading the page through `Article(url)`.
 * @returns {Promise<string>} The cleaned, sanitized content.
 * @throws {Error} 'Processing timeout - HTML too complex' if 8 seconds elapse
 *   first; errors from the individual steps propagate unchanged.
 */
export async function advanced_readability_cleanup(url, htmlContent = null) {
  const runPipeline = async () => {
    // Parse the supplied HTML when there is any; otherwise load the page by URL.
    const article = htmlContent
      ? await get_readability_article(url, htmlContent)
      : await Article(url);

    const generallyCleaned = generalClean(article.content);
    const siteCleaned = individualClean(generallyCleaned, url, cleanAfterArray);

    // DOMPurify needs a DOM window to work with; an empty JSDOM document provides one.
    const { window } = new JSDOM('');
    return DOMPurify(window).sanitize(siteCleaned);
  };

  return withTimeout(runPipeline(), 8000, 'Processing timeout - HTML too complex');
}
/**
 * Build a DOM from a page's HTML and extract its article with Readability.
 *
 * The HTML is taken from `htmlContent` when provided, otherwise fetched from
 * `url`. The whole operation is raced against an 8-second timeout.
 *
 * NOTE(review): the JSDOM construction and the Readability parse are invoked
 * without `await`, so the timeout can presumably only fire while the fetch is
 * pending - confirm whether that is the intended protection.
 *
 * @param {string} url - Page to fetch when no HTML is supplied.
 * @param {?string} [htmlContent=null] - Pre-fetched HTML; when truthy, no
 *   network request is made.
 * @returns {Promise<object>} The non-null result of `Readability#parse()`.
 * @throws {Error} If the HTTP response status is not OK, if Readability
 *   returns null, or 'Readability parsing timeout - HTML too complex' if
 *   8 seconds elapse first.
 */
export async function get_readability_article(url, htmlContent = null) {
  const parseArticle = async () => {
    let html = htmlContent;
    if (!html) {
      // Fetch the HTML content of the provided URL
      const response = await fetch(url);
      // Without this check an error page (404, 500, ...) would be parsed and
      // returned as if it were the article.
      if (!response.ok) {
        throw new Error(`Failed to fetch ${url}: HTTP ${response.status}`);
      }
      html = await response.text();
    }

    // Strip <style> tags before JSDOM parsing - gives 100x+ speedup on CSS-heavy sites
    const { window } = new JSDOM(stripStyleTags(html));
    try {
      // Use Readability to extract the article content
      const article = new Readability(window.document).parse();
      if (article === null) {
        throw new Error("Readability failed to parse article");
      }
      return article;
    } finally {
      // Shut the jsdom window down once parsing is done, success or failure.
      window.close();
    }
  };

  return withTimeout(parseArticle(), 8000, 'Readability parsing timeout - HTML too complex');
}