/**
 * Content script — extracts main content from the current page.
 * Uses Defuddle for article extraction, then Turndown for HTML → Markdown.
 * Bundled with esbuild (not compiled by tsc) since content scripts can't use imports.
 */
66
7- import { Defuddle } from "defuddle" ;
8- import { extractKagiResults } from "./extractors/kagi" ;
7+ import Defuddle from "defuddle" ;
8+ import TurndownService from "turndown" ;
9+
// Single shared Turndown converter; "atx" emits "# Heading"-style markdown headings.
const turndown = new TurndownService({ headingStyle: "atx" });

// Minimum converted-text length for a Defuddle result to count as real content.
const MIN_CONTENT_LENGTH = 200;
// Upper bound on extracted text size. NOTE(review): no slice against this
// constant is visible in this chunk after the rewrite — confirm truncation
// still happens in extract() or downstream.
const MAX_CHARS = 50_000;
@@ -18,82 +20,32 @@ interface PageData {
1820 ready : boolean ;
1921}
2022
21- function extractWithDefuddle ( ) : string | null {
22- try {
23- const clone = document . cloneNode ( true ) as Document ;
24- const result = new Defuddle ( clone , { url : location . href } ) . parse ( ) ;
25- const text = result . content
26- ? stripHtmlTags ( result . content )
27- : null ;
28- if ( text && text . length >= MIN_CONTENT_LENGTH ) {
29- return text ;
30- }
31- } catch {
32- // Defuddle failed — fall through to heuristics
33- }
34- return null ;
35- }
36-
37- function extractWithHeuristics ( ) : string | null {
38- const selectors = [
39- "article" ,
40- "main" ,
41- '[role="main"]' ,
42- "#content" ,
43- ".post-content" ,
44- ".article-content" ,
45- ".entry-content" ,
46- ] ;
23+ /** Domain-specific readiness locators. For JS-rendered pages, Defuddle may
24+ * extract too early and get page chrome instead of content. These selectors
25+ * gate extraction — if the selector isn't present yet, we return ready=false
26+ * so pollForContent retries until the real content has rendered. */
27+ const READINESS_LOCATORS : [ match : ( hostname : string ) => boolean , selector : string ] [ ] = [
28+ [ ( h ) => h . includes ( "kagi.com" ) , ".search-result" ] ,
29+ ] ;
4730
48- for ( const selector of selectors ) {
49- const el = document . querySelector ( selector ) ;
50- if ( el ) {
51- const text = ( el as HTMLElement ) . innerText ?. trim ( ) ;
52- if ( text && text . length >= MIN_CONTENT_LENGTH ) {
53- return text ;
54- }
55- }
31+ function findReadinessSelector ( ) : string | null {
32+ for ( const [ match , selector ] of READINESS_LOCATORS ) {
33+ if ( match ( location . hostname ) ) return selector ;
5634 }
5735 return null ;
5836}
5937
60- function extractAllVisibleText ( ) : string {
61- const SKIP_TAGS = new Set ( [
62- "SCRIPT" , "STYLE" , "NOSCRIPT" , "SVG" , "IFRAME" ,
63- "NAV" , "ASIDE" , "FOOTER" , "HEADER" ,
64- ] ) ;
65-
66- const chunks : string [ ] = [ ] ;
67- const walker = document . createTreeWalker (
68- document . body ,
69- NodeFilter . SHOW_TEXT ,
70- {
71- acceptNode ( node : Text ) : number {
72- const parent = node . parentElement ;
73- if ( ! parent ) return NodeFilter . FILTER_REJECT ;
74- if ( SKIP_TAGS . has ( parent . tagName ) ) return NodeFilter . FILTER_REJECT ;
75- const style = window . getComputedStyle ( parent ) ;
76- if ( style . display === "none" || style . visibility === "hidden" ) {
77- return NodeFilter . FILTER_REJECT ;
78- }
79- return NodeFilter . FILTER_ACCEPT ;
80- } ,
81- } ,
82- ) ;
83-
84- let node : Text | null ;
85- while ( ( node = walker . nextNode ( ) as Text | null ) ) {
86- const text = node . textContent ?. trim ( ) ;
87- if ( text ) chunks . push ( text ) ;
38+ function extractWithDefuddle ( ) : string | null {
39+ const clone = document . cloneNode ( true ) as Document ;
40+ const result = new Defuddle ( clone , { url : location . href } ) . parse ( ) ;
41+ if ( ! result . content ) return null ;
42+ const text = turndown . turndown ( result . content ) ;
43+ if ( text && text . length >= MIN_CONTENT_LENGTH ) {
44+ return text ;
8845 }
89- return chunks . join ( "\n" ) ;
46+ return null ;
9047}
9148
92- function stripHtmlTags ( html : string ) : string {
93- const div = document . createElement ( "div" ) ;
94- div . innerHTML = html ;
95- return div . innerText || div . textContent || "" ;
96- }
9749
9850function extractMetaImage ( ) : string {
9951 const selectors = [
@@ -110,24 +62,12 @@ function extractMetaImage(): string {
11062}
11163
11264function extract ( ) : PageData {
113- // Kagi pages must use the Kagi extractor — no fallbacks.
114- // Returns ready=false when results haven't rendered yet;
115- // pollForContent re-injects until ready.
116- if ( location . hostname . includes ( "kagi.com" ) ) {
117- const kagi = extractKagiResults ( ) ;
118- return {
119- title : document . title ,
120- url : location . href ,
121- text : kagi ? kagi . slice ( 0 , MAX_CHARS ) : "" ,
122- image : "" ,
123- ready : kagi !== null ,
124- } ;
65+ const readinessSelector = findReadinessSelector ( ) ;
66+ if ( readinessSelector && ! document . querySelector ( readinessSelector ) ) {
67+ return { title : document . title , url : location . href , text : "" , image : "" , ready : false } ;
12568 }
12669
127- const text =
128- extractWithDefuddle ( ) ??
129- extractWithHeuristics ( ) ??
130- extractAllVisibleText ( ) ;
70+ const text = extractWithDefuddle ( ) ?? "Failed to extract page content" ;
13171
13272 return {
13373 title : document . title ,
0 commit comments