From 41b2ab0c0ab3807cda55d63617bb472b9c3284f1 Mon Sep 17 00:00:00 2001 From: Dan Peddle Date: Sun, 3 May 2026 08:40:31 +0100 Subject: [PATCH 1/8] add :image as a first-class metadata field Sourced from JSON-LD image (string or {url} object), then og:image, then twitter:image. metadata-to-frontmatter emits image: alongside icon:. upstream-frontmatter->metadata maps an upstream image: key to :image instead of overloading :icon. Fixes the markdown-negotiation follow-up where social-card images from upstream markdown (e.g. cf-twitter-card.png) landed in the favicon slot. --- src/r11y/lib/html.clj | 31 ++++++++++++++++++++--- test/r11y/lib/html_test.clj | 50 +++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 4 deletions(-) diff --git a/src/r11y/lib/html.clj b/src/r11y/lib/html.clj index 8f0cccf..fcfbc3f 100644 --- a/src/r11y/lib/html.clj +++ b/src/r11y/lib/html.clj @@ -377,6 +377,22 @@ :else "")) :else "")))) +(defn- get-json-ld-image + "Extract image URL from JSON-LD. Image can be a string, an ImageObject map + with :url, or an array of either." + [json-ld] + (when json-ld + (let [val (:image json-ld)] + (cond + (string? val) val + (map? val) (or (:url val) "") + (sequential? val) (let [first-val (first val)] + (cond + (string? first-val) first-val + (map? first-val) (or (:url first-val) "") + :else "")) + :else "")))) + (defn- first-non-blank "Return first non-blank value" [& values] @@ -467,7 +483,11 @@ raw-canonical-url (safe-attr (.selectFirst doc "link[rel=canonical]") "href") canonical-url (resolve-canonical-url base-url raw-canonical-url) is-canonical (or (str/blank? 
canonical-url) (= canonical-url base-url)) - icon (extract-site-icon doc base-url)] + icon (extract-site-icon doc base-url) + image (first-non-blank (get-json-ld-image json-ld) + (safe-attr (.selectFirst doc "meta[property=og:image]") "content") + (safe-attr (.selectFirst doc "meta[name=twitter:image]") "content") + (safe-attr (.selectFirst doc "meta[property=twitter:image]") "content"))] {:title (or title "") :author (or author "") :url base-url @@ -477,7 +497,8 @@ :date (or date "") :canonical-url (or canonical-url "") :is-canonical is-canonical - :icon (or icon "")})) + :icon (or icon "") + :image (or image "")})) (defn metadata-to-frontmatter "Convert metadata map to YAML frontmatter string" @@ -491,7 +512,8 @@ [:description (:description metadata)] [:sitename (:sitename metadata)] [:date (:date metadata)] - [:icon (:icon metadata)]] + [:icon (:icon metadata)] + [:image (:image metadata)]] non-empty-fields (filter #(not (str/blank? (str (second %)))) fields) yaml-lines (map (fn [[k v]] (str (name k) ": " v)) @@ -633,7 +655,8 @@ :date (or (get fm "date") (get fm "published") "") :canonical-url "" :is-canonical false - :icon (or (get fm "image") (get fm "icon") "")})) + :icon (or (get fm "icon") "") + :image (or (get fm "image") "")})) (defn looks-like-markdown? "Sniff body bytes/string to detect markdown content. diff --git a/test/r11y/lib/html_test.clj b/test/r11y/lib/html_test.clj index e488bda..113422f 100644 --- a/test/r11y/lib/html_test.clj +++ b/test/r11y/lib/html_test.clj @@ -691,3 +691,53 @@ (is (re-find #"Only paragraph" (:markdown result)) "Should contain paragraph") (is (not (re-find #"

Body

" + result (html/extract-content-from-url + "https://example.com/page" + :content (.getBytes html-str "UTF-8") + :content-type "text/html" + :with-metadata true)] + (is (= "https://example.com/hero.jpg" (get-in result [:metadata :image]))) + (is (re-find #"image: https://example.com/hero.jpg" (:markdown result))))) + + (testing "twitter:image is used as a fallback" + (let [html-str "

Body

" + result (html/extract-content-from-url + "https://example.com/page" + :content (.getBytes html-str "UTF-8") + :content-type "text/html" + :with-metadata true)] + (is (= "https://example.com/twitter.jpg" (get-in result [:metadata :image]))))) + + (testing "JSON-LD image string takes precedence over meta tags" + (let [html-str "

Body

" + result (html/extract-content-from-url + "https://example.com/page" + :content (.getBytes html-str "UTF-8") + :content-type "text/html" + :with-metadata true)] + (is (= "https://example.com/jsonld.jpg" (get-in result [:metadata :image]))))) + + (testing "JSON-LD image as ImageObject map with :url" + (let [html-str "

Body

" + result (html/extract-content-from-url + "https://example.com/page" + :content (.getBytes html-str "UTF-8") + :content-type "text/html" + :with-metadata true)] + (is (= "https://example.com/obj.jpg" (get-in result [:metadata :image])))))) + +(deftest test-upstream-markdown-image-mapping + (testing "upstream YAML image: maps to :image, not :icon" + (let [md "---\ntitle: Hello\nimage: https://example.com/social.png\n---\n# Body" + result (html/extract-content-from-url + "https://example.com" + :content (.getBytes md "UTF-8") + :content-type "text/markdown" + :with-metadata true)] + (is (= "https://example.com/social.png" (get-in result [:metadata :image]))) + (is (str/blank? (get-in result [:metadata :icon])) + "icon should not be populated from upstream image")))) From c59b05928abcd8e39648a729812147a18f172f09 Mon Sep 17 00:00:00 2001 From: Dan Peddle Date: Sun, 3 May 2026 08:42:03 +0100 Subject: [PATCH 2/8] filter placeholder values from metadata sources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some CMSes leak unresolved templates into meta tags or JSON-LD — {author.fullName}, #page.title, decorative '. -' strings. valid-metadata-value? rejects these at the leaves (safe-attr, safe-text, get-json-ld-value, get-json-ld-image) so first-non-blank chains fall through to the next real source instead of accepting garbage that happens to be non-blank. --- src/r11y/lib/html.clj | 80 +++++++++++++++++++++++-------------- test/r11y/lib/html_test.clj | 29 ++++++++++++++ 2 files changed, 80 insertions(+), 29 deletions(-) diff --git a/src/r11y/lib/html.clj b/src/r11y/lib/html.clj index fcfbc3f..8b14790 100644 --- a/src/r11y/lib/html.clj +++ b/src/r11y/lib/html.clj @@ -337,15 +337,35 @@ final)) ;; Metadata extraction +(defn- valid-metadata-value? 
+ "Reject placeholder values that CMSes sometimes leak unresolved into + meta tags or JSON-LD: template literals like {author.fullName}, + anchor-style references like #author.fullName, and strings without + any letter or digit at all (e.g. dashes, dots, underscores)." + [s] + (and (string? s) + (not (str/blank? s)) + (not (re-find #"\{[^{}]+\}" s)) + (not (re-find #"^#[\p{L}_][\p{L}\p{N}_.]*$" s)) + (boolean (re-find #"[\p{L}\p{N}]" s)))) + (defn- safe-attr - "Safely get attribute from element, returning empty string if element is null" + "Safely get attribute from element, returning empty string if element is null + or the value is a placeholder." [^Element elem attr] - (if elem (.attr elem attr) "")) + (if elem + (let [v (.attr elem attr)] + (if (valid-metadata-value? v) v "")) + "")) (defn- safe-text - "Safely get text from element, returning empty string if element is null" + "Safely get text from element, returning empty string if element is null + or the value is a placeholder." [^Element elem] - (if elem (.text elem) "")) + (if elem + (let [v (.text elem)] + (if (valid-metadata-value? v) v "")) + "")) (defn- extract-json-ld "Extract and parse JSON-LD structured data" @@ -358,40 +378,42 @@ (catch Exception _ nil))) (defn- get-json-ld-value - "Safely extract value from JSON-LD data" + "Safely extract value from JSON-LD data. Rejects placeholder values." [json-ld & keys] (when json-ld - (let [val (get-in json-ld keys)] - (cond - (string? val) val - (map? val) (or (:name val) - (get val (keyword "@value")) - "") - (sequential? val) (let [first-val (first val)] - (cond - (string? first-val) first-val - (map? first-val) (or (:name first-val) - (get first-val (keyword "@value")) - "") - :else "")) - :else "")))) + (let [val (get-in json-ld keys) + raw (cond + (string? val) val + (map? val) (or (:name val) + (get val (keyword "@value")) + "") + (sequential? val) (let [first-val (first val)] + (cond + (string? first-val) first-val + (map? 
first-val) (or (:name first-val) + (get first-val (keyword "@value")) + "") + :else "")) + :else "")] + (if (valid-metadata-value? raw) raw "")))) (defn- get-json-ld-image "Extract image URL from JSON-LD. Image can be a string, an ImageObject map with :url, or an array of either." [json-ld] (when json-ld - (let [val (:image json-ld)] - (cond - (string? val) val - (map? val) (or (:url val) "") - (sequential? val) (let [first-val (first val)] - (cond - (string? first-val) first-val - (map? first-val) (or (:url first-val) "") - :else "")) - :else "")))) + (let [val (:image json-ld) + raw (cond + (string? val) val + (map? val) (or (:url val) "") + (sequential? val) (let [first-val (first val)] + (cond + (string? first-val) first-val + (map? first-val) (or (:url first-val) "") + :else "")) + :else "")] + (if (valid-metadata-value? raw) raw "")))) (defn- first-non-blank "Return first non-blank value" diff --git a/test/r11y/lib/html_test.clj b/test/r11y/lib/html_test.clj index 113422f..17fbd13 100644 --- a/test/r11y/lib/html_test.clj +++ b/test/r11y/lib/html_test.clj @@ -741,3 +741,32 @@ (is (= "https://example.com/social.png" (get-in result [:metadata :image]))) (is (str/blank? (get-in result [:metadata :icon])) "icon should not be populated from upstream image")))) + +(deftest test-placeholder-metadata-filter + (testing "Unresolved template literals fall through to next fallback" + (let [html-str "{page.title}

x

" + result (html/extract-content-from-url + "https://example.com/page" + :content (.getBytes html-str "UTF-8") + :content-type "text/html" + :with-metadata true)] + (is (= "Real Title" (get-in result [:metadata :title])) + "Template-literal title should be rejected, og:title used"))) + + (testing "Anchor-style placeholders are rejected" + (let [html-str "

x

" + result (html/extract-content-from-url + "https://example.com/page" + :content (.getBytes html-str "UTF-8") + :content-type "text/html" + :with-metadata true)] + (is (= "Jane Doe" (get-in result [:metadata :author]))))) + + (testing "Strings without letters or digits are rejected" + (let [html-str "

x

" + result (html/extract-content-from-url + "https://example.com/page" + :content (.getBytes html-str "UTF-8") + :content-type "text/html" + :with-metadata true)] + (is (= "Real description" (get-in result [:metadata :description])))))) From 181330baa46dfb81c8907b165df925d0e285a5e5 Mon Sep 17 00:00:00 2001 From: Dan Peddle Date: Sun, 3 May 2026 08:43:02 +0100 Subject: [PATCH 3/8] guard og:site_name against being over 6 words MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some sites mistakenly put the full article title in og:site_name. Reject anything over 6 words and fall through to the next fallback — keeps :sitename meaningful instead of leaking article titles into it. --- src/r11y/lib/html.clj | 11 ++++++++++- test/r11y/lib/html_test.clj | 20 ++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/r11y/lib/html.clj b/src/r11y/lib/html.clj index 8b14790..c1dd7d0 100644 --- a/src/r11y/lib/html.clj +++ b/src/r11y/lib/html.clj @@ -478,6 +478,15 @@ (str (.resolve (URI. base-url) canonical-url)) (catch Exception _ nil)))) +(defn- guard-sitename + "Reject overly long site names — sites sometimes put the article title + in og:site_name. More than 6 words is almost certainly not a site name." + [s] + (if (and (not (str/blank? 
s)) + (> (count (re-seq #"\S+" s)) 6)) + "" + s)) + (defn extract-metadata "Extract metadata from document" [^Document doc base-url] @@ -494,7 +503,7 @@ (safe-attr (.selectFirst doc "meta[name=description]") "content") (safe-attr (.selectFirst doc "meta[property=og:description]") "content")) sitename (first-non-blank (get-json-ld-value json-ld :publisher :name) - (safe-attr (.selectFirst doc "meta[property=og:site_name]") "content")) + (guard-sitename (safe-attr (.selectFirst doc "meta[property=og:site_name]") "content"))) date (first-non-blank (get-json-ld-value json-ld :datePublished) (get-json-ld-value json-ld :dateCreated) (safe-attr (.selectFirst doc "meta[property=article:published_time]") "content") diff --git a/test/r11y/lib/html_test.clj b/test/r11y/lib/html_test.clj index 17fbd13..5904327 100644 --- a/test/r11y/lib/html_test.clj +++ b/test/r11y/lib/html_test.clj @@ -770,3 +770,23 @@ :content-type "text/html" :with-metadata true)] (is (= "Real description" (get-in result [:metadata :description])))))) + +(deftest test-sitename-word-count-guard + (testing "og:site_name longer than 6 words is rejected" + (let [html-str "

x

" + result (html/extract-content-from-url + "https://example.com/page" + :content (.getBytes html-str "UTF-8") + :content-type "text/html" + :with-metadata true)] + (is (str/blank? (get-in result [:metadata :sitename])) + "Long og:site_name should be rejected as misused"))) + + (testing "Reasonable og:site_name is preserved" + (let [html-str "

x

" + result (html/extract-content-from-url + "https://example.com/page" + :content (.getBytes html-str "UTF-8") + :content-type "text/html" + :with-metadata true)] + (is (= "Acme News" (get-in result [:metadata :sitename])))))) From 0da3d0496048d9b585a30bd218901ea641a30e47 Mon Sep 17 00:00:00 2001 From: Dan Peddle Date: Sun, 3 May 2026 08:45:14 +0100 Subject: [PATCH 4/8] walk JSON-LD @graph and prefer Article-typed objects Read all JSON-LD script tags (not just the first), strip /* */ and // comments before parsing, flatten any @graph arrays, and prefer objects whose @type is article-like (Article, NewsArticle, BlogPosting, ScholarlyArticle, TechArticle, Report, WebPage, AboutPage). Recursively decode HTML entities in the chosen object so & and ' don't leak into metadata strings. News and blog sites publish either multiple scripts (one Organization, one Article) or one @graph wrapping both. Previous code only saw the first object and missed metadata that lived in the article-typed entry. --- src/r11y/lib/html.clj | 87 ++++++++++++++++++++++++++++++++++--- test/r11y/lib/html_test.clj | 53 ++++++++++++++++++++++ 2 files changed, 133 insertions(+), 7 deletions(-) diff --git a/src/r11y/lib/html.clj b/src/r11y/lib/html.clj index c1dd7d0..042a1a5 100644 --- a/src/r11y/lib/html.clj +++ b/src/r11y/lib/html.clj @@ -367,15 +367,88 @@ (if (valid-metadata-value? v) v "")) "")) +(def ^:private json-ld-primary-types + #{"Article" "NewsArticle" "BlogPosting" "ScholarlyArticle" + "TechArticle" "Report" "WebPage" "AboutPage"}) + +(defn- decode-html-entities + "Decode HTML entities commonly found in JSON-LD strings (&, ', etc.). + Single-pass — each entity is decoded exactly once." 
+ [^String s] + (let [named {"amp" "&" "lt" "<" "gt" ">" "quot" "\"" "apos" "'" "nbsp" " "}] + (str/replace s #"&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));" + (fn [[whole dec hex name]] + (cond + dec (try (str (char (Integer/parseInt dec))) (catch Exception _ whole)) + hex (try (str (char (Integer/parseInt hex 16))) (catch Exception _ whole)) + name (or (named name) whole) + :else whole))))) + +(defn- decode-entities-deep + "Recursively decode HTML entities in all string values of a parsed JSON structure." + [val] + (cond + (string? val) (decode-html-entities val) + (map? val) (into {} (map (fn [[k v]] [k (decode-entities-deep v)]) val)) + (sequential? val) (mapv decode-entities-deep val) + :else val)) + +(defn- strip-json-comments + "Strip /* */ block comments and // line comments. Sites occasionally serve + non-strict JSON-LD with embedded comments; this is best-effort, not a + full JSON parser." + [^String s] + (-> s + (str/replace #"(?s)/\*.*?\*/" "") + (str/replace #"(?m)^\s*//.*$" ""))) + +(defn- flatten-graph + "Expand a JSON-LD value into a flat list of objects: a top-level @graph + array becomes its items; a sequential becomes its elements (recursively)." + [parsed] + (cond + (map? parsed) (if-let [g (get parsed (keyword "@graph"))] + (vec g) + [parsed]) + (sequential? parsed) (vec (mapcat flatten-graph parsed)) + :else [])) + +(defn- json-ld-types + "Return the set of @type values for an item (handles string or array)." + [item] + (let [t (get item (keyword "@type"))] + (cond + (string? t) #{t} + (sequential? t) (set t) + :else #{}))) + +(defn- pick-primary-json-ld + "Choose the best object from a flat list of JSON-LD items: prefer one + whose @type is an article-like type; fall back to the first item." + [items] + (or (first (filter (fn [item] + (and (map? item) + (some json-ld-primary-types (json-ld-types item)))) + items)) + (first (filter map? 
items)))) + (defn- extract-json-ld - "Extract and parse JSON-LD structured data" + "Extract and parse JSON-LD structured data. Reads all script tags, + strips JSON comments, flattens @graph, prefers Article-typed objects, + decodes HTML entities recursively in the chosen primary." [^Document doc] - (try (let [json-ld-scripts (.select doc "script[type='application/ld+json']")] - (when (seq json-ld-scripts) - (let [json-text (.html (first json-ld-scripts))] - (when-not (str/blank? json-text) - (try (json/parse json-text) (catch Exception _ nil)))))) - (catch Exception _ nil))) + (try + (let [scripts (.select doc "script[type='application/ld+json']") + parsed (->> scripts + (keep (fn [^Element script] + (let [text (.html script)] + (when-not (str/blank? text) + (try (json/parse (strip-json-comments text)) + (catch Exception _ nil))))))) + flattened (vec (mapcat flatten-graph parsed)) + primary (pick-primary-json-ld flattened)] + (when primary (decode-entities-deep primary))) + (catch Exception _ nil))) (defn- get-json-ld-value "Safely extract value from JSON-LD data. Rejects placeholder values." diff --git a/test/r11y/lib/html_test.clj b/test/r11y/lib/html_test.clj index 5904327..2f5bbc0 100644 --- a/test/r11y/lib/html_test.clj +++ b/test/r11y/lib/html_test.clj @@ -790,3 +790,56 @@ :content-type "text/html" :with-metadata true)] (is (= "Acme News" (get-in result [:metadata :sitename])))))) + +(deftest test-json-ld-graph-walking + (testing "@graph array — Article object inside is preferred over WebSite" + (let [html-str (str "

x

") + result (html/extract-content-from-url + "https://example.com/page" + :content (.getBytes html-str "UTF-8") + :content-type "text/html" + :with-metadata true)] + (is (= "Article Title" (get-in result [:metadata :title]))) + (is (= "Jane" (get-in result [:metadata :author]))))) + + (testing "Multiple JSON-LD scripts — Article-typed one is preferred" + (let [html-str (str "" + "" + "" + "

x

") + result (html/extract-content-from-url + "https://example.com/page" + :content (.getBytes html-str "UTF-8") + :content-type "text/html" + :with-metadata true)] + (is (= "News Title" (get-in result [:metadata :title]))))) + + (testing "HTML entities in JSON-LD strings are decoded" + (let [html-str (str "

x

") + result (html/extract-content-from-url + "https://example.com/page" + :content (.getBytes html-str "UTF-8") + :content-type "text/html" + :with-metadata true)] + (is (= "Tom & Jerry 'Best Of'" (get-in result [:metadata :title]))))) + + (testing "JSON-LD with /* block */ and // line comments still parses" + (let [html-str (str "

x

") + result (html/extract-content-from-url + "https://example.com/page" + :content (.getBytes html-str "UTF-8") + :content-type "text/html" + :with-metadata true)] + (is (= "Commented Title" (get-in result [:metadata :title])))))) From 10308c5a6409f2ae448fe5b505080f36e8002a14 Mon Sep 17 00:00:00 2001 From: Dan Peddle Date: Sun, 3 May 2026 08:46:10 +0100 Subject: [PATCH 5/8] standardize role-based semantic divs to native HTML tags React/Next.js sites emit
<div role=paragraph>, <div role=list>, <div role=listitem> that JSoup serializes as plain divs. The markdown converter then treats them as opaque blocks with no paragraph or list semantics. Convert these to <p>, <ul>, <li> early in clean-document so the rest of the pipeline sees real content structure. --- src/r11y/lib/html.clj | 15 +++++++++++++++ test/r11y/lib/html_test.clj | 30 ++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/src/r11y/lib/html.clj b/src/r11y/lib/html.clj index 042a1a5..499b00f 100644 --- a/src/r11y/lib/html.clj +++ b/src/r11y/lib/html.clj @@ -77,9 +77,24 @@ doc)) ;; Cleaning pipeline functions +(defn standardize-semantic-divs + "Convert role-based semantic divs to their proper HTML tags so the + markdown converter sees them as block content. Modern React/Next.js + sites emit
<div role=paragraph>, <div role=list>, <div role=listitem> + that JSoup serializes as plain divs without semantic meaning." [^Document doc] + (doseq [^Element elem (.select doc "div[role=paragraph]")] + (.tagName elem "p")) + (doseq [^Element elem (.select doc "div[role=list]")] + (.tagName elem "ul")) + (doseq [^Element elem (.select doc "div[role=listitem]")] + (.tagName elem "li")) + doc) + (defn clean-document "Initial cleaning of the document." [^Document doc] + (standardize-semantic-divs doc) (doseq [elem (.select doc "script, style, noscript, iframe, object, embed, footer, header, nav, head link, aside, svg, canvas, applet, input, button, select, textarea, label, fieldset, legend, dialog")] diff --git a/test/r11y/lib/html_test.clj b/test/r11y/lib/html_test.clj index 2f5bbc0..332d5ac 100644 --- a/test/r11y/lib/html_test.clj +++ b/test/r11y/lib/html_test.clj @@ -843,3 +843,33 @@ :content-type "text/html" :with-metadata true)] (is (= "Commented Title" (get-in result [:metadata :title])))))) + +(deftest test-semantic-div-standardization + (testing "div[role=paragraph] is treated as a paragraph in output" + (let [html-str "
    First sentence.
    Second sentence.
    " + result (html/extract-content-from-url + "https://example.com" + :content (.getBytes html-str "UTF-8") + :content-type "text/html" + :with-metadata false)] + (is (re-find #"First sentence\." (:markdown result))) + (is (re-find #"Second sentence\." (:markdown result))) + (is (re-find #"First sentence\.\s*\n\s*\n\s*Second sentence\." (:markdown result)) + "Adjacent role=paragraph divs should produce paragraph breaks"))) + + (testing "div[role=list] with div[role=listitem] becomes a markdown list" + (let [html-str (str "
    " + "
    " + "
    Apple
    " + "
    Banana
    " + "
    Cherry
    " + "
    " + "
    ") + result (html/extract-content-from-url + "https://example.com" + :content (.getBytes html-str "UTF-8") + :content-type "text/html" + :with-metadata false)] + (is (re-find #"(?m)^\s*[-*]\s+Apple" (:markdown result))) + (is (re-find #"(?m)^\s*[-*]\s+Banana" (:markdown result))) + (is (re-find #"(?m)^\s*[-*]\s+Cherry" (:markdown result)))))) From 7f35948699bd218df4301b82992c13155068faca Mon Sep 17 00:00:00 2001 From: Dan Peddle Date: Sun, 3 May 2026 08:58:15 +0100 Subject: [PATCH 6/8] fix surrogate pair handling and clear role attr after rename MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit decode-html-entities used (char (Integer/parseInt ...)) which truncates to 16 bits — 😀 (U+1F600 😀) and other supplementary code points produced garbage. Switch to Character/toString(int) which handles BMP and non-BMP code points uniformly. standardize-semantic-divs now also clears the role attribute after renaming the tag (role=paragraph on a

<p> is redundant noise). --- src/r11y/lib/html.clj | 19 +++++++++++++------ test/r11y/lib/html_test.clj | 15 ++++++++++++++- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/src/r11y/lib/html.clj b/src/r11y/lib/html.clj index 499b00f..84304a6 100644 --- a/src/r11y/lib/html.clj +++ b/src/r11y/lib/html.clj @@ -77,6 +77,12 @@ doc)) ;; Cleaning pipeline functions +(defn- rename-tag! + "Rename element's tag and clear the now-redundant role attribute." + [^Element elem ^String new-tag] + (.tagName elem new-tag) + (.removeAttr elem "role")) + (defn standardize-semantic-divs "Convert role-based semantic divs to their proper HTML tags so the markdown converter sees them as block content. Modern React/Next.js @@ -84,11 +90,11 @@ that JSoup serializes as plain divs without semantic meaning." [^Document doc] (doseq [^Element elem (.select doc "div[role=paragraph]")] - (.tagName elem "p")) + (rename-tag! elem "p")) (doseq [^Element elem (.select doc "div[role=list]")] - (.tagName elem "ul")) + (rename-tag! elem "ul")) (doseq [^Element elem (.select doc "div[role=listitem]")] - (.tagName elem "li")) + (rename-tag! elem "li")) doc) (defn clean-document @@ -388,14 +394,15 @@ (defn- decode-html-entities "Decode HTML entities commonly found in JSON-LD strings (&, ', etc.). - Single-pass — each entity is decoded exactly once." + Single-pass — each entity is decoded exactly once. Numeric entities use + Character/toString so non-BMP code points (emoji, etc.) decode correctly." 
[^String s] (let [named {"amp" "&" "lt" "<" "gt" ">" "quot" "\"" "apos" "'" "nbsp" " "}] (str/replace s #"&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));" (fn [[whole dec hex name]] (cond - dec (try (str (char (Integer/parseInt dec))) (catch Exception _ whole)) - hex (try (str (char (Integer/parseInt hex 16))) (catch Exception _ whole)) + dec (try (Character/toString (Integer/parseInt dec)) (catch Exception _ whole)) + hex (try (Character/toString (Integer/parseInt hex 16)) (catch Exception _ whole)) name (or (named name) whole) :else whole))))) diff --git a/test/r11y/lib/html_test.clj b/test/r11y/lib/html_test.clj index 332d5ac..089e68c 100644 --- a/test/r11y/lib/html_test.clj +++ b/test/r11y/lib/html_test.clj @@ -842,7 +842,20 @@ :content (.getBytes html-str "UTF-8") :content-type "text/html" :with-metadata true)] - (is (= "Commented Title" (get-in result [:metadata :title])))))) + (is (= "Commented Title" (get-in result [:metadata :title]))))) + + (testing "Numeric entities outside the BMP decode to a valid surrogate pair" + (let [html-str (str "

    x

    ") + result (html/extract-content-from-url + "https://example.com/page" + :content (.getBytes html-str "UTF-8") + :content-type "text/html" + :with-metadata true)] + (is (= "Hello 😀 world" (get-in result [:metadata :title])) + "Code point > 0xFFFF should produce a valid surrogate pair, not a truncated char")))) (deftest test-semantic-div-standardization (testing "div[role=paragraph] is treated as a paragraph in output" From 92d4f12f51b47a669c47ca942121815fe4c19a84 Mon Sep 17 00:00:00 2001 From: Dan Peddle Date: Sun, 3 May 2026 09:01:21 +0100 Subject: [PATCH 7/8] =?UTF-8?q?refresh=20README=20=E2=80=94=20brew=20insta?= =?UTF-8?q?ll,=20current=20options,=20current=20metadata=20schema?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add brew tap install instructions - Update feature list to reflect current behaviour (markdown content negotiation, JSON-LD @graph walking, role-based pruning, semantic div standardisation) - Add --version to options - Refresh example output to include canonical-url, is-canonical, icon, image fields with a short note distinguishing icon from image - Bump SDKMAN GraalVM hint from 22-graal to 25-graal - Expand "How it works" with content negotiation and richer metadata description; mention Cloudflare-fronted sites and image dedupe in Special handling --- README.md | 51 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 0f4f4f4..3ddb2f0 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,11 @@ A fast, GraalVM-compiled CLI tool for extracting readable content from web pages - Extract main content from any URL as clean Markdown - Preserves whitespace in preformatted blocks -- Rich metadata extraction with YAML frontmatter (title, author, date, description) -- JSON-LD structured data support +- Rich metadata extraction with YAML frontmatter (title, author, date, description, canonical URL, hero image, favicon, 
sitename) +- JSON-LD structured data support, including `@graph` walking and multi-script preference for article-typed objects +- Markdown content negotiation — sends `Accept: text/markdown` and recognises markdown bodies even when servers mis-label them as `text/html` (e.g. Cloudflare-fronted docs) +- Standardises React/Next.js semantic divs (`role=paragraph`, `role=list`) into proper HTML so content structure survives extraction +- Removes decorative SVGs, spacer images, layout tables, and duplicated UI chrome - GitHub-optimized extraction (README files, blob content) - Configurable link density threshold for content filtering - Fast startup with GraalVM native compilation (~40ms) @@ -22,9 +25,16 @@ It's not as battle-tested as other more mature extraction tools, but PRs are wel ## Installation -### Prebuilt Binary (Linux x86_64) +### Homebrew (macOS arm64, Linux x86_64) -Download the latest binary from [GitHub Releases](https://github.com/dazld/r11y/releases). +```bash +brew tap dazld/tap +brew install r11y +``` + +### Prebuilt Binary + +Download the latest binary for macOS (arm64) or Linux (x86_64) from [GitHub Releases](https://github.com/dazld/r11y/releases). ### Quick Build @@ -70,8 +80,8 @@ brew install --cask graalvm-jdk **Option 2: Using SDKMAN:** ```bash -sdk install java 22-graal -sdk use java 22-graal +sdk install java 25-graal +sdk use java 25-graal ``` #### Building @@ -109,6 +119,7 @@ r11y --help - `-m, --with-metadata` - Include YAML frontmatter with metadata (title, author, date, description, etc.) - `-l, --link-density N` - Link density threshold 0-1 (default: 0.5). Lower values are more aggressive at filtering link-heavy content. +- `-v, --version` - Show version - `-h, --help` - Show help message ### Example Output with Metadata @@ -117,16 +128,22 @@ r11y --help --- title: Intelligence on Earth Evolved Independently at Least Twice author: Yasemin Saplakoglu -url: https://www.wired.com/story/intelligence-evolved... 
+url: https://www.wired.com/story/intelligence-evolved-at-least-twice-in-vertebrate-animals/ +canonical-url: https://www.wired.com/story/intelligence-evolved-at-least-twice-in-vertebrate-animals/ +is-canonical: true hostname: www.wired.com -description: Complex neural circuits likely arose independently... +description: Complex neural circuits likely arose independently in birds and mammals... sitename: WIRED date: 2025-05-11T07:00:00.000-04:00 +icon: https://www.wired.com/verso/static/wired-us/assets/favicon.ico +image: https://media.wired.com/photos/.../NeuralIntelligence-crSamanthaMash-Lede.jpeg --- # Article content here... ``` +`icon` is the site favicon (largest available, Apple touch icon preferred). `image` is the article hero / social-card image (`og:image` / `twitter:image` / JSON-LD `image`). + ## Development ### Run with Clojure CLI @@ -145,19 +162,21 @@ clj -e "(require '[r11y.lib.html :as html]) (println (html/extract-content-from- r11y uses content extraction algorithms inspired by Mozilla's Readability and trafilatura to identify and extract the main content from web pages: -1. **Metadata extraction**: Pulls structured data from JSON-LD, OpenGraph tags, meta tags, `