From 41b2ab0c0ab3807cda55d63617bb472b9c3284f1 Mon Sep 17 00:00:00 2001
From: Dan Peddle <dan@flarework.com>
Date: Sun, 3 May 2026 08:40:31 +0100
Subject: [PATCH 1/8] add :image as a first-class metadata field

Sourced from JSON-LD image (a string, a {url} object, or an array of
either, in which case the first element is used), then og:image, then
twitter:image (matched as either name= or property=).
metadata-to-frontmatter emits image: alongside icon:.
upstream-frontmatter->metadata maps an upstream image: key to :image
instead of overloading :icon.

Fixes the markdown-negotiation follow-up where social-card images from
upstream markdown (e.g. cf-twitter-card.png) landed in the favicon slot.
---
 src/r11y/lib/html.clj       | 31 ++++++++++++++++++++---
 test/r11y/lib/html_test.clj | 50 +++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 4 deletions(-)

diff --git a/src/r11y/lib/html.clj b/src/r11y/lib/html.clj
index 8f0cccf..fcfbc3f 100644
--- a/src/r11y/lib/html.clj
+++ b/src/r11y/lib/html.clj
@@ -377,6 +377,22 @@
                               :else ""))
         :else ""))))
 
+(defn- get-json-ld-image
+  "Extract image URL from JSON-LD. Image can be a string, an ImageObject map
+   with :url, or an array of either."
+  [json-ld]
+  (when json-ld
+    (let [val (:image json-ld)]
+      (cond
+        (string? val) val
+        (map? val) (or (:url val) "")
+        (sequential? val) (let [first-val (first val)]
+                            (cond
+                              (string? first-val) first-val
+                              (map? first-val) (or (:url first-val) "")
+                              :else ""))
+        :else ""))))
+
 (defn- first-non-blank
   "Return first non-blank value"
   [& values]
@@ -467,7 +483,11 @@
         raw-canonical-url (safe-attr (.selectFirst doc "link[rel=canonical]") "href")
         canonical-url (resolve-canonical-url base-url raw-canonical-url)
         is-canonical (or (str/blank? canonical-url) (= canonical-url base-url))
-        icon (extract-site-icon doc base-url)]
+        icon (extract-site-icon doc base-url)
+        image (first-non-blank (get-json-ld-image json-ld)
+                               (safe-attr (.selectFirst doc "meta[property=og:image]") "content")
+                               (safe-attr (.selectFirst doc "meta[name=twitter:image]") "content")
+                               (safe-attr (.selectFirst doc "meta[property=twitter:image]") "content"))]
     {:title (or title "")
      :author (or author "")
      :url base-url
@@ -477,7 +497,8 @@
      :date (or date "")
      :canonical-url (or canonical-url "")
      :is-canonical is-canonical
-     :icon (or icon "")}))
+     :icon (or icon "")
+     :image (or image "")}))
 
 (defn metadata-to-frontmatter
   "Convert metadata map to YAML frontmatter string"
@@ -491,7 +512,8 @@
                 [:description (:description metadata)]
                 [:sitename (:sitename metadata)]
                 [:date (:date metadata)]
-                [:icon (:icon metadata)]]
+                [:icon (:icon metadata)]
+                [:image (:image metadata)]]
         non-empty-fields (filter #(not (str/blank? (str (second %)))) fields)
         yaml-lines (map (fn [[k v]]
                           (str (name k) ": " v))
@@ -633,7 +655,8 @@
      :date (or (get fm "date") (get fm "published") "")
      :canonical-url ""
      :is-canonical false
-     :icon (or (get fm "image") (get fm "icon") "")}))
+     :icon (or (get fm "icon") "")
+     :image (or (get fm "image") "")}))
 
 (defn looks-like-markdown?
   "Sniff body bytes/string to detect markdown content.
diff --git a/test/r11y/lib/html_test.clj b/test/r11y/lib/html_test.clj
index e488bda..113422f 100644
--- a/test/r11y/lib/html_test.clj
+++ b/test/r11y/lib/html_test.clj
@@ -691,3 +691,53 @@
       (is (re-find #"Only paragraph" (:markdown result)) "Should contain paragraph")
       (is (not (re-find #"<svg" (:markdown result))) "Should not contain SVG tags")
       (is (not (re-find #"role=" (:markdown result))) "Should not contain role attribute"))))
+
+(deftest test-image-metadata-extraction
+  (testing "og:image populates :image metadata field"
+    (let [html-str "<html><head><meta property=\"og:image\" content=\"https://example.com/hero.jpg\"></head><body><p>Body</p></body></html>"
+          result (html/extract-content-from-url
+                  "https://example.com/page"
+                  :content (.getBytes html-str "UTF-8")
+                  :content-type "text/html"
+                  :with-metadata true)]
+      (is (= "https://example.com/hero.jpg" (get-in result [:metadata :image])))
+      (is (re-find #"image: https://example.com/hero.jpg" (:markdown result)))))
+
+  (testing "twitter:image is used as a fallback"
+    (let [html-str "<html><head><meta name=\"twitter:image\" content=\"https://example.com/twitter.jpg\"></head><body><p>Body</p></body></html>"
+          result (html/extract-content-from-url
+                  "https://example.com/page"
+                  :content (.getBytes html-str "UTF-8")
+                  :content-type "text/html"
+                  :with-metadata true)]
+      (is (= "https://example.com/twitter.jpg" (get-in result [:metadata :image])))))
+
+  (testing "JSON-LD image string takes precedence over meta tags"
+    (let [html-str "<html><head><script type=\"application/ld+json\">{\"@type\":\"Article\",\"headline\":\"T\",\"image\":\"https://example.com/jsonld.jpg\"}</script><meta property=\"og:image\" content=\"https://example.com/og.jpg\"></head><body><p>Body</p></body></html>"
+          result (html/extract-content-from-url
+                  "https://example.com/page"
+                  :content (.getBytes html-str "UTF-8")
+                  :content-type "text/html"
+                  :with-metadata true)]
+      (is (= "https://example.com/jsonld.jpg" (get-in result [:metadata :image])))))
+
+  (testing "JSON-LD image as ImageObject map with :url"
+    (let [html-str "<html><head><script type=\"application/ld+json\">{\"@type\":\"Article\",\"headline\":\"T\",\"image\":{\"@type\":\"ImageObject\",\"url\":\"https://example.com/obj.jpg\"}}</script></head><body><p>Body</p></body></html>"
+          result (html/extract-content-from-url
+                  "https://example.com/page"
+                  :content (.getBytes html-str "UTF-8")
+                  :content-type "text/html"
+                  :with-metadata true)]
+      (is (= "https://example.com/obj.jpg" (get-in result [:metadata :image]))))))
+
+(deftest test-upstream-markdown-image-mapping
+  (testing "upstream YAML image: maps to :image, not :icon"
+    (let [md "---\ntitle: Hello\nimage: https://example.com/social.png\n---\n# Body"
+          result (html/extract-content-from-url
+                  "https://example.com"
+                  :content (.getBytes md "UTF-8")
+                  :content-type "text/markdown"
+                  :with-metadata true)]
+      (is (= "https://example.com/social.png" (get-in result [:metadata :image])))
+      (is (str/blank? (get-in result [:metadata :icon]))
+          "icon should not be populated from upstream image"))))

From c59b05928abcd8e39648a729812147a18f172f09 Mon Sep 17 00:00:00 2001
From: Dan Peddle <dan@flarework.com>
Date: Sun, 3 May 2026 08:42:03 +0100
Subject: [PATCH 2/8] filter placeholder values from metadata sources
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some CMSes leak unresolved templates into meta tags or JSON-LD —
{author.fullName}, #page.title, decorative '. -' strings. valid-metadata-value?
rejects these at the leaves (safe-attr, safe-text, get-json-ld-value,
get-json-ld-image) so first-non-blank chains fall through to the next
real source instead of accepting garbage that happens to be non-blank.
---
 src/r11y/lib/html.clj       | 80 +++++++++++++++++++++++--------------
 test/r11y/lib/html_test.clj | 29 ++++++++++++++
 2 files changed, 80 insertions(+), 29 deletions(-)

diff --git a/src/r11y/lib/html.clj b/src/r11y/lib/html.clj
index fcfbc3f..8b14790 100644
--- a/src/r11y/lib/html.clj
+++ b/src/r11y/lib/html.clj
@@ -337,15 +337,35 @@
     final))
 
 ;; Metadata extraction
+(defn- valid-metadata-value?
+  "Reject placeholder values that CMSes sometimes leak unresolved into
+   meta tags or JSON-LD: template literals like {author.fullName},
+   anchor-style references like #author.fullName, and strings without
+   any letter or digit at all (e.g. dashes, dots, underscores)."
+  [s]
+  (and (string? s)
+       (not (str/blank? s))
+       (not (re-find #"\{[^{}]+\}" s))
+       (not (re-find #"^#[\p{L}_][\p{L}\p{N}_.]*$" s))
+       (boolean (re-find #"[\p{L}\p{N}]" s))))
+
 (defn- safe-attr
-  "Safely get attribute from element, returning empty string if element is null"
+  "Safely get attribute from element, returning empty string if element is null
+   or the value is a placeholder."
   [^Element elem attr]
-  (if elem (.attr elem attr) ""))
+  (if elem
+    (let [v (.attr elem attr)]
+      (if (valid-metadata-value? v) v ""))
+    ""))
 
 (defn- safe-text
-  "Safely get text from element, returning empty string if element is null"
+  "Safely get text from element, returning empty string if element is null
+   or the value is a placeholder."
   [^Element elem]
-  (if elem (.text elem) ""))
+  (if elem
+    (let [v (.text elem)]
+      (if (valid-metadata-value? v) v ""))
+    ""))
 
 (defn- extract-json-ld
   "Extract and parse JSON-LD structured data"
@@ -358,40 +378,42 @@
        (catch Exception _ nil)))
 
 (defn- get-json-ld-value
-  "Safely extract value from JSON-LD data"
+  "Safely extract value from JSON-LD data. Rejects placeholder values."
   [json-ld & keys]
   (when
    json-ld
-    (let [val (get-in json-ld keys)]
-      (cond
-        (string? val) val
-        (map? val) (or (:name val)
-                       (get val (keyword "@value"))
-                       "")
-        (sequential? val) (let [first-val (first val)]
-                            (cond
-                              (string? first-val) first-val
-                              (map? first-val) (or (:name first-val)
-                                                   (get first-val (keyword "@value"))
-                                                   "")
-                              :else ""))
-        :else ""))))
+    (let [val (get-in json-ld keys)
+          raw (cond
+                (string? val) val
+                (map? val) (or (:name val)
+                               (get val (keyword "@value"))
+                               "")
+                (sequential? val) (let [first-val (first val)]
+                                    (cond
+                                      (string? first-val) first-val
+                                      (map? first-val) (or (:name first-val)
+                                                           (get first-val (keyword "@value"))
+                                                           "")
+                                      :else ""))
+                :else "")]
+      (if (valid-metadata-value? raw) raw ""))))
 
 (defn- get-json-ld-image
   "Extract image URL from JSON-LD. Image can be a string, an ImageObject map
    with :url, or an array of either."
   [json-ld]
   (when json-ld
-    (let [val (:image json-ld)]
-      (cond
-        (string? val) val
-        (map? val) (or (:url val) "")
-        (sequential? val) (let [first-val (first val)]
-                            (cond
-                              (string? first-val) first-val
-                              (map? first-val) (or (:url first-val) "")
-                              :else ""))
-        :else ""))))
+    (let [val (:image json-ld)
+          raw (cond
+                (string? val) val
+                (map? val) (or (:url val) "")
+                (sequential? val) (let [first-val (first val)]
+                                    (cond
+                                      (string? first-val) first-val
+                                      (map? first-val) (or (:url first-val) "")
+                                      :else ""))
+                :else "")]
+      (if (valid-metadata-value? raw) raw ""))))
 
 (defn- first-non-blank
   "Return first non-blank value"
diff --git a/test/r11y/lib/html_test.clj b/test/r11y/lib/html_test.clj
index 113422f..17fbd13 100644
--- a/test/r11y/lib/html_test.clj
+++ b/test/r11y/lib/html_test.clj
@@ -741,3 +741,32 @@
       (is (= "https://example.com/social.png" (get-in result [:metadata :image])))
       (is (str/blank? (get-in result [:metadata :icon]))
           "icon should not be populated from upstream image"))))
+
+(deftest test-placeholder-metadata-filter
+  (testing "Unresolved template literals fall through to next fallback"
+    (let [html-str "<html><head><title>{page.title}</title><meta property=\"og:title\" content=\"Real Title\"></head><body><p>x</p></body></html>"
+          result (html/extract-content-from-url
+                  "https://example.com/page"
+                  :content (.getBytes html-str "UTF-8")
+                  :content-type "text/html"
+                  :with-metadata true)]
+      (is (= "Real Title" (get-in result [:metadata :title]))
+          "Template-literal title should be rejected, og:title used")))
+
+  (testing "Anchor-style placeholders are rejected"
+    (let [html-str "<html><head><meta name=\"author\" content=\"#author.fullName\"><meta property=\"article:author\" content=\"Jane Doe\"></head><body><p>x</p></body></html>"
+          result (html/extract-content-from-url
+                  "https://example.com/page"
+                  :content (.getBytes html-str "UTF-8")
+                  :content-type "text/html"
+                  :with-metadata true)]
+      (is (= "Jane Doe" (get-in result [:metadata :author])))))
+
+  (testing "Strings without letters or digits are rejected"
+    (let [html-str "<html><head><meta name=\"description\" content=\". - .\"><meta property=\"og:description\" content=\"Real description\"></head><body><p>x</p></body></html>"
+          result (html/extract-content-from-url
+                  "https://example.com/page"
+                  :content (.getBytes html-str "UTF-8")
+                  :content-type "text/html"
+                  :with-metadata true)]
+      (is (= "Real description" (get-in result [:metadata :description]))))))

From 181330baa46dfb81c8907b165df925d0e285a5e5 Mon Sep 17 00:00:00 2001
From: Dan Peddle <dan@flarework.com>
Date: Sun, 3 May 2026 08:43:02 +0100
Subject: [PATCH 3/8] guard og:site_name against being over 6 words
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some sites mistakenly put the full article title in og:site_name.
Reject anything over 6 words. og:site_name is the last source in the
sitename chain, so a rejected value leaves :sitename blank instead of
leaking an article title into it.
---
 src/r11y/lib/html.clj       | 11 ++++++++++-
 test/r11y/lib/html_test.clj | 20 ++++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/src/r11y/lib/html.clj b/src/r11y/lib/html.clj
index 8b14790..c1dd7d0 100644
--- a/src/r11y/lib/html.clj
+++ b/src/r11y/lib/html.clj
@@ -478,6 +478,15 @@
       (str (.resolve (URI. base-url) canonical-url))
       (catch Exception _ nil))))
 
+(defn- guard-sitename
+  "Reject overly long site names — sites sometimes put the article title
+   in og:site_name. More than 6 words is almost certainly not a site name."
+  [s]
+  (if (and (not (str/blank? s))
+           (> (count (re-seq #"\S+" s)) 6))
+    ""
+    s))
+
 (defn extract-metadata
   "Extract metadata from document"
   [^Document doc base-url]
@@ -494,7 +503,7 @@
                                      (safe-attr (.selectFirst doc "meta[name=description]") "content")
                                      (safe-attr (.selectFirst doc "meta[property=og:description]") "content"))
         sitename (first-non-blank (get-json-ld-value json-ld :publisher :name)
-                                  (safe-attr (.selectFirst doc "meta[property=og:site_name]") "content"))
+                                  (guard-sitename (safe-attr (.selectFirst doc "meta[property=og:site_name]") "content")))
         date (first-non-blank (get-json-ld-value json-ld :datePublished)
                               (get-json-ld-value json-ld :dateCreated)
                               (safe-attr (.selectFirst doc "meta[property=article:published_time]") "content")
diff --git a/test/r11y/lib/html_test.clj b/test/r11y/lib/html_test.clj
index 17fbd13..5904327 100644
--- a/test/r11y/lib/html_test.clj
+++ b/test/r11y/lib/html_test.clj
@@ -770,3 +770,23 @@
                   :content-type "text/html"
                   :with-metadata true)]
       (is (= "Real description" (get-in result [:metadata :description]))))))
+
+(deftest test-sitename-word-count-guard
+  (testing "og:site_name longer than 6 words is rejected"
+    (let [html-str "<html><head><meta property=\"og:site_name\" content=\"Some Long Article Title That Is Not A Site Name\"></head><body><p>x</p></body></html>"
+          result (html/extract-content-from-url
+                  "https://example.com/page"
+                  :content (.getBytes html-str "UTF-8")
+                  :content-type "text/html"
+                  :with-metadata true)]
+      (is (str/blank? (get-in result [:metadata :sitename]))
+          "Long og:site_name should be rejected as misused")))
+
+  (testing "Reasonable og:site_name is preserved"
+    (let [html-str "<html><head><meta property=\"og:site_name\" content=\"Acme News\"></head><body><p>x</p></body></html>"
+          result (html/extract-content-from-url
+                  "https://example.com/page"
+                  :content (.getBytes html-str "UTF-8")
+                  :content-type "text/html"
+                  :with-metadata true)]
+      (is (= "Acme News" (get-in result [:metadata :sitename]))))))

From 0da3d0496048d9b585a30bd218901ea641a30e47 Mon Sep 17 00:00:00 2001
From: Dan Peddle <dan@flarework.com>
Date: Sun, 3 May 2026 08:45:14 +0100
Subject: [PATCH 4/8] walk JSON-LD @graph and prefer Article-typed objects

Read all JSON-LD script tags (not just the first), strip /* */ block
comments and whole-line // comments before parsing, flatten any
top-level @graph arrays, and prefer objects whose @type is article-like
(Article, NewsArticle, BlogPosting, ScholarlyArticle, TechArticle,
Report, WebPage, AboutPage), falling back to the first object otherwise.
Recursively decode HTML entities in the chosen object so &amp; and &#39;
don't leak into metadata strings.

News and blog sites publish either multiple scripts (one Organization,
one Article) or one @graph wrapping both. Previous code only saw the
first object and missed metadata that lived in the article-typed entry.
---
 src/r11y/lib/html.clj       | 87 ++++++++++++++++++++++++++++++++++---
 test/r11y/lib/html_test.clj | 53 ++++++++++++++++++++++
 2 files changed, 133 insertions(+), 7 deletions(-)

diff --git a/src/r11y/lib/html.clj b/src/r11y/lib/html.clj
index c1dd7d0..042a1a5 100644
--- a/src/r11y/lib/html.clj
+++ b/src/r11y/lib/html.clj
@@ -367,15 +367,88 @@
       (if (valid-metadata-value? v) v ""))
     ""))
 
+(def ^:private json-ld-primary-types
+  #{"Article" "NewsArticle" "BlogPosting" "ScholarlyArticle"
+    "TechArticle" "Report" "WebPage" "AboutPage"})
+
+(defn- decode-html-entities
+  "Decode HTML entities commonly found in JSON-LD strings (&amp;, &#39;, etc.).
+   Single-pass — each entity is decoded exactly once."
+  [^String s]
+  (let [named {"amp" "&" "lt" "<" "gt" ">" "quot" "\"" "apos" "'" "nbsp" " "}]
+    (str/replace s #"&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));"
+                 (fn [[whole dec hex name]]
+                   (cond
+                     dec (try (str (char (Integer/parseInt dec))) (catch Exception _ whole))
+                     hex (try (str (char (Integer/parseInt hex 16))) (catch Exception _ whole))
+                     name (or (named name) whole)
+                     :else whole)))))
+
+(defn- decode-entities-deep
+  "Recursively decode HTML entities in all string values of a parsed JSON structure."
+  [val]
+  (cond
+    (string? val) (decode-html-entities val)
+    (map? val) (into {} (map (fn [[k v]] [k (decode-entities-deep v)]) val))
+    (sequential? val) (mapv decode-entities-deep val)
+    :else val))
+
+(defn- strip-json-comments
+  "Strip /* */ block comments and // line comments. Sites occasionally serve
+   non-strict JSON-LD with embedded comments; this is best-effort, not a
+   full JSON parser."
+  [^String s]
+  (-> s
+      (str/replace #"(?s)/\*.*?\*/" "")
+      (str/replace #"(?m)^\s*//.*$" "")))
+
+(defn- flatten-graph
+  "Expand a JSON-LD value into a flat list of objects: a top-level @graph
+   array becomes its items; a sequential becomes its elements (recursively)."
+  [parsed]
+  (cond
+    (map? parsed) (if-let [g (get parsed (keyword "@graph"))]
+                    (vec g)
+                    [parsed])
+    (sequential? parsed) (vec (mapcat flatten-graph parsed))
+    :else []))
+
+(defn- json-ld-types
+  "Return the set of @type values for an item (handles string or array)."
+  [item]
+  (let [t (get item (keyword "@type"))]
+    (cond
+      (string? t) #{t}
+      (sequential? t) (set t)
+      :else #{})))
+
+(defn- pick-primary-json-ld
+  "Choose the best object from a flat list of JSON-LD items: prefer one
+   whose @type is an article-like type; fall back to the first item."
+  [items]
+  (or (first (filter (fn [item]
+                       (and (map? item)
+                            (some json-ld-primary-types (json-ld-types item))))
+                     items))
+      (first (filter map? items))))
+
 (defn- extract-json-ld
-  "Extract and parse JSON-LD structured data"
+  "Extract and parse JSON-LD structured data. Reads all script tags,
+   strips JSON comments, flattens @graph, prefers Article-typed objects,
+   decodes HTML entities recursively in the chosen primary."
   [^Document doc]
-  (try (let [json-ld-scripts (.select doc "script[type='application/ld+json']")]
-         (when (seq json-ld-scripts)
-           (let [json-text (.html (first json-ld-scripts))]
-             (when-not (str/blank? json-text)
-               (try (json/parse json-text) (catch Exception _ nil))))))
-       (catch Exception _ nil)))
+  (try
+    (let [scripts (.select doc "script[type='application/ld+json']")
+          parsed (->> scripts
+                      (keep (fn [^Element script]
+                              (let [text (.html script)]
+                                (when-not (str/blank? text)
+                                  (try (json/parse (strip-json-comments text))
+                                       (catch Exception _ nil)))))))
+          flattened (vec (mapcat flatten-graph parsed))
+          primary (pick-primary-json-ld flattened)]
+      (when primary (decode-entities-deep primary)))
+    (catch Exception _ nil)))
 
 (defn- get-json-ld-value
   "Safely extract value from JSON-LD data. Rejects placeholder values."
diff --git a/test/r11y/lib/html_test.clj b/test/r11y/lib/html_test.clj
index 5904327..2f5bbc0 100644
--- a/test/r11y/lib/html_test.clj
+++ b/test/r11y/lib/html_test.clj
@@ -790,3 +790,56 @@
                   :content-type "text/html"
                   :with-metadata true)]
       (is (= "Acme News" (get-in result [:metadata :sitename]))))))
+
+(deftest test-json-ld-graph-walking
+  (testing "@graph array — Article object inside is preferred over WebSite"
+    (let [html-str (str "<html><head><script type=\"application/ld+json\">"
+                        "{\"@context\":\"https://schema.org\","
+                        " \"@graph\":["
+                        "  {\"@type\":\"WebSite\",\"name\":\"Site\"},"
+                        "  {\"@type\":\"Article\",\"headline\":\"Article Title\",\"author\":{\"name\":\"Jane\"}}]}"
+                        "</script></head><body><p>x</p></body></html>")
+          result (html/extract-content-from-url
+                  "https://example.com/page"
+                  :content (.getBytes html-str "UTF-8")
+                  :content-type "text/html"
+                  :with-metadata true)]
+      (is (= "Article Title" (get-in result [:metadata :title])))
+      (is (= "Jane" (get-in result [:metadata :author])))))
+
+  (testing "Multiple JSON-LD scripts — Article-typed one is preferred"
+    (let [html-str (str "<html><head>"
+                        "<script type=\"application/ld+json\">{\"@type\":\"Organization\",\"name\":\"Org\"}</script>"
+                        "<script type=\"application/ld+json\">{\"@type\":\"NewsArticle\",\"headline\":\"News Title\"}</script>"
+                        "</head><body><p>x</p></body></html>")
+          result (html/extract-content-from-url
+                  "https://example.com/page"
+                  :content (.getBytes html-str "UTF-8")
+                  :content-type "text/html"
+                  :with-metadata true)]
+      (is (= "News Title" (get-in result [:metadata :title])))))
+
+  (testing "HTML entities in JSON-LD strings are decoded"
+    (let [html-str (str "<html><head><script type=\"application/ld+json\">"
+                        "{\"@type\":\"Article\",\"headline\":\"Tom &amp; Jerry &#39;Best Of&#39;\"}"
+                        "</script></head><body><p>x</p></body></html>")
+          result (html/extract-content-from-url
+                  "https://example.com/page"
+                  :content (.getBytes html-str "UTF-8")
+                  :content-type "text/html"
+                  :with-metadata true)]
+      (is (= "Tom & Jerry 'Best Of'" (get-in result [:metadata :title])))))
+
+  (testing "JSON-LD with /* block */ and // line comments still parses"
+    (let [html-str (str "<html><head><script type=\"application/ld+json\">"
+                        "/* leading comment */\n"
+                        "{\"@type\":\"Article\",\n"
+                        "// inline comment\n"
+                        " \"headline\":\"Commented Title\"}"
+                        "</script></head><body><p>x</p></body></html>")
+          result (html/extract-content-from-url
+                  "https://example.com/page"
+                  :content (.getBytes html-str "UTF-8")
+                  :content-type "text/html"
+                  :with-metadata true)]
+      (is (= "Commented Title" (get-in result [:metadata :title]))))))

From 10308c5a6409f2ae448fe5b505080f36e8002a14 Mon Sep 17 00:00:00 2001
From: Dan Peddle <dan@flarework.com>
Date: Sun, 3 May 2026 08:46:10 +0100
Subject: [PATCH 5/8] standardize role-based semantic divs to native HTML tags

React/Next.js sites emit <div role=paragraph>, <div role=list>,
<div role=listitem> that JSoup serializes as plain divs. The markdown
converter then treats them as opaque blocks with no paragraph or list
semantics. Convert these to <p>, <ul>, <li> early in clean-document
so the rest of the pipeline sees real content structure.
---
 src/r11y/lib/html.clj       | 15 +++++++++++++++
 test/r11y/lib/html_test.clj | 30 ++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/src/r11y/lib/html.clj b/src/r11y/lib/html.clj
index 042a1a5..499b00f 100644
--- a/src/r11y/lib/html.clj
+++ b/src/r11y/lib/html.clj
@@ -77,9 +77,24 @@
     doc))
 
 ;; Cleaning pipeline functions
+(defn standardize-semantic-divs
+  "Convert role-based semantic divs to their proper HTML tags so the
+   markdown converter sees them as block content. Modern React/Next.js
+   sites emit <div role=paragraph>, <div role=list>, <div role=listitem>
+   that JSoup serializes as plain divs without semantic meaning."
+  [^Document doc]
+  (doseq [^Element elem (.select doc "div[role=paragraph]")]
+    (.tagName elem "p"))
+  (doseq [^Element elem (.select doc "div[role=list]")]
+    (.tagName elem "ul"))
+  (doseq [^Element elem (.select doc "div[role=listitem]")]
+    (.tagName elem "li"))
+  doc)
+
 (defn clean-document
   "Initial cleaning of the document."
   [^Document doc]
+  (standardize-semantic-divs doc)
   (doseq [elem (.select
                 doc
                 "script, style, noscript, iframe, object, embed, footer, header, nav, head link, aside, svg, canvas, applet, input, button, select, textarea, label, fieldset, legend, dialog")]
diff --git a/test/r11y/lib/html_test.clj b/test/r11y/lib/html_test.clj
index 2f5bbc0..332d5ac 100644
--- a/test/r11y/lib/html_test.clj
+++ b/test/r11y/lib/html_test.clj
@@ -843,3 +843,33 @@
                   :content-type "text/html"
                   :with-metadata true)]
       (is (= "Commented Title" (get-in result [:metadata :title]))))))
+
+(deftest test-semantic-div-standardization
+  (testing "div[role=paragraph] is treated as a paragraph in output"
+    (let [html-str "<html><body><article><div role=\"paragraph\">First sentence.</div><div role=\"paragraph\">Second sentence.</div></article></body></html>"
+          result (html/extract-content-from-url
+                  "https://example.com"
+                  :content (.getBytes html-str "UTF-8")
+                  :content-type "text/html"
+                  :with-metadata false)]
+      (is (re-find #"First sentence\." (:markdown result)))
+      (is (re-find #"Second sentence\." (:markdown result)))
+      (is (re-find #"First sentence\.\s*\n\s*\n\s*Second sentence\." (:markdown result))
+          "Adjacent role=paragraph divs should produce paragraph breaks")))
+
+  (testing "div[role=list] with div[role=listitem] becomes a markdown list"
+    (let [html-str (str "<html><body><article>"
+                        "<div role=\"list\">"
+                        "<div role=\"listitem\">Apple</div>"
+                        "<div role=\"listitem\">Banana</div>"
+                        "<div role=\"listitem\">Cherry</div>"
+                        "</div>"
+                        "</article></body></html>")
+          result (html/extract-content-from-url
+                  "https://example.com"
+                  :content (.getBytes html-str "UTF-8")
+                  :content-type "text/html"
+                  :with-metadata false)]
+      (is (re-find #"(?m)^\s*[-*]\s+Apple" (:markdown result)))
+      (is (re-find #"(?m)^\s*[-*]\s+Banana" (:markdown result)))
+      (is (re-find #"(?m)^\s*[-*]\s+Cherry" (:markdown result))))))

From 7f35948699bd218df4301b82992c13155068faca Mon Sep 17 00:00:00 2001
From: Dan Peddle <dan@flarework.com>
Date: Sun, 3 May 2026 08:58:15 +0100
Subject: [PATCH 6/8] fix surrogate pair handling and clear role attr after
 rename
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

decode-html-entities used (char (Integer/parseInt ...)), and a char is
only 16 bits wide — &#128512; (U+1F600 😀) and other supplementary code
points did not decode to the intended character. Switch to
Character/toString(int), which handles BMP and non-BMP code points
uniformly.

standardize-semantic-divs now also clears the role attribute after
renaming the tag (role=paragraph on a <p> is redundant noise).
---
 src/r11y/lib/html.clj       | 19 +++++++++++++------
 test/r11y/lib/html_test.clj | 15 ++++++++++++++-
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/src/r11y/lib/html.clj b/src/r11y/lib/html.clj
index 499b00f..84304a6 100644
--- a/src/r11y/lib/html.clj
+++ b/src/r11y/lib/html.clj
@@ -77,6 +77,12 @@
     doc))
 
 ;; Cleaning pipeline functions
+(defn- rename-tag!
+  "Rename element's tag and clear the now-redundant role attribute."
+  [^Element elem ^String new-tag]
+  (.tagName elem new-tag)
+  (.removeAttr elem "role"))
+
 (defn standardize-semantic-divs
   "Convert role-based semantic divs to their proper HTML tags so the
    markdown converter sees them as block content. Modern React/Next.js
@@ -84,11 +90,11 @@
    that JSoup serializes as plain divs without semantic meaning."
   [^Document doc]
   (doseq [^Element elem (.select doc "div[role=paragraph]")]
-    (.tagName elem "p"))
+    (rename-tag! elem "p"))
   (doseq [^Element elem (.select doc "div[role=list]")]
-    (.tagName elem "ul"))
+    (rename-tag! elem "ul"))
   (doseq [^Element elem (.select doc "div[role=listitem]")]
-    (.tagName elem "li"))
+    (rename-tag! elem "li"))
   doc)
 
 (defn clean-document
@@ -388,14 +394,15 @@
 
 (defn- decode-html-entities
   "Decode HTML entities commonly found in JSON-LD strings (&amp;, &#39;, etc.).
-   Single-pass — each entity is decoded exactly once."
+   Single-pass — each entity is decoded exactly once. Numeric entities use
+   Character/toString so non-BMP code points (emoji, etc.) decode correctly."
   [^String s]
   (let [named {"amp" "&" "lt" "<" "gt" ">" "quot" "\"" "apos" "'" "nbsp" " "}]
     (str/replace s #"&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));"
                  (fn [[whole dec hex name]]
                    (cond
-                     dec (try (str (char (Integer/parseInt dec))) (catch Exception _ whole))
-                     hex (try (str (char (Integer/parseInt hex 16))) (catch Exception _ whole))
+                     dec (try (Character/toString (Integer/parseInt dec)) (catch Exception _ whole))
+                     hex (try (Character/toString (Integer/parseInt hex 16)) (catch Exception _ whole))
                      name (or (named name) whole)
                      :else whole)))))
 
diff --git a/test/r11y/lib/html_test.clj b/test/r11y/lib/html_test.clj
index 332d5ac..089e68c 100644
--- a/test/r11y/lib/html_test.clj
+++ b/test/r11y/lib/html_test.clj
@@ -842,7 +842,20 @@
                   :content (.getBytes html-str "UTF-8")
                   :content-type "text/html"
                   :with-metadata true)]
-      (is (= "Commented Title" (get-in result [:metadata :title]))))))
+      (is (= "Commented Title" (get-in result [:metadata :title])))))
+
+  (testing "Numeric entities outside the BMP decode to a valid surrogate pair"
+    (let [html-str (str "<html><head><script type=\"application/ld+json\">"
+                        ;; &#128512; = 😀 (U+1F600), outside BMP
+                        "{\"@type\":\"Article\",\"headline\":\"Hello &#128512; world\"}"
+                        "</script></head><body><p>x</p></body></html>")
+          result (html/extract-content-from-url
+                  "https://example.com/page"
+                  :content (.getBytes html-str "UTF-8")
+                  :content-type "text/html"
+                  :with-metadata true)]
+      (is (= "Hello 😀 world" (get-in result [:metadata :title]))
+          "Code point > 0xFFFF should produce a valid surrogate pair, not a truncated char"))))
 
 (deftest test-semantic-div-standardization
   (testing "div[role=paragraph] is treated as a paragraph in output"

From 92d4f12f51b47a669c47ca942121815fe4c19a84 Mon Sep 17 00:00:00 2001
From: Dan Peddle <dan@flarework.com>
Date: Sun, 3 May 2026 09:01:21 +0100
Subject: [PATCH 7/8] =?UTF-8?q?refresh=20README=20=E2=80=94=20brew=20insta?=
 =?UTF-8?q?ll,=20current=20options,=20current=20metadata=20schema?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add brew tap install instructions
- Update feature list to reflect current behaviour (markdown content
  negotiation, JSON-LD @graph walking, role-based pruning, semantic
  div standardisation)
- Add --version to options
- Refresh example output to include canonical-url, is-canonical, icon,
  image fields with a short note distinguishing icon from image
- Bump SDKMAN GraalVM hint from 22-graal to 25-graal
- Expand "How it works" with content negotiation and richer metadata
  description; mention Cloudflare-fronted sites and image dedupe in
  Special handling
---
 README.md | 51 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 0f4f4f4..3ddb2f0 100644
--- a/README.md
+++ b/README.md
@@ -6,8 +6,11 @@ A fast, GraalVM-compiled CLI tool for extracting readable content from web pages
 
 - Extract main content from any URL as clean Markdown
 - Preserves whitespace in preformatted blocks
-- Rich metadata extraction with YAML frontmatter (title, author, date, description)
-- JSON-LD structured data support
+- Rich metadata extraction with YAML frontmatter (title, author, date, description, canonical URL, hero image, favicon, sitename)
+- JSON-LD structured data support, including `@graph` walking and multi-script preference for article-typed objects
+- Markdown content negotiation — sends `Accept: text/markdown` and recognises markdown bodies even when servers mis-label them as `text/html` (e.g. Cloudflare-fronted docs)
+- Standardises React/Next.js semantic divs (`role=paragraph`, `role=list`) into proper HTML so content structure survives extraction
+- Removes decorative SVGs, spacer images, layout tables, and duplicated UI chrome
 - GitHub-optimized extraction (README files, blob content)
 - Configurable link density threshold for content filtering
 - Fast startup with GraalVM native compilation (~40ms)
@@ -22,9 +25,16 @@ It's not as battle-tested as other more mature extraction tools, but PRs are wel
 
 ## Installation
 
-### Prebuilt Binary (Linux x86_64)
+### Homebrew (macOS arm64, Linux x86_64)
 
-Download the latest binary from [GitHub Releases](https://github.com/dazld/r11y/releases).
+```bash
+brew tap dazld/tap
+brew install r11y
+```
+
+### Prebuilt Binary
+
+Download the latest binary for macOS (arm64) or Linux (x86_64) from [GitHub Releases](https://github.com/dazld/r11y/releases).
 
 ### Quick Build
 
@@ -70,8 +80,8 @@ brew install --cask graalvm-jdk
 
 **Option 2: Using SDKMAN:**
 ```bash
-sdk install java 22-graal
-sdk use java 22-graal
+sdk install java 25-graal
+sdk use java 25-graal
 ```
 
 #### Building
@@ -109,6 +119,7 @@ r11y --help
 
 - `-m, --with-metadata` - Include YAML frontmatter with metadata (title, author, date, description, etc.)
 - `-l, --link-density N` - Link density threshold 0-1 (default: 0.5). Lower values are more aggressive at filtering link-heavy content.
+- `-v, --version` - Show version
 - `-h, --help` - Show help message
 
 ### Example Output with Metadata
@@ -117,16 +128,22 @@ r11y --help
 ---
 title: Intelligence on Earth Evolved Independently at Least Twice
 author: Yasemin Saplakoglu
-url: https://www.wired.com/story/intelligence-evolved...
+url: https://www.wired.com/story/intelligence-evolved-at-least-twice-in-vertebrate-animals/
+canonical-url: https://www.wired.com/story/intelligence-evolved-at-least-twice-in-vertebrate-animals/
+is-canonical: true
 hostname: www.wired.com
-description: Complex neural circuits likely arose independently...
+description: Complex neural circuits likely arose independently in birds and mammals...
 sitename: WIRED
 date: 2025-05-11T07:00:00.000-04:00
+icon: https://www.wired.com/verso/static/wired-us/assets/favicon.ico
+image: https://media.wired.com/photos/.../NeuralIntelligence-crSamanthaMash-Lede.jpeg
 ---
 
 # Article content here...
 ```
 
+`icon` is the site favicon (largest available, Apple touch icon preferred). `image` is the article hero / social-card image (`og:image` / `twitter:image` / JSON-LD `image`).
+
 ## Development
 
 ### Run with Clojure CLI
@@ -145,19 +162,21 @@ clj -e "(require '[r11y.lib.html :as html]) (println (html/extract-content-from-
 
 r11y uses content extraction algorithms inspired by Mozilla's Readability and trafilatura to identify and extract the main content from web pages:
 
-1. **Metadata extraction**: Pulls structured data from JSON-LD, OpenGraph tags, meta tags, `<time>` elements, and URL patterns
-2. **Content cleaning**: Removes boilerplate elements (scripts, styles, navigation, footers, ads)
-3. **Pattern filtering**: Filters elements based on class/id patterns to remove unlikely content
-4. **Link density analysis**: Removes navigation-heavy sections based on configurable threshold
-5. **Main content identification**: Finds the primary article/content element
-6. **Markdown conversion**: Converts cleaned HTML to clean Markdown with proper formatting for headings, lists, tables, code blocks, links, and images
+1. **Content negotiation**: Sends `Accept: text/markdown` and uses the response directly when servers honour it. Sniffs response bodies to recognise markdown returned with a `text/html` content-type (common with Cloudflare-fronted sites). Strips and rebuilds upstream YAML frontmatter in our schema.
+2. **Metadata extraction**: Pulls structured data from JSON-LD (walking `@graph` and preferring article-typed objects across multiple scripts), OpenGraph and Twitter Card meta tags, `<time>` elements, and URL date patterns. Filters placeholder values (`{template.literal}`, `#author.fullName`) and rejects misused `og:site_name` values.
+3. **Content cleaning**: Removes boilerplate elements (scripts, styles, navigation, footers, ads), decorative SVGs and `[role=img]` containers, layout tables, and spacer/duplicate images. Standardises `<div role=paragraph>`, `<div role=list>`, and `<div role=listitem>` into proper HTML before pruning.
+4. **Pattern filtering**: Filters elements based on class/id patterns to remove unlikely content
+5. **Link density analysis**: Removes navigation-heavy sections based on configurable threshold
+6. **Main content identification**: Finds the primary article/content element via a CSS-selector cascade with a body fallback if the chosen subtree turns out to be too small
+7. **Markdown conversion**: Converts cleaned HTML to clean Markdown with proper formatting for headings, lists, tables, code blocks, links, and images
 
 ### Special handling
 
 - **GitHub**: Automatically extracts README content from repo pages and fetches raw content for blob URLs while preserving metadata
-- **Tables**: Converts HTML tables to Markdown table format
+- **Cloudflare-fronted sites**: Recognises markdown bodies returned via `Accept: text/markdown` even when the response is mis-labelled as `text/html`
+- **Tables**: Converts HTML tables to Markdown table format; detects layout tables (no `<th>`, `border=0`, block-level content) and unwraps them instead of rendering them as tables
 - **Code blocks**: Preserves formatting in `<pre>` and `<code>` elements
-- **Images**: Includes alt text and image URLs
+- **Images**: Includes alt text and image URLs; deduplicates images repeated more than twice (UI chrome)
 
 ## License
 

From 7b16dc2d121a0a47ac6f0487fc4aa9a4522fb65a Mon Sep 17 00:00:00 2001
From: Dan Peddle <dan@flarework.com>
Date: Sun, 3 May 2026 09:02:13 +0100
Subject: [PATCH 8/8] restore babashka compatibility note + bb usage example,
 small cleanup

---
 README.md | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 3ddb2f0..2d5401e 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,13 @@
 # r11y
 
-A fast, GraalVM-compiled CLI tool for extracting readable content from web pages as Markdown.
+A lightning-fast, GraalVM-compiled CLI tool for extracting readable content from web pages as Markdown.
+
+`r11y` as in `readability` - or "oh rlly?" if you're ancient and remember the terrible owl meme.
 
 ## Features
 
 - Extract main content from any URL as clean Markdown
-- Preserves whitespace in preformatted blocks
+- **Preserves whitespace** in preformatted blocks
 - Rich metadata extraction with YAML frontmatter (title, author, date, description, canonical URL, hero image, favicon, sitename)
 - JSON-LD structured data support, including `@graph` walking and multi-script preference for article-typed objects
 - Markdown content negotiation — sends `Accept: text/markdown` and recognises markdown bodies even when servers mis-label them as `text/html` (e.g. Cloudflare-fronted docs)
@@ -13,16 +15,9 @@ A fast, GraalVM-compiled CLI tool for extracting readable content from web pages
 - Removes decorative SVGs, spacer images, layout tables, and duplicated UI chrome
 - GitHub-optimized extraction (README files, blob content)
 - Configurable link density threshold for content filtering
+- Babashka-compatible — usable from `bb` scripts via `:git/tag` deps, no GraalVM required
 - Fast startup with GraalVM native compilation (~40ms)
 
-## Notes on repo
-
-This is a personal tool I've been using in my own projects - I specifically wanted a way to get URLs without clobbering the whitespace,
-and I couldn't find a tool that did that. I've used and recommend trafilatura before - but given it collapsed whitespace, and was very much
-a python project, I wanted to explore building a Clojure & Graal tool to do similar, and here we go.
-
-It's not as battle-tested as other more mature extraction tools, but PRs are welcome to improve this.
-
 ## Installation
 
 ### Homebrew (macOS arm64, Linux x86_64)
@@ -158,6 +153,16 @@ clj -M -m r11y.core https://example.com
 clj -e "(require '[r11y.lib.html :as html]) (println (html/extract-content-from-url \"https://clojure.org\" :format :markdown))"
 ```
 
+### Use from a babashka script
+
+```bash
+bb -Sdeps '{:deps {io.github.dazld/r11y {:git/tag "v1.0.5" :git/sha "aabc910"}}}' \
+  -e '(require (quote [r11y.lib.html :as html]))
+      (println (:markdown (html/extract-content-from-url "https://example.com" :format :markdown)))'
+```
+
+No GraalVM required — bb resolves the dep, downloads JSoup transitively, and runs the extractor. Useful for one-off scripts where you don't want to install the native binary.
+
 ## How it works
 
 r11y uses content extraction algorithms inspired by Mozilla's Readability and trafilatura to identify and extract the main content from web pages: