Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ A lightning fast, GraalVM-compiled CLI tool for extracting readable content from
- Removes decorative SVGs, spacer images, layout tables, and duplicated UI chrome
- GitHub-optimized extraction (README files, blob content)
- Configurable link density threshold for content filtering
- SOCKS5 proxy support (`--proxy`) for accessing geo-blocked or firewalled content, with chunked transfer-encoding handled correctly
- Babashka-compatible — usable from `bb` scripts via `:git/tag` deps, no GraalVM required
- Fast startup with GraalVM native compilation (~40ms)

Expand Down Expand Up @@ -108,6 +109,9 @@ r11y --link-density 0.3 https://example.com
# GitHub blob URLs (automatically fetches raw content with metadata)
r11y -m https://github.com/user/repo/blob/main/README.md

# Route traffic through a SOCKS5 proxy
r11y --proxy socks5://127.0.0.1:9050 https://example.com

# Show help
r11y --help
```
Expand All @@ -116,6 +120,7 @@ r11y --help

- `-m, --with-metadata` - Include YAML frontmatter with metadata (title, author, date, description, etc.)
- `-l, --link-density N` - Link density threshold 0-1 (default: 0.5). Lower values are more aggressive at filtering link-heavy content.
- `-p, --proxy URL` - SOCKS5 proxy URL with host:port (e.g. `socks5://127.0.0.1:9050`). Useful for accessing geo-blocked or firewalled content.
- `-v, --version` - Show version
- `-h, --help` - Show help message

Expand Down
2 changes: 1 addition & 1 deletion build-native.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ $NATIVE_IMAGE \
--features=clj_easy.graal_build_time.InitClojureClasses \
--no-fallback \
--report-unsupported-elements-at-runtime \
--enable-url-protocols=http,https \
--enable-url-protocols=http,https,socks \
-H:ConfigurationFileDirectories=graal-config

echo ""
Expand Down
58 changes: 58 additions & 0 deletions graal-config/reflect-config.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,64 @@
"allPublicMethods": true,
"allPublicConstructors": true
},
{
"name": "java.net.Proxy",
"allPublicMethods": true,
"allPublicConstructors": true
},
{
"name": "java.net.Proxy$Type",
"allPublicMethods": true,
"allPublicFields": true
},
{
"name": "java.net.ProxySelector",
"allPublicMethods": true,
"allPublicConstructors": true
},
{
"name": "java.net.InetSocketAddress",
"allPublicMethods": true,
"allPublicConstructors": true
},
{
"name": "javax.net.ssl.SSLSocket",
"allPublicMethods": true,
"allPublicConstructors": true
},
{
"name": "javax.net.ssl.SSLSocketFactory",
"allPublicMethods": true
},
{
"name": "javax.net.ssl.SSLSocketFactoryImpl",
"allPublicMethods": true,
"allPublicConstructors": true
},
{
"name": "sun.security.ssl.SSLSocketFactoryImpl",
"allPublicMethods": true,
"allPublicConstructors": true
},
{
"name": "sun.security.ssl.SSLSocketImpl",
"allPublicMethods": true,
"allPublicConstructors": true
},
{
"name": "javax.net.ssl.SSLParameters",
"allPublicMethods": true,
"allPublicConstructors": true
},
{
"name": "javax.net.ssl.SSLContext",
"allPublicMethods": true,
"allPublicConstructors": true
},
{
"name": "javax.net.ssl.SSLContextSpi",
"allPublicMethods": true
},
{
"name": "java.util.regex.Pattern",
"allPublicMethods": true,
Expand Down
97 changes: 52 additions & 45 deletions src/r11y/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -14,68 +14,75 @@
(println "Options:")
(println " -l, --link-density N Link density threshold 0-1 (default: 0.5)")
(println " -m, --with-metadata Include YAML frontmatter with metadata")
(println " -p, --proxy URL SOCKS5 proxy URL (e.g. socks5://127.0.0.1:9050)")
(println " -v, --version Show version")
(println " -h, --help Show this help message")
(println "")
(println "Example:")
(println " r11y https://example.com")
(println " r11y --link-density 0.3 https://example.com")
(println " r11y --with-metadata https://example.com"))
(println " r11y --with-metadata https://example.com")
(println " r11y --proxy socks5://127.0.0.1:9050 https://example.com"))

(defn parse-args
[args]
(loop [args args
opts {:link-density-threshold 0.5 :with-metadata false :url nil}]
opts {:link-density-threshold 0.5 :with-metadata false :proxy nil :url nil}]
(if (empty? args)
opts
(let [arg (first args)
rest-args (rest args)]
(cond (or (= arg "-h") (= arg "--help")) (assoc opts :help true)
(or (= arg "-v") (= arg "--version")) (assoc opts :version true)
(or (= arg "-m") (= arg "--with-metadata")) (recur rest-args (assoc opts :with-metadata true))
(or (= arg "-l") (= arg "--link-density"))
(if (empty? rest-args)
(assoc opts :error "Missing value for --link-density")
(let [val (first rest-args)
parse-result (try (let [n (Double/parseDouble val)]
(if (and (>= n 0.0) (<= n 1.0))
{:success true :value n}
{:success false :error (str "Link density must be between 0 and 1, got: " val)}))
(catch NumberFormatException _
{:success false :error (str "Invalid number for --link-density: " val)}))]
(if (:success parse-result)
(recur (rest rest-args) (assoc opts :link-density-threshold (:value parse-result)))
(assoc opts :error (:error parse-result)))))
(str/starts-with? arg "-") (assoc opts :error (str "Unknown option: " arg))
:else (if (:url opts)
(assoc opts :error "Multiple URLs provided. Only one URL is allowed.")
(recur rest-args (assoc opts :url arg))))))))
(or (= arg "-v") (= arg "--version")) (assoc opts :version true)
(or (= arg "-m") (= arg "--with-metadata")) (recur rest-args (assoc opts :with-metadata true))
(or (= arg "-p") (= arg "--proxy"))
(if (empty? rest-args)
(assoc opts :error "Missing value for --proxy")
(recur (rest rest-args) (assoc opts :proxy (first rest-args))))
(or (= arg "-l") (= arg "--link-density"))
(if (empty? rest-args)
(assoc opts :error "Missing value for --link-density")
(let [val (first rest-args)
parse-result (try (let [n (Double/parseDouble val)]
(if (and (>= n 0.0) (<= n 1.0))
{:success true :value n}
{:success false :error (str "Link density must be between 0 and 1, got: " val)}))
(catch NumberFormatException _
{:success false :error (str "Invalid number for --link-density: " val)}))]
(if (:success parse-result)
(recur (rest rest-args) (assoc opts :link-density-threshold (:value parse-result)))
(assoc opts :error (:error parse-result)))))
(str/starts-with? arg "-") (assoc opts :error (str "Unknown option: " arg))
:else (if (:url opts)
(assoc opts :error "Multiple URLs provided. Only one URL is allowed.")
(recur rest-args (assoc opts :url arg))))))))

(defn -main
[& args]
(let [opts (parse-args args)]
(cond (:version opts) (do (println (str "r11y " version))
(println "https://github.com/dazld/r11y/releases")
(System/exit 0))
(:help opts) (do (print-usage) (System/exit 0))
(:error opts) (do
(println "Error:" (:error opts))
(println)
(print-usage)
(System/exit 1))
(nil? (:url opts)) (do
(println "Error: No URL provided")
(println)
(print-usage)
(System/exit 1))
:else (try (let [result (html/extract-content-from-url (:url opts)
:format :markdown
:link-density-threshold
(:link-density-threshold opts)
:with-metadata (:with-metadata opts))]
(println (:markdown result))
(System/exit 0))
(catch Exception e
(binding [*out* *err*]
(println "Error extracting content:" (.getMessage e)))
(System/exit 1))))))
(println "https://github.com/dazld/r11y/releases")
(System/exit 0))
(:help opts) (do (print-usage) (System/exit 0))
(:error opts) (do
(println "Error:" (:error opts))
(println)
(print-usage)
(System/exit 1))
(nil? (:url opts)) (do
(println "Error: No URL provided")
(println)
(print-usage)
(System/exit 1))
:else (try (let [result (html/extract-content-from-url (:url opts)
:format :markdown
:link-density-threshold
(:link-density-threshold opts)
:with-metadata (:with-metadata opts)
:proxy (:proxy opts))]
(println (:markdown result))
(System/exit 0))
(catch Exception e
(binding [*out* *err*]
(println "Error extracting content:" (.getMessage e)))
(System/exit 1))))))
39 changes: 25 additions & 14 deletions src/r11y/lib/html.clj
Original file line number Diff line number Diff line change
Expand Up @@ -807,18 +807,25 @@
(re-find #"(?m)^[-*+]\s+\S" s)
(re-find #"(?m)^\d+\.\s+\S" s)))))

;; Request headers used for r11y's own page fetches: this map is supplied as
;; the :headers entry of the options handed to http/get-url.
(def ^:const default-fetch-headers
"Default request identity for content extraction. The User-Agent
and Sec-Fetch-* headers make servers serve the same content they
would serve to a Safari user, which is the common case r11y is
built to read."
;; Desktop Safari 18.6 on macOS identity string.
{"User-Agent" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.6 Safari/605.1.15"
;; text/markdown carries no q-value, so it ranks above the HTML/XML types
;; (q=0.9 and lower); */* is the lowest-priority fallback.
"Accept" "text/markdown,text/html;q=0.9,application/xhtml+xml;q=0.9,application/xml;q=0.8,*/*;q=0.7"
;; Only gzip and deflate are advertised.
;; NOTE(review): presumably the HTTP layer decompresses both -- confirm.
"Accept-Encoding" "gzip, deflate"
"Accept-Language" "en-GB,en;q=0.9"
"Priority" "u=0, i"
;; Sec-Fetch-* values describe a top-level document navigation with no
;; initiating site.
"Sec-Fetch-Dest" "document"
"Sec-Fetch-Mode" "navigate"
"Sec-Fetch-Site" "none"})

(defn- fetch-url
"Fetch URL with common headers and return response map."
[url]
(http/get-url url {:as :byte-array
:headers {"User-Agent" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.6 Safari/605.1.15"
"Accept" "text/markdown,text/html;q=0.9,application/xhtml+xml;q=0.9,application/xml;q=0.8,*/*;q=0.7"
"Accept-Encoding" "gzip, deflate"
"Accept-Language" "en-GB,en;q=0.9"
"Priority" "u=0, i"
"Sec-Fetch-Dest" "document"
"Sec-Fetch-Mode" "navigate"
"Sec-Fetch-Site" "none"}}))
"Fetch URL with default headers. Return response map."
[url {:keys [proxy]}]
(http/get-url url (cond-> {:as :byte-array :headers default-fetch-headers}
proxy (assoc :proxy proxy))))

(defn extract-content-from-url
"Extract main content from a URL. Returns clean HTML by default.
Expand All @@ -828,13 +835,17 @@
:with-metadata - include YAML frontmatter with metadata (default false)
:content - pre-fetched HTML content (String or bytes), skips initial fetch
:content-type - content-type of pre-fetched content
:fetch-fn - custom fetch function returning http-kit style response map"
:fetch-fn - custom fetch function returning http-kit style response map
:proxy - proxy URL string (e.g. socks5://127.0.0.1:9050)"
[url & {:keys [format link-density-threshold with-metadata
content content-type fetch-fn]
content content-type fetch-fn proxy]
:or {format :html
link-density-threshold default-link-density-threshold
with-metadata false}}]
(let [do-fetch (or fetch-fn fetch-url)
(let [do-fetch (or fetch-fn
(let [base-opts (cond-> {:as :byte-array :headers default-fetch-headers}
proxy (assoc :proxy proxy))]
(fn [u] (http/get-url u base-opts))))
normalized-url (normalize-github-url url)
urls-differ? (not= normalized-url url)
;; Step 1: Resolve original content
Expand Down
Loading