From 8c287c4b1809943a09964d801dd605186fffbfbc Mon Sep 17 00:00:00 2001 From: Tyom Semonov Date: Thu, 21 May 2026 13:46:28 +0100 Subject: [PATCH 1/3] Make repo-intel resilient to GitHub history-fetch 502s Commit.history makes GitHub compute per-commit diff stats, so a page with a few large commits deterministically times out (502) at the same cursor. The fetch had no retries and discarded all progress on crash, so every re-run restarted from scratch. - Parametrize history page size; retry transient 5xx with backoff and a shrinking page size (100 -> 25 -> 10) to stay under GitHub's timeout - Persist partial progress on terminal failure so re-runs resume from the cache tail instead of refetching everything --- src/repo-intel/repo-intel.py | 87 +++++++++++++++++++++++++++++++++--- stow/bin/repo-intel | 87 +++++++++++++++++++++++++++++++++--- 2 files changed, 160 insertions(+), 14 deletions(-) diff --git a/src/repo-intel/repo-intel.py b/src/repo-intel/repo-intel.py index cf0c241..245c760 100755 --- a/src/repo-intel/repo-intel.py +++ b/src/repo-intel/repo-intel.py @@ -64,6 +64,7 @@ import re import subprocess import sys +import time import urllib.request import webbrowser from collections import defaultdict @@ -494,6 +495,47 @@ def gh_graphql(query, variables, token): return json.loads(resp.read()) +# GitHub returns these transient statuses when its GraphQL backend is +# overloaded or times out; they're worth retrying. +RETRYABLE_STATUS = frozenset({429, 500, 502, 503, 504}) + +# Plan for a single Commit.history page: (page_size, seconds_to_wait_first). +# Resolving Commit.history makes GitHub compute per-commit diff stats +# (additions/deletions), so a page holding a few large commits can blow past +# its backend timeout and return 502 — deterministically, at the same cursor. +# Shrinking `first` cuts the per-request work; the backoff rides out flakiness. 
+HISTORY_FETCH_PLAN = ( + (100, 0), + (100, 2), + (25, 4), + (25, 8), + (10, 15), +) + + +def fetch_history_page(query, variables, token, label): + """gh_graphql for a Commit.history page, retrying transient 5xx with + backoff and a shrinking page size. Raises the last error if all attempts + fail. `variables` must omit `pageSize` — it is injected per attempt.""" + last_exc = None + for page_size, sleep_s in HISTORY_FETCH_PLAN: + if sleep_s: + time.sleep(sleep_s) + try: + return gh_graphql(query, {**variables, "pageSize": page_size}, token) + except urllib.error.HTTPError as exc: + if exc.code not in RETRYABLE_STATUS: + raise + last_exc, detail = exc, f"HTTP {exc.code}" + except urllib.error.URLError as exc: + last_exc, detail = exc, str(exc.reason) + print( + f" warning: {label} page (size {page_size}) failed: {detail}", + file=sys.stderr, + ) + raise last_exc + + def gh_repository(body): """Extract data.repository defensively — GraphQL returns null on errors.""" return (body.get("data") or {}).get("repository") or {} @@ -712,13 +754,19 @@ def _paginate_history(fetch_page, cached_oids, last_n, since, fetch_page(cursor) -> history dict, or None when the anchor object is gone. Returns (nodes, reason) where reason ∈ - "hit_cache" | "short_circuit" | "page_end" | "anchor_null" + "hit_cache" | "short_circuit" | "page_end" | "anchor_null" | "fetch_failed" + On "fetch_failed" the returned nodes are still a contiguous run from the + walk's start, so the caller can persist them and resume on a re-run. 
""" nodes = [] cursor = None dropped_anchor = not skip_first while True: - history = fetch_page(cursor) + try: + history = fetch_page(cursor) + except urllib.error.URLError as exc: + print(f" error: {label} fetch aborted: {exc}", file=sys.stderr) + return nodes, "fetch_failed" if history is None: return nodes, "anchor_null" for n in history.get("nodes") or []: @@ -774,7 +822,7 @@ def collect_remote(slug, token, no_cache=False, commits_filter=None, since=None, ) history_block = """ -history(first: 100, after: $cursor) { +history(first: $pageSize, after: $cursor) { pageInfo { hasNextPage endCursor } nodes { oid messageHeadline @@ -784,7 +832,7 @@ def collect_remote(slug, token, no_cache=False, commits_filter=None, since=None, }""".strip() top_query = f""" -query($owner: String!, $repo: String!, $cursor: String) {{ +query($owner: String!, $repo: String!, $cursor: String, $pageSize: Int!) {{ repository(owner: $owner, name: $repo) {{ name url diskUsage defaultBranchRef {{ @@ -795,7 +843,7 @@ def collect_remote(slug, token, no_cache=False, commits_filter=None, since=None, }}""".strip() bottom_query = f""" -query($owner: String!, $repo: String!, $oid: GitObjectID!, $cursor: String) {{ +query($owner: String!, $repo: String!, $oid: GitObjectID!, $cursor: String, $pageSize: Int!) {{ repository(owner: $owner, name: $repo) {{ object(oid: $oid) {{ ... 
on Commit {{ {history_block} }} }} }} @@ -818,7 +866,9 @@ def collect_remote(slug, token, no_cache=False, commits_filter=None, since=None, } def top_fetch_page(cursor): - body = gh_graphql(top_query, {"owner": owner, "repo": repo, "cursor": cursor}, token) + body = fetch_history_page( + top_query, {"owner": owner, "repo": repo, "cursor": cursor}, token, "new" + ) if "errors" in body: sys.exit(f"GraphQL error: {body['errors']}") repo_node = gh_repository(body) @@ -838,6 +888,18 @@ def top_fetch_page(cursor): have_count_baseline=len(cached_nodes), label="new", ) + if top_reason == "fetch_failed": + # new_nodes is a contiguous run from HEAD. We never reached the old + # cache, so merging would leave a gap — persist just the fresh prefix + # (the next run resumes its tail via the older-fetch) and bail out. + if not no_cache and new_nodes: + save_cache(slug, new_nodes, False) + print( + f" cached {len(new_nodes)} commits so far — re-run to resume", + file=sys.stderr, + ) + sys.exit("error: GitHub fetch failed after repeated retries; aborting.") + if top_reason == "page_end" and cached_oids: print( f" cache: orphaned by force-push/rewrite, discarded ({len(cached_nodes)} commits)", @@ -863,10 +925,11 @@ def top_fetch_page(cursor): anchor_oid = cached_nodes[-1]["oid"] def bottom_fetch_page(cursor): - body = gh_graphql( + body = fetch_history_page( bottom_query, {"owner": owner, "repo": repo, "oid": anchor_oid, "cursor": cursor}, token, + "older", ) if "errors" in body: sys.exit(f"GraphQL error: {body['errors']}") @@ -893,6 +956,16 @@ def bottom_fetch_page(cursor): repo_size_kb = repo_meta["disk_kb"] nodes = new_nodes + cached_nodes + older_nodes + if bottom_reason == "fetch_failed": + # new + cached + older are contiguous, so the partial run is a valid + # prefix to persist; the next run extends from its tail. 
+ if not no_cache and nodes: + save_cache(slug, nodes, False) + print( + f" cached {len(nodes)} commits so far — re-run to resume", + file=sys.stderr, + ) + sys.exit("error: GitHub fetch failed after repeated retries; aborting.") if bottom_reason is None: new_complete = top_reason == "page_end" or loaded_complete else: diff --git a/stow/bin/repo-intel b/stow/bin/repo-intel index fc04a3b..b4ad02d 100755 --- a/stow/bin/repo-intel +++ b/stow/bin/repo-intel @@ -64,6 +64,7 @@ import os import re import subprocess import sys +import time import urllib.request import webbrowser from collections import defaultdict @@ -494,6 +495,47 @@ def gh_graphql(query, variables, token): return json.loads(resp.read()) +# GitHub returns these transient statuses when its GraphQL backend is +# overloaded or times out; they're worth retrying. +RETRYABLE_STATUS = frozenset({429, 500, 502, 503, 504}) + +# Plan for a single Commit.history page: (page_size, seconds_to_wait_first). +# Resolving Commit.history makes GitHub compute per-commit diff stats +# (additions/deletions), so a page holding a few large commits can blow past +# its backend timeout and return 502 — deterministically, at the same cursor. +# Shrinking `first` cuts the per-request work; the backoff rides out flakiness. +HISTORY_FETCH_PLAN = ( + (100, 0), + (100, 2), + (25, 4), + (25, 8), + (10, 15), +) + + +def fetch_history_page(query, variables, token, label): + """gh_graphql for a Commit.history page, retrying transient 5xx with + backoff and a shrinking page size. Raises the last error if all attempts + fail. 
`variables` must omit `pageSize` — it is injected per attempt.""" + last_exc = None + for page_size, sleep_s in HISTORY_FETCH_PLAN: + if sleep_s: + time.sleep(sleep_s) + try: + return gh_graphql(query, {**variables, "pageSize": page_size}, token) + except urllib.error.HTTPError as exc: + if exc.code not in RETRYABLE_STATUS: + raise + last_exc, detail = exc, f"HTTP {exc.code}" + except urllib.error.URLError as exc: + last_exc, detail = exc, str(exc.reason) + print( + f" warning: {label} page (size {page_size}) failed: {detail}", + file=sys.stderr, + ) + raise last_exc + + def gh_repository(body): """Extract data.repository defensively — GraphQL returns null on errors.""" return (body.get("data") or {}).get("repository") or {} @@ -712,13 +754,19 @@ def _paginate_history(fetch_page, cached_oids, last_n, since, fetch_page(cursor) -> history dict, or None when the anchor object is gone. Returns (nodes, reason) where reason ∈ - "hit_cache" | "short_circuit" | "page_end" | "anchor_null" + "hit_cache" | "short_circuit" | "page_end" | "anchor_null" | "fetch_failed" + On "fetch_failed" the returned nodes are still a contiguous run from the + walk's start, so the caller can persist them and resume on a re-run. 
""" nodes = [] cursor = None dropped_anchor = not skip_first while True: - history = fetch_page(cursor) + try: + history = fetch_page(cursor) + except urllib.error.URLError as exc: + print(f" error: {label} fetch aborted: {exc}", file=sys.stderr) + return nodes, "fetch_failed" if history is None: return nodes, "anchor_null" for n in history.get("nodes") or []: @@ -774,7 +822,7 @@ def collect_remote(slug, token, no_cache=False, commits_filter=None, since=None, ) history_block = """ -history(first: 100, after: $cursor) { +history(first: $pageSize, after: $cursor) { pageInfo { hasNextPage endCursor } nodes { oid messageHeadline @@ -784,7 +832,7 @@ history(first: 100, after: $cursor) { }""".strip() top_query = f""" -query($owner: String!, $repo: String!, $cursor: String) {{ +query($owner: String!, $repo: String!, $cursor: String, $pageSize: Int!) {{ repository(owner: $owner, name: $repo) {{ name url diskUsage defaultBranchRef {{ @@ -795,7 +843,7 @@ query($owner: String!, $repo: String!, $cursor: String) {{ }}""".strip() bottom_query = f""" -query($owner: String!, $repo: String!, $oid: GitObjectID!, $cursor: String) {{ +query($owner: String!, $repo: String!, $oid: GitObjectID!, $cursor: String, $pageSize: Int!) {{ repository(owner: $owner, name: $repo) {{ object(oid: $oid) {{ ... on Commit {{ {history_block} }} }} }} @@ -818,7 +866,9 @@ query($owner: String!, $repo: String!, $oid: GitObjectID!, $cursor: String) {{ } def top_fetch_page(cursor): - body = gh_graphql(top_query, {"owner": owner, "repo": repo, "cursor": cursor}, token) + body = fetch_history_page( + top_query, {"owner": owner, "repo": repo, "cursor": cursor}, token, "new" + ) if "errors" in body: sys.exit(f"GraphQL error: {body['errors']}") repo_node = gh_repository(body) @@ -838,6 +888,18 @@ query($owner: String!, $repo: String!, $oid: GitObjectID!, $cursor: String) {{ have_count_baseline=len(cached_nodes), label="new", ) + if top_reason == "fetch_failed": + # new_nodes is a contiguous run from HEAD. 
We never reached the old + # cache, so merging would leave a gap — persist just the fresh prefix + # (the next run resumes its tail via the older-fetch) and bail out. + if not no_cache and new_nodes: + save_cache(slug, new_nodes, False) + print( + f" cached {len(new_nodes)} commits so far — re-run to resume", + file=sys.stderr, + ) + sys.exit("error: GitHub fetch failed after repeated retries; aborting.") + if top_reason == "page_end" and cached_oids: print( f" cache: orphaned by force-push/rewrite, discarded ({len(cached_nodes)} commits)", @@ -863,10 +925,11 @@ query($owner: String!, $repo: String!, $oid: GitObjectID!, $cursor: String) {{ anchor_oid = cached_nodes[-1]["oid"] def bottom_fetch_page(cursor): - body = gh_graphql( + body = fetch_history_page( bottom_query, {"owner": owner, "repo": repo, "oid": anchor_oid, "cursor": cursor}, token, + "older", ) if "errors" in body: sys.exit(f"GraphQL error: {body['errors']}") @@ -893,6 +956,16 @@ query($owner: String!, $repo: String!, $oid: GitObjectID!, $cursor: String) {{ repo_size_kb = repo_meta["disk_kb"] nodes = new_nodes + cached_nodes + older_nodes + if bottom_reason == "fetch_failed": + # new + cached + older are contiguous, so the partial run is a valid + # prefix to persist; the next run extends from its tail. 
+ if not no_cache and nodes: + save_cache(slug, nodes, False) + print( + f" cached {len(nodes)} commits so far — re-run to resume", + file=sys.stderr, + ) + sys.exit("error: GitHub fetch failed after repeated retries; aborting.") if bottom_reason is None: new_complete = top_reason == "page_end" or loaded_complete else: From c3c7aa87356477bed2060747f8b26ed941d44308 Mon Sep 17 00:00:00 2001 From: Tyom Semonov Date: Thu, 21 May 2026 14:07:33 +0100 Subject: [PATCH 2/3] Include repo name in repo-intel page title MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set document.title to "<repo> · Repo Intel" from the injected data so the browser tab identifies which repo the dashboard is for. --- src/repo-intel/template.html | 1 + stow/bin/repo-intel | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/repo-intel/template.html b/src/repo-intel/template.html index 4cb292e..98aa55d 100644 --- a/src/repo-intel/template.html +++ b/src/repo-intel/template.html @@ -296,6 +296,7 @@

const baseMatch = (D.githubBaseUrl || '').match(/\/\/[^/]+\/([^/]+\/[^/]+?)\/?$/); const titleText = baseMatch ? baseMatch[1] : D.repoName; +document.title = titleText ? `${titleText} · Repo Intel` : 'Repo Intel'; const titleEl = document.getElementById('title'); if (D.githubBaseUrl) { const a = document.createElement('a'); diff --git a/stow/bin/repo-intel b/stow/bin/repo-intel index b4ad02d..2e5d351 100755 --- a/stow/bin/repo-intel +++ b/stow/bin/repo-intel @@ -71,7 +71,7 @@ from collections import defaultdict from datetime import datetime, timedelta, timezone from pathlib import Path -TEMPLATE = '\n\n\n\n\n\nRepo Intel\n\n\n\n\n\n
\n

\n

\n
\n \n
\n

Contributions

\n

Commit timeline

\n
Drag canvas to pan · Drag the histogram below to jump · Shift-scroll or pinch to zoom · Hover for details · Hover tag dots to mark a moment · Click to open on GitHub
\n
\n
\n
\n
\n
\n

Summary

\n \n \n \n \n \n \n \n \n \n \n \n \n
#AuthorCommits%Added%Deleted%Net%L/CActive daysAvg/dayFirstLast
\n

Overall

\n
\n
\n
\n
\n
\n

Commit frequency over time

\n

Commit time patterns (hour of day)

\n

Day of week patterns

\n
\n
\n
\n\n\n\n' +TEMPLATE = '\n\n\n\n\n\nRepo Intel\n\n\n\n\n\n
\n

\n

\n
\n \n
\n

Contributions

\n

Commit timeline

\n
Drag canvas to pan · Drag the histogram below to jump · Shift-scroll or pinch to zoom · Hover for details · Hover tag dots to mark a moment · Click to open on GitHub
\n
\n
\n
\n
\n
\n

Summary

\n \n \n \n \n \n \n \n \n \n \n \n \n
#AuthorCommits%Added%Deleted%Net%L/CActive daysAvg/dayFirstLast
\n

Overall

\n
\n
\n
\n
\n
\n

Commit frequency over time

\n

Commit time patterns (hour of day)

\n

Day of week patterns

\n
\n
\n
\n\n\n\n' PLACEHOLDER = "/*__DATA_INJECTION__*/" NOREPLY_RE = re.compile(r"(?:\d+\+)?(.+)@users\.noreply\.github\.com") ORIGIN_RE = re.compile( From dd480175d9e7bc0eb1011addd9aa53a46903eb0c Mon Sep 17 00:00:00 2001 From: Tyom Semonov Date: Thu, 21 May 2026 14:11:27 +0100 Subject: [PATCH 3/3] Don't collapse non-retryable HTTP errors into fetch_failed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _paginate_history caught urllib.error.URLError, which also catches its HTTPError subclass. A hard 401/403/404 surfaced by fetch_history_page was thus turned into a resumable fetch_failed — saving a bogus partial cache and telling the user to re-run. Propagate non-retryable statuses instead; retryable 5xx and network errors still resume as before. --- src/repo-intel/repo-intel.py | 5 +++++ stow/bin/repo-intel | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/src/repo-intel/repo-intel.py b/src/repo-intel/repo-intel.py index 245c760..5e6216a 100755 --- a/src/repo-intel/repo-intel.py +++ b/src/repo-intel/repo-intel.py @@ -765,6 +765,11 @@ def _paginate_history(fetch_page, cached_oids, last_n, since, try: history = fetch_page(cursor) except urllib.error.URLError as exc: + # A non-retryable HTTP status (401/403/404) is a hard failure, not + # a resumable one — propagate it rather than persisting a partial + # cache and telling the user to re-run. 
+ if isinstance(exc, urllib.error.HTTPError) and exc.code not in RETRYABLE_STATUS: + raise print(f" error: {label} fetch aborted: {exc}", file=sys.stderr) return nodes, "fetch_failed" if history is None: diff --git a/stow/bin/repo-intel b/stow/bin/repo-intel index 2e5d351..ee0c1b3 100755 --- a/stow/bin/repo-intel +++ b/stow/bin/repo-intel @@ -765,6 +765,11 @@ def _paginate_history(fetch_page, cached_oids, last_n, since, try: history = fetch_page(cursor) except urllib.error.URLError as exc: + # A non-retryable HTTP status (401/403/404) is a hard failure, not + # a resumable one — propagate it rather than persisting a partial + # cache and telling the user to re-run. + if isinstance(exc, urllib.error.HTTPError) and exc.code not in RETRYABLE_STATUS: + raise print(f" error: {label} fetch aborted: {exc}", file=sys.stderr) return nodes, "fetch_failed" if history is None: