diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f217f31..8c11810 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,6 +25,15 @@ jobs: - name: Check formatting (Prettier) run: make format-check + # uv provides `uvx`, which the Makefile uses to run a pinned Ruff. + - uses: astral-sh/setup-uv@v5 + + - name: Lint Python (Ruff) + run: make py-lint + + - name: Check Python formatting (Ruff) + run: make py-format-check + # Type-check the frontend (svelte-check) — vite build strips types without # checking them, so without this a type regression would slip through. - name: Type-check frontend diff --git a/Makefile b/Makefile index 7da7958..bb68498 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,9 @@ .DEFAULT_GOAL := help +# Ruff is run via uvx so there's nothing to install or commit; pinned for +# reproducibility between local runs and CI. +RUFF := uvx ruff@0.14.5 + help: ## Show this help @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-12s\033[0m %s\n", $$1, $$2}' @@ -12,12 +16,23 @@ web-dev: ## Run the frontend dev server with HMR (reads web/public/mock-data.jso web-check: ## Type-check the frontend (svelte-check); does not build cd web && bun install --frozen-lockfile && bun run check -format: ## Format the whole repo with Prettier +format: py-format ## Format the whole repo (Prettier + Ruff) @bun install --frozen-lockfile >/dev/null && bunx prettier --write . format-check: ## Check formatting with Prettier (CI); does not write @bun install --frozen-lockfile >/dev/null && bunx prettier --check . +py-lint: ## Lint the Python sources with Ruff + @$(RUFF) check . + +py-format: ## Format the Python sources with Ruff + @$(RUFF) format . + +py-format-check: ## Check Python formatting with Ruff (CI); does not write + @$(RUFF) format --check . 
+ +check: format-check py-lint py-format-check web-check ## Run all static checks (mirrors CI gates) + build: web-build ## Build the single-file artifact into dist/repo-intel python3 build.py dist/repo-intel @@ -37,4 +52,4 @@ gc: ## Repack git history (committed dist/repo-intel deltas down to ~nothing) after=$$(git count-objects -vH | awk '/size-pack:/{print $$2 $$3}'); \ echo "pack: $$before -> $$after" -.PHONY: help web-build web-dev web-check format format-check build techdata dev install-hooks gc +.PHONY: help web-build web-dev web-check format format-check py-lint py-format py-format-check check build techdata dev install-hooks gc diff --git a/README.md b/README.md index bfca069..cd98fdd 100644 --- a/README.md +++ b/README.md @@ -216,15 +216,17 @@ repo-intel facebook/react --clone # analyse via bare clone ## Development -| File | Purpose | -| ----------------- | ------------------------------------------------------------------------------------ | -| `repo-intel.py` | The script. Holds `TEMPLATE` + `TECHDATA` placeholders until bundled | -| `web/` | Frontend app (Svelte 5 + Vite + TypeScript). `bun run build` → `web/dist/index.html` | -| `web/src/lib/` | Dashboard engine: heatmap, timeline, charts, popovers, table (one module each) | -| `techdata.json` | Generated language + framework detection data (committed; embedded at build) | -| `gen_techdata.py` | Regenerates `techdata.json` from GitHub Linguist + a curated framework map | -| `build.py` | Substitutes the `TEMPLATE` / `TECHDATA` lines with their data as a `repr()` | -| `dist/repo-intel` | The built single-file artifact (committed; this is what curl/Action/Homebrew use) | +| File | Purpose | +| ------------------------- | -------------------------------------------------------------------------------------------------- | +| `repo-intel.py` | The script. Holds `TEMPLATE` + `TECHDATA` placeholders until bundled | +| `web/` | Frontend app (Svelte 5 + Vite + TypeScript). 
`bun run build` → `web/dist/index.html` | +| `web/src/App.svelte` | Root component — composes the dashboard from the `lib/components/` pieces | +| `web/src/lib/components/` | Dashboard UI as Svelte components — heatmap, table, charts, cards, popovers (one `.svelte` each) | +| `web/src/lib/` | Shared engine helpers: ECharts registration, the canvas timeline, popover state, theme, formatting | +| `techdata.json` | Generated language + framework detection data (committed; embedded at build) | +| `gen_techdata.py` | Regenerates `techdata.json` from GitHub Linguist + a curated framework map | +| `build.py` | Substitutes the `TEMPLATE` / `TECHDATA` lines with their data as a `repr()` | +| `dist/repo-intel` | The built single-file artifact (committed; this is what curl/Action/Homebrew use) | The frontend is built with [Bun](https://bun.sh). It compiles to a single self-contained `web/dist/index.html` (all JS + CSS inlined, Apache ECharts @@ -238,6 +240,8 @@ make web-dev # frontend dev server with HMR (renders web/public/mock-dat make build # rebuild the frontend bundle + dist/repo-intel make techdata # regenerate techdata.json from Linguist (needs network) make dev ARGS="3 facebook/react" # build frontend, then run from source live +make format # format the whole repo (Prettier + Ruff) +make check # run all static checks — Prettier + Ruff + svelte-check (mirrors CI) ``` `make web-dev` runs Vite's dev server with hot-reload against diff --git a/build.py b/build.py index ce473a5..1e26dcc 100644 --- a/build.py +++ b/build.py @@ -44,12 +44,12 @@ def main(): ("techdata.json", TECHDATA_PLACEHOLDER), ): if script.count(placeholder) != 1: - sys.exit(f"error: expected exactly one {placeholder!r} line in repo-intel.py") + sys.exit( + f"error: expected exactly one {name} placeholder ({placeholder!r}) in repo-intel.py" + ) - bundled = ( - script - .replace(TEMPLATE_PLACEHOLDER, f"TEMPLATE = {template!r}") - .replace(TECHDATA_PLACEHOLDER, f"TECHDATA = {techdata!r}") + bundled = 
script.replace(TEMPLATE_PLACEHOLDER, f"TEMPLATE = {template!r}").replace( + TECHDATA_PLACEHOLDER, f"TECHDATA = {techdata!r}" ) out_path.parent.mkdir(parents=True, exist_ok=True) out_path.write_text(bundled, encoding="utf-8") diff --git a/dist/repo-intel b/dist/repo-intel index 0193cce..b341923 100755 --- a/dist/repo-intel +++ b/dist/repo-intel @@ -70,6 +70,7 @@ Cache: file per repo. Re-runs only fetch new commits. """ +import contextlib import hashlib import json import os @@ -90,9 +91,7 @@ ORIGIN_RE = re.compile( r"^(?:https?://(?P[^/]+)/|git@(?P[^:]+):)" r"(?P[^/]+)/(?P.+?)(?:\.git)?/?$" ) -CACHE_DIR = ( - Path(os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache")) / "repo-intel" -) +CACHE_DIR = Path(os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache")) / "repo-intel" def parse_iso_instant(s): @@ -137,8 +136,7 @@ def save_cache(slug, nodes, complete): cache_path(slug).write_text(json.dumps({"nodes": nodes, "complete": complete})) -def needs_older_fetch(have_count, cached_oldest_date, prev_complete, - commits_filter, since, until): +def needs_older_fetch(have_count, cached_oldest_date, prev_complete, commits_filter, since, until): """Should we paginate below the oldest cached commit after top-fetch? have_count: len(new_nodes) + len(cached_nodes) after the top-fetch. 
@@ -190,9 +188,7 @@ def parse_formats(val, acc): if not name: continue if name not in VALID_FORMATS: - raise ValueError( - f"--format must be one of {', '.join(VALID_FORMATS)} (got {name!r})" - ) + raise ValueError(f"--format must be one of {', '.join(VALID_FORMATS)} (got {name!r})") if name not in acc: acc.append(name) return acc @@ -216,7 +212,7 @@ def parse_args(argv): sys.exit(2) return argv[i + 1], 2 if tok.startswith(name + "="): - return tok[len(name) + 1:], 1 + return tok[len(name) + 1 :], 1 return None, 0 while i < len(argv): @@ -296,8 +292,7 @@ def parse_args(argv): sys.exit(2) if not formats: formats = ["html"] - return (top_n, remote, output, no_open, no_cache, clone, - commits_filter, since, until, formats) + return (top_n, remote, output, no_open, no_cache, clone, commits_filter, since, until, formats) def login_from_email(email): @@ -348,11 +343,11 @@ def _load_techdata(): _TECH = _load_techdata() _LANG = _TECH.get("lang", {}) -EXT_LANG = _LANG.get("ext", {}) # extension (no dot, lower) -> language +EXT_LANG = _LANG.get("ext", {}) # extension (no dot, lower) -> language FILENAME_LANG = _LANG.get("filename", {}) # lowercased filename -> language -NAME_COLOR = _LANG.get("color", {}) # language -> hex color -FW_DEPS = _TECH.get("fw_deps", {}) # {ecosystem: {dependency: framework}} -FW_SENTINELS_JS = _TECH.get("fw_sentinels_js", []) # [[basename, framework]] +NAME_COLOR = _LANG.get("color", {}) # language -> hex color +FW_DEPS = _TECH.get("fw_deps", {}) # {ecosystem: {dependency: framework}} +FW_SENTINELS_JS = _TECH.get("fw_sentinels_js", []) # [[basename, framework]] FW_SENTINELS_OTHER = _TECH.get("fw_sentinels_other", []) # [[path, framework, lang]] @@ -376,11 +371,22 @@ _VENDOR_RE = _compile_vendor(_TECH.get("vendor", [])) # Lockfiles Linguist classifies as *generated* (handled in code, not vendor.yml) # — kept as a small supplement so they don't dominate the language bar. 
-NOISE_BASENAMES = frozenset({ - "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "npm-shrinkwrap.json", - "composer.lock", "cargo.lock", "gemfile.lock", "poetry.lock", "go.sum", - "pdm.lock", "uv.lock", "flake.lock", -}) +NOISE_BASENAMES = frozenset( + { + "package-lock.json", + "yarn.lock", + "pnpm-lock.yaml", + "npm-shrinkwrap.json", + "composer.lock", + "cargo.lock", + "gemfile.lock", + "poetry.lock", + "go.sum", + "pdm.lock", + "uv.lock", + "flake.lock", + } +) # Shebang interpreter → language, for extensionless scripts Linguist can't name # from a path alone (e.g. `bin/deploy` with `#!/usr/bin/env bash`). A small @@ -388,11 +394,24 @@ NOISE_BASENAMES = frozenset({ # stripped (`python3` → `python`) before lookup. Names must be real Linguist # languages so they pick up a color. SHEBANG_LANG = { - "sh": "Shell", "bash": "Shell", "zsh": "Shell", "dash": "Shell", - "ksh": "Shell", "fish": "fish", "python": "Python", "ruby": "Ruby", - "node": "JavaScript", "perl": "Perl", "awk": "Awk", "gawk": "Awk", - "lua": "Lua", "php": "PHP", "rscript": "R", "tclsh": "Tcl", - "groovy": "Groovy", "osascript": "AppleScript", + "sh": "Shell", + "bash": "Shell", + "zsh": "Shell", + "dash": "Shell", + "ksh": "Shell", + "fish": "fish", + "python": "Python", + "ruby": "Ruby", + "node": "JavaScript", + "perl": "Perl", + "awk": "Awk", + "gawk": "Awk", + "lua": "Lua", + "php": "PHP", + "rscript": "R", + "tclsh": "Tcl", + "groovy": "Groovy", + "osascript": "AppleScript", } @@ -424,8 +443,8 @@ def numstat_newpath(field): lo = field.find("{") hi = field.find("}", lo) if lo != -1 else -1 if lo != -1 and hi != -1 and " => " in field[lo:hi]: - new = field[lo + 1:hi].split(" => ", 1)[1] - return field[:lo] + new + field[hi + 1:] + new = field[lo + 1 : hi].split(" => ", 1)[1] + return field[:lo] + new + field[hi + 1 :] return field.split(" => ", 1)[1] @@ -452,7 +471,7 @@ def classify_path(field, present=None, shebang=None): return FILENAME_LANG[base] dot = base.rfind(".") if dot > 0: - lang = 
EXT_LANG.get(base[dot + 1:]) + lang = EXT_LANG.get(base[dot + 1 :]) if lang: return lang if shebang and path in shebang: # extensionless/unknown but has a #! line @@ -488,13 +507,15 @@ def top_languages(langs, limit=10): existing["lines"] += overflow existing["pct"] = round(existing["lines"] * 100 / total, 1) else: - out.append({ - "name": OTHER_LANG, - "lines": overflow, - "files": 0, - "pct": round(overflow * 100 / total, 1), - "color": OTHER_COLOR, - }) + out.append( + { + "name": OTHER_LANG, + "lines": overflow, + "files": 0, + "pct": round(overflow * 100 / total, 1), + "color": OTHER_COLOR, + } + ) return out @@ -521,9 +542,7 @@ def _head_first_line(path, cwd=None): """First line of `path` at HEAD, decoded leniently, or "". Reads bytes so a stray binary doesn't crash the utf-8 decode `git(text=True)` would attempt.""" try: - out = subprocess.run( - ["git", "show", f"HEAD:{path}"], cwd=cwd, capture_output=True - ).stdout + out = subprocess.run(["git", "show", f"HEAD:{path}"], cwd=cwd, capture_output=True).stdout except OSError: return "" nl = out.find(b"\n") @@ -649,11 +668,13 @@ def _frameworks_from_files(paths, read_file): groups = [] for lang in sorted(found, key=lambda L: (-len(found[L]), L)): - groups.append({ - "language": lang, - "color": NAME_COLOR.get(lang, OTHER_COLOR), - "names": found[lang][:15], - }) + groups.append( + { + "language": lang, + "color": NAME_COLOR.get(lang, OTHER_COLOR), + "names": found[lang][:15], + } + ) return groups @@ -693,9 +714,7 @@ def ensure_bare_clone(owner, repo, no_cache): ) elif not no_cache: print(" updating cached bare clone…", file=sys.stderr) - subprocess.run( - ["git", "fetch", "--quiet", "origin"], cwd=clone_dir, check=False - ) + subprocess.run(["git", "fetch", "--quiet", "origin"], cwd=clone_dir, check=False) _CLONE_REFRESHED.add(clone_dir) return clone_dir @@ -783,7 +802,9 @@ def head_file_sizes(cwd=None, limit=40): # the run — the trap _head_first_line documents and sidesteps the same way. 
out = subprocess.run( ["git", "-c", "core.quotePath=false", "ls-tree", "-r", "-l", "HEAD"], - cwd=cwd, capture_output=True, check=True, + cwd=cwd, + capture_output=True, + check=True, ).stdout.decode("utf-8", "replace") except (subprocess.CalledProcessError, OSError): return None @@ -816,7 +837,8 @@ def history_disk_by_path(cwd=None, limit=40): # once — this walks every reachable object and can be huge on big repos. revs = subprocess.Popen( ["git", "-c", "core.quotePath=false", "rev-list", "--objects", "--all"], - cwd=cwd, stdout=subprocess.PIPE, + cwd=cwd, + stdout=subprocess.PIPE, ) # `%(rest)` echoes the path rev-list appended after each blob's oid. # No text=True: read bytes and decode each line leniently, so a non-UTF-8 @@ -824,7 +846,9 @@ def history_disk_by_path(cwd=None, limit=40): # decode and abort the run — same trap _head_first_line sidesteps. cat = subprocess.Popen( ["git", "cat-file", "--batch-check=%(objecttype) %(objectsize:disk) %(rest)"], - cwd=cwd, stdin=revs.stdout, stdout=subprocess.PIPE, + cwd=cwd, + stdin=revs.stdout, + stdout=subprocess.PIPE, ) # Drop our handle so rev-list gets SIGPIPE if cat-file exits early. revs.stdout.close() @@ -872,8 +896,12 @@ def count_branches(cwd=None): the union covers both. None when no refs resolve.""" try: out = git( - "for-each-ref", "--format=%(refname)", - "refs/heads", "refs/remotes", cwd=cwd, quiet=True, + "for-each-ref", + "--format=%(refname)", + "refs/heads", + "refs/remotes", + cwd=cwd, + quiet=True, ) except subprocess.CalledProcessError: return None @@ -881,11 +909,11 @@ def count_branches(cwd=None): for line in out.splitlines(): ref = line.strip() if ref.startswith("refs/heads/"): - names.add(ref[len("refs/heads/"):]) + names.add(ref[len("refs/heads/") :]) elif ref.startswith("refs/remotes/"): # refs/remotes/<remote>/<branch> — drop the remote and skip the # symbolic origin/HEAD pointer so it isn't counted as a branch.
- _, _, branch = ref[len("refs/remotes/"):].partition("/") + _, _, branch = ref[len("refs/remotes/") :].partition("/") if branch and branch != "HEAD": names.add(branch) return len(names) or None @@ -944,16 +972,17 @@ def collect_local(cwd=None, suppress_current_user=False): current_email = "" if not suppress_current_user: - try: + with contextlib.suppress(subprocess.CalledProcessError): current_email = git("config", "user.email", cwd=cwd).strip().lower() - except subprocess.CalledProcessError: - pass log = git( # -c core.quotePath=false: keep non-ASCII paths raw so log paths match # the `present` set from the HEAD tree below (both feed classify_path). - "-c", "core.quotePath=false", - "log", "--no-merges", "-M", + "-c", + "core.quotePath=false", + "log", + "--no-merges", + "-M", "--format=%H\x1f%s\x1f%aE\x1f%aN\x1f%aI", "--numstat", cwd=cwd, @@ -1247,9 +1276,7 @@ def fetch_user_profiles(logins, token): "repositories(privacy: PUBLIC, ownerAffiliations: OWNER) { totalCount }" ) var_decls = ", ".join(f"$l{i}: String!" for i in range(len(unique))) - fragments = " ".join( - f"u{i}: user(login: $l{i}) {{ {fields} }}" for i in range(len(unique)) - ) + fragments = " ".join(f"u{i}: user(login: $l{i}) {{ {fields} }}" for i in range(len(unique))) query = f"query({var_decls}) {{ {fragments} }}" variables = {f"l{i}": login for i, login in enumerate(unique)} @@ -1333,12 +1360,14 @@ query($owner: String!, $repo: String!, $cursor: String) { continue if not oid or not date: continue - tags.append({ - "name": node.get("name") or "", - "oid": oid, - "date": date, - "message": (message.splitlines() or [""])[0], - }) + tags.append( + { + "name": node.get("name") or "", + "oid": oid, + "date": date, + "message": (message.splitlines() or [""])[0], + } + ) page = refs.get("pageInfo") or {} if not page.get("hasNextPage"): break @@ -1363,10 +1392,19 @@ def gh_rest_get(path, token): # Manifests _frameworks_from_files actually parses (so we only fetch those # blobs). 
tsconfig.json / sentinels are presence-only — covered by the tree. -_REMOTE_MANIFEST_BASES = frozenset({ - "package.json", "composer.json", "pyproject.toml", "pipfile", - "setup.py", "setup.cfg", "gemfile", "go.mod", "cargo.toml", -}) +_REMOTE_MANIFEST_BASES = frozenset( + { + "package.json", + "composer.json", + "pyproject.toml", + "pipfile", + "setup.py", + "setup.cfg", + "gemfile", + "go.mod", + "cargo.toml", + } +) def _remote_manifest_paths(paths): @@ -1385,7 +1423,7 @@ def fetch_blob_texts(owner, repo, paths, token): out = {} paths = list(paths) for start in range(0, len(paths), 50): - chunk = paths[start:start + 50] + chunk = paths[start : start + 50] var_decls = ", ".join(f"$p{i}: String!" for i in range(len(chunk))) frags = " ".join( f"b{i}: object(expression: $p{i}) {{ ... on Blob {{ text }} }}" @@ -1430,8 +1468,7 @@ def fetch_frameworks_remote(owner, repo, token): # GitHub caps the recursive tree at ~100k entries / 7MB; deep manifests # past the cap are dropped, so detection may miss frameworks silently. print( - " warning: repo tree truncated by GitHub — framework detection " - "may be incomplete", + " warning: repo tree truncated by GitHub — framework detection may be incomplete", file=sys.stderr, ) paths = [e["path"] for e in (tree.get("tree") or []) if e.get("type") == "blob"] @@ -1479,8 +1516,9 @@ query($owner: String!, $repo: String!) { return top_languages(langs) -def _paginate_history(fetch_page, cached_oids, last_n, since, - have_count_baseline, label, skip_first=False): +def _paginate_history( + fetch_page, cached_oids, last_n, since, have_count_baseline, label, skip_first=False +): """Walk a Commit.history connection page by page. fetch_page(cursor) -> history dict, or None when the anchor object is gone. 
@@ -1636,8 +1674,12 @@ query($owner: String!, $repo: String!, $oid: GitObjectID!, $cursor: String, $pag sys.exit("error: GitHub fetch failed after repeated retries; aborting.") new_nodes, top_reason = _paginate_history( - top_fetch_page, cached_oids, last_n, since, - have_count_baseline=len(cached_nodes), label="new", + top_fetch_page, + cached_oids, + last_n, + since, + have_count_baseline=len(cached_nodes), + label="new", ) if top_reason == "fetch_failed": @@ -1661,12 +1703,15 @@ query($owner: String!, $repo: String!, $oid: GitObjectID!, $cursor: String, $pag bottom_reason = None have_count = len(new_nodes) + len(cached_nodes) cached_oldest_date = ( - ((cached_nodes[-1].get("author") or {}).get("date") or "")[:10] - if cached_nodes else "" + ((cached_nodes[-1].get("author") or {}).get("date") or "")[:10] if cached_nodes else "" ) if needs_older_fetch( - have_count, cached_oldest_date, loaded_complete, - commits_filter, since, until, + have_count, + cached_oldest_date, + loaded_complete, + commits_filter, + since, + until, ): anchor_oid = cached_nodes[-1]["oid"] @@ -1685,8 +1730,13 @@ query($owner: String!, $repo: String!, $oid: GitObjectID!, $cursor: String, $pag return obj.get("history") or {"nodes": [], "pageInfo": {}} older_nodes, bottom_reason = _paginate_history( - bottom_fetch_page, cached_oids, last_n, since, - have_count_baseline=have_count, label="older", skip_first=True, + bottom_fetch_page, + cached_oids, + last_n, + since, + have_count_baseline=have_count, + label="older", + skip_first=True, ) if bottom_reason == "anchor_null": print( @@ -1760,12 +1810,15 @@ query($owner: String!, $repo: String!, $oid: GitObjectID!, $cursor: String, $pag def apply_filters(commits_meta, line_stats, commits_filter, since, until): if since or until: + def in_range(m): d = (m.get("iso") or "")[:10] return bool(d) and (not since or d >= since) and (not until or d <= until) + commits_meta = {h: m for h, m in commits_meta.items() if in_range(m)} if commits_filter: epoch = 
datetime(1970, 1, 1, tzinfo=timezone.utc) + def _ts(h): iso = commits_meta[h].get("iso") or "" if not iso: @@ -1774,11 +1827,12 @@ def apply_filters(commits_meta, line_stats, commits_filter, since, until): return datetime.fromisoformat(iso.replace("Z", "+00:00")) except ValueError: return epoch + ordered = sorted(commits_meta, key=_ts) if commits_filter[0] == "last": - keep = set(ordered[-commits_filter[1]:]) + keep = set(ordered[-commits_filter[1] :]) else: - keep = set(ordered[commits_filter[1]:commits_filter[2]]) + keep = set(ordered[commits_filter[1] : commits_filter[2]]) commits_meta = {h: m for h, m in commits_meta.items() if h in keep} line_stats = {h: line_stats[h] for h in commits_meta if h in line_stats} return commits_meta, line_stats @@ -1926,8 +1980,7 @@ def build_data( weeks_sorted = sorted(all_weeks) weekly_data = { - r["email"]: [weekly_by_author[r["email"]].get(w, 0) for w in weeks_sorted] - for r in top + r["email"]: [weekly_by_author[r["email"]].get(w, 0) for w in weeks_sorted] for r in top } daily_data = {r["email"]: dict(daily_by_author[r["email"]]) for r in top} @@ -1947,17 +2000,18 @@ def build_data( cl = lang_stats.get(h) if cl: ftypes = sorted( - ([name, NAME_COLOR.get(name, OTHER_COLOR), files] - for name, (_, _, files) in cl.items()), - key=lambda x: x[2], reverse=True, + ( + [name, NAME_COLOR.get(name, OTHER_COLOR), files] + for name, (_, _, files) in cl.items() + ), + key=lambda x: x[2], + reverse=True, ) entry["f"] = ftypes[:4] commits_list.append(entry) date_range = ( - {"start": min(all_dates), "end": max(all_dates)} - if all_dates - else {"start": "", "end": ""} + {"start": min(all_dates), "end": max(all_dates)} if all_dates else {"start": "", "end": ""} ) return { "repoName": repo_name, @@ -2025,9 +2079,7 @@ def enrich_contributor_profiles(contributors, commits_meta, github_base, token=N missing = [c for c in contributors if not c.get("login")] if missing: - sample = _sample_oids_per_email( - commits_meta, {c["email"] for c in 
missing} - ) + sample = _sample_oids_per_email(commits_meta, {c["email"] for c in missing}) resolved = fetch_logins_for_commits( origin.group("owner"), origin.group("repo"), sample, token ) @@ -2221,8 +2273,9 @@ def render_markdown(data): def main(): - (top_n, remote, output, no_open, no_cache, clone, - commits_filter, since, until, formats) = parse_args(sys.argv[1:]) + (top_n, remote, output, no_open, no_cache, clone, commits_filter, since, until, formats) = ( + parse_args(sys.argv[1:]) + ) token = None if remote: @@ -2278,13 +2331,9 @@ def main(): ) else: try: - subprocess.check_output( - ["git", "rev-parse", "--git-dir"], stderr=subprocess.DEVNULL - ) + subprocess.check_output(["git", "rev-parse", "--git-dir"], stderr=subprocess.DEVNULL) except subprocess.CalledProcessError: - sys.exit( - "error: not in a git repository (and no owner/repo argument given)" - ) + sys.exit("error: not in a git repository (and no owner/repo argument given)") ( repo_name, github_base, @@ -2307,9 +2356,7 @@ def main(): commits_meta, line_stats = apply_filters( commits_meta, line_stats, commits_filter, since, until ) - print( - f" filtered: {len(commits_meta)}/{total_before} commits", file=sys.stderr - ) + print(f" filtered: {len(commits_meta)}/{total_before} commits", file=sys.stderr) if not commits_meta: sys.exit("error: no commits match the given filters") # The merge tally is whole-history (collected before filtering, and diff --git a/gen_techdata.py b/gen_techdata.py index f9f8bee..b2ca742 100644 --- a/gen_techdata.py +++ b/gen_techdata.py @@ -25,8 +25,12 @@ import urllib.request from pathlib import Path -LANGUAGES_YML = "https://raw.githubusercontent.com/github-linguist/linguist/master/lib/linguist/languages.yml" -VENDOR_YML = "https://raw.githubusercontent.com/github-linguist/linguist/master/lib/linguist/vendor.yml" +LANGUAGES_YML = ( + "https://raw.githubusercontent.com/github-linguist/linguist/master/lib/linguist/languages.yml" +) +VENDOR_YML = ( + 
"https://raw.githubusercontent.com/github-linguist/linguist/master/lib/linguist/vendor.yml" +) OUT = Path(__file__).resolve().parent / "techdata.json" @@ -37,16 +41,30 @@ # (e.g. `.md` → "GCC Machine Description"). This tiebreaker layer pins the few # that users actually notice; the chosen name must be a real Linguist language. EXT_OVERRIDE = { - "md": "Markdown", "markdown": "Markdown", "h": "C", "m": "Objective-C", - "r": "R", "pl": "Perl", "t": "Perl", "l": "Common Lisp", "v": "Verilog", - "f": "Fortran", "for": "Fortran", "cls": "Apex", "pro": "Prolog", - "ts": "TypeScript", "rs": "Rust", "cs": "C#", "sql": "SQL", + "md": "Markdown", + "markdown": "Markdown", + "h": "C", + "m": "Objective-C", + "r": "R", + "pl": "Perl", + "t": "Perl", + "l": "Common Lisp", + "v": "Verilog", + "f": "Fortran", + "for": "Fortran", + "cls": "Apex", + "pro": "Prolog", + "ts": "TypeScript", + "rs": "Rust", + "cs": "C#", + "sql": "SQL", # Linguist's "Gettext Catalog" (.po/.pot) is type:prose and ships no color, # so it'd be dropped and translation catalogs would vanish into "Other". # Re-pin them under Linguist's own name with the gold GitHub falls back to # for colorless languages (Primer --bgColor-attention-emphasis; see # SYNTHETIC_COLORS). - "po": "Gettext Catalog", "pot": "Gettext Catalog", + "po": "Gettext Catalog", + "pot": "Gettext Catalog", } # Generic extensions whose canonical Linguist owner is the colorless "Text" @@ -59,89 +77,172 @@ # Curated web/npm dependency → framework display name. Vercel/Netlify answer a # different question (deploy presets), so this is maintained directly. 
CURATED_WEB = { - "react": "React", "react-dom": "React", "next": "Next.js", - "vue": "Vue", "nuxt": "Nuxt", "@angular/core": "Angular", - "svelte": "Svelte", "@sveltejs/kit": "SvelteKit", - "solid-js": "SolidJS", "preact": "Preact", "astro": "Astro", - "gatsby": "Gatsby", "@remix-run/react": "Remix", - "express": "Express", "koa": "Koa", "fastify": "Fastify", - "@nestjs/core": "NestJS", "@hapi/hapi": "hapi", - "electron": "Electron", "react-native": "React Native", - "expo": "Expo", "@ionic/core": "Ionic", - "vite": "Vite", "webpack": "webpack", "rollup": "Rollup", - "esbuild": "esbuild", "parcel": "Parcel", - "tailwindcss": "Tailwind CSS", "bootstrap": "Bootstrap", - "@mui/material": "MUI", "@chakra-ui/react": "Chakra UI", + "react": "React", + "react-dom": "React", + "next": "Next.js", + "vue": "Vue", + "nuxt": "Nuxt", + "@angular/core": "Angular", + "svelte": "Svelte", + "@sveltejs/kit": "SvelteKit", + "solid-js": "SolidJS", + "preact": "Preact", + "astro": "Astro", + "gatsby": "Gatsby", + "@remix-run/react": "Remix", + "express": "Express", + "koa": "Koa", + "fastify": "Fastify", + "@nestjs/core": "NestJS", + "@hapi/hapi": "hapi", + "electron": "Electron", + "react-native": "React Native", + "expo": "Expo", + "@ionic/core": "Ionic", + "vite": "Vite", + "webpack": "webpack", + "rollup": "Rollup", + "esbuild": "esbuild", + "parcel": "Parcel", + "tailwindcss": "Tailwind CSS", + "bootstrap": "Bootstrap", + "@mui/material": "MUI", + "@chakra-ui/react": "Chakra UI", "styled-components": "styled-components", - "jest": "Jest", "vitest": "Vitest", "mocha": "Mocha", - "playwright": "Playwright", "@playwright/test": "Playwright", - "cypress": "Cypress", "puppeteer": "Puppeteer", "testcafe": "TestCafe", + "jest": "Jest", + "vitest": "Vitest", + "mocha": "Mocha", + "playwright": "Playwright", + "@playwright/test": "Playwright", + "cypress": "Cypress", + "puppeteer": "Puppeteer", + "testcafe": "TestCafe", "@testing-library/react": "Testing Library", "@testing-library/vue": 
"Testing Library", "@testing-library/dom": "Testing Library", - "eslint": "ESLint", "prettier": "Prettier", "@biomejs/biome": "Biome", + "eslint": "ESLint", + "prettier": "Prettier", + "@biomejs/biome": "Biome", # Storybook ships across many scoped packages; the framework adapters below # cover both apps that embed it and addons that declare it as a peer dep. - "storybook": "Storybook", "@storybook/react": "Storybook", - "@storybook/vue3": "Storybook", "@storybook/angular": "Storybook", - "@storybook/svelte": "Storybook", "@storybook/html": "Storybook", - "@storybook/web-components": "Storybook", "@storybook/preact": "Storybook", + "storybook": "Storybook", + "@storybook/react": "Storybook", + "@storybook/vue3": "Storybook", + "@storybook/angular": "Storybook", + "@storybook/svelte": "Storybook", + "@storybook/html": "Storybook", + "@storybook/web-components": "Storybook", + "@storybook/preact": "Storybook", # Monorepo / task runners. - "turbo": "Turborepo", "nx": "Nx", "@nx/workspace": "Nx", + "turbo": "Turborepo", + "nx": "Nx", + "@nx/workspace": "Nx", # Transpilers. 
- "@swc/core": "SWC", "@babel/core": "Babel", - "redux": "Redux", "@reduxjs/toolkit": "Redux", "zustand": "Zustand", - "@apollo/client": "Apollo", "graphql": "GraphQL", - "@trpc/server": "tRPC", "@trpc/client": "tRPC", - "prisma": "Prisma", "@prisma/client": "Prisma", - "drizzle-orm": "Drizzle", "typeorm": "TypeORM", - "mongoose": "Mongoose", "sequelize": "Sequelize", - "three": "three.js", "d3": "D3", "chart.js": "Chart.js", + "@swc/core": "SWC", + "@babel/core": "Babel", + "redux": "Redux", + "@reduxjs/toolkit": "Redux", + "zustand": "Zustand", + "@apollo/client": "Apollo", + "graphql": "GraphQL", + "@trpc/server": "tRPC", + "@trpc/client": "tRPC", + "prisma": "Prisma", + "@prisma/client": "Prisma", + "drizzle-orm": "Drizzle", + "typeorm": "TypeORM", + "mongoose": "Mongoose", + "sequelize": "Sequelize", + "three": "three.js", + "d3": "D3", + "chart.js": "Chart.js", } # Web/JS sentinel files: basename → framework (assigned to the JS/TS bucket). CURATED_SENTINELS_JS = [ - ["next.config.js", "Next.js"], ["next.config.ts", "Next.js"], - ["next.config.mjs", "Next.js"], ["nuxt.config.js", "Nuxt"], - ["nuxt.config.ts", "Nuxt"], ["svelte.config.js", "Svelte"], - ["astro.config.mjs", "Astro"], ["vue.config.js", "Vue"], - ["gatsby-config.js", "Gatsby"], ["angular.json", "Angular"], + ["next.config.js", "Next.js"], + ["next.config.ts", "Next.js"], + ["next.config.mjs", "Next.js"], + ["nuxt.config.js", "Nuxt"], + ["nuxt.config.ts", "Nuxt"], + ["svelte.config.js", "Svelte"], + ["astro.config.mjs", "Astro"], + ["vue.config.js", "Vue"], + ["gatsby-config.js", "Gatsby"], + ["angular.json", "Angular"], ] # Backend frameworks Vercel/Netlify don't cover — keyed by language, then # dependency name → display name. Matched as whole words in manifest text. 
CURATED_BACKEND = { "Python": { - "django": "Django", "djangorestframework": "Django REST", - "flask": "Flask", "fastapi": "FastAPI", "starlette": "Starlette", - "tornado": "Tornado", "aiohttp": "aiohttp", "sanic": "Sanic", - "pyramid": "Pyramid", "sqlalchemy": "SQLAlchemy", "pydantic": "Pydantic", - "celery": "Celery", "scrapy": "Scrapy", "numpy": "NumPy", - "pandas": "pandas", "scipy": "SciPy", "scikit-learn": "scikit-learn", - "tensorflow": "TensorFlow", "torch": "PyTorch", "keras": "Keras", - "transformers": "Transformers", "matplotlib": "Matplotlib", - "pytest": "pytest", "click": "Click", "typer": "Typer", - "requests": "Requests", "httpx": "HTTPX", + "django": "Django", + "djangorestframework": "Django REST", + "flask": "Flask", + "fastapi": "FastAPI", + "starlette": "Starlette", + "tornado": "Tornado", + "aiohttp": "aiohttp", + "sanic": "Sanic", + "pyramid": "Pyramid", + "sqlalchemy": "SQLAlchemy", + "pydantic": "Pydantic", + "celery": "Celery", + "scrapy": "Scrapy", + "numpy": "NumPy", + "pandas": "pandas", + "scipy": "SciPy", + "scikit-learn": "scikit-learn", + "tensorflow": "TensorFlow", + "torch": "PyTorch", + "keras": "Keras", + "transformers": "Transformers", + "matplotlib": "Matplotlib", + "pytest": "pytest", + "click": "Click", + "typer": "Typer", + "requests": "Requests", + "httpx": "HTTPX", }, "Ruby": { - "rails": "Rails", "sinatra": "Sinatra", "hanami": "Hanami", - "rspec": "RSpec", "sidekiq": "Sidekiq", "puma": "Puma", "devise": "Devise", + "rails": "Rails", + "sinatra": "Sinatra", + "hanami": "Hanami", + "rspec": "RSpec", + "sidekiq": "Sidekiq", + "puma": "Puma", + "devise": "Devise", }, "Go": { - "github.com/gin-gonic/gin": "Gin", "github.com/labstack/echo": "Echo", - "github.com/gofiber/fiber": "Fiber", "github.com/gorilla/mux": "Gorilla", - "gorm.io/gorm": "GORM", "github.com/spf13/cobra": "Cobra", - "github.com/go-chi/chi": "chi", "google.golang.org/grpc": "gRPC", + "github.com/gin-gonic/gin": "Gin", + "github.com/labstack/echo": "Echo", + 
"github.com/gofiber/fiber": "Fiber", + "github.com/gorilla/mux": "Gorilla", + "gorm.io/gorm": "GORM", + "github.com/spf13/cobra": "Cobra", + "github.com/go-chi/chi": "chi", + "google.golang.org/grpc": "gRPC", }, "Rust": { - "actix-web": "Actix Web", "axum": "Axum", "rocket": "Rocket", - "warp": "warp", "tokio": "Tokio", "serde": "Serde", "diesel": "Diesel", - "tonic": "Tonic", "clap": "clap", "bevy": "Bevy", "tauri": "Tauri", + "actix-web": "Actix Web", + "axum": "Axum", + "rocket": "Rocket", + "warp": "warp", + "tokio": "Tokio", + "serde": "Serde", + "diesel": "Diesel", + "tonic": "Tonic", + "clap": "clap", + "bevy": "Bevy", + "tauri": "Tauri", }, "PHP": { - "laravel/framework": "Laravel", "symfony/symfony": "Symfony", - "symfony/framework-bundle": "Symfony", "slim/slim": "Slim", - "cakephp/cakephp": "CakePHP", "yiisoft/yii2": "Yii", + "laravel/framework": "Laravel", + "symfony/symfony": "Symfony", + "symfony/framework-bundle": "Symfony", + "slim/slim": "Slim", + "cakephp/cakephp": "CakePHP", + "yiisoft/yii2": "Yii", }, } @@ -197,8 +298,7 @@ def parse_languages_yml(text): m = re.match(r'^(?:"([^"]+)"|\'([^\']+)\'|([^:]+)):\s*$', raw) if m: name = m.group(1) or m.group(2) or m.group(3) - cur = {"type": "", "color": "", "group": "", - "extensions": [], "filenames": []} + cur = {"type": "", "color": "", "group": "", "extensions": [], "filenames": []} langs[name] = cur listkey = None else: @@ -206,12 +306,12 @@ def parse_languages_yml(text): continue if cur is None: continue - item = re.match(r'^ - (.*)$', raw) + item = re.match(r"^ - (.*)$", raw) if item and listkey: val = item.group(1).strip().strip('"').strip("'") cur[listkey].append(val) continue - prop = re.match(r'^ (\w+):\s*(.*)$', raw) + prop = re.match(r"^ (\w+):\s*(.*)$", raw) if prop: key, val = prop.group(1), prop.group(2).strip() if key in ("extensions", "filenames") and val == "": @@ -261,7 +361,7 @@ def build_language_tables(langs): for fn in info.get("filenames", []): 
filename_lang.setdefault(fn.lower(), eff) name_color.update(SYNTHETIC_COLORS) # synthetic buckets Linguist doesn't color; - # merged before EXT_OVERRIDE so its guard sees them + # merged before EXT_OVERRIDE so its guard sees them for ext, lang in EXT_OVERRIDE.items(): if lang in name_color: ext_lang[ext] = lang @@ -273,7 +373,7 @@ def build_language_tables(langs): def parse_vendor_yml(text): out = [] for line in text.splitlines(): - m = re.match(r'^- (.*)$', line) + m = re.match(r"^- (.*)$", line) if m: out.append(m.group(1).strip()) return out @@ -283,8 +383,7 @@ def main(): print("fetching Linguist languages.yml…", file=sys.stderr) langs = parse_languages_yml(fetch(LANGUAGES_YML)) name_color, ext_lang, filename_lang = build_language_tables(langs) - print(f" {len(name_color)} colored languages, {len(ext_lang)} extensions", - file=sys.stderr) + print(f" {len(name_color)} colored languages, {len(ext_lang)} extensions", file=sys.stderr) print("fetching Linguist vendor.yml…", file=sys.stderr) vendor = parse_vendor_yml(fetch(VENDOR_YML)) diff --git a/repo-intel.py b/repo-intel.py index c615e3e..d9085e2 100755 --- a/repo-intel.py +++ b/repo-intel.py @@ -70,6 +70,7 @@ file per repo. Re-runs only fetch new commits. 
""" +import contextlib import hashlib import json import os @@ -90,9 +91,7 @@ r"^(?:https?://(?P[^/]+)/|git@(?P[^:]+):)" r"(?P[^/]+)/(?P.+?)(?:\.git)?/?$" ) -CACHE_DIR = ( - Path(os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache")) / "repo-intel" -) +CACHE_DIR = Path(os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache")) / "repo-intel" def parse_iso_instant(s): @@ -137,8 +136,7 @@ def save_cache(slug, nodes, complete): cache_path(slug).write_text(json.dumps({"nodes": nodes, "complete": complete})) -def needs_older_fetch(have_count, cached_oldest_date, prev_complete, - commits_filter, since, until): +def needs_older_fetch(have_count, cached_oldest_date, prev_complete, commits_filter, since, until): """Should we paginate below the oldest cached commit after top-fetch? have_count: len(new_nodes) + len(cached_nodes) after the top-fetch. @@ -190,9 +188,7 @@ def parse_formats(val, acc): if not name: continue if name not in VALID_FORMATS: - raise ValueError( - f"--format must be one of {', '.join(VALID_FORMATS)} (got {name!r})" - ) + raise ValueError(f"--format must be one of {', '.join(VALID_FORMATS)} (got {name!r})") if name not in acc: acc.append(name) return acc @@ -216,7 +212,7 @@ def take_value(name): sys.exit(2) return argv[i + 1], 2 if tok.startswith(name + "="): - return tok[len(name) + 1:], 1 + return tok[len(name) + 1 :], 1 return None, 0 while i < len(argv): @@ -296,8 +292,7 @@ def take_value(name): sys.exit(2) if not formats: formats = ["html"] - return (top_n, remote, output, no_open, no_cache, clone, - commits_filter, since, until, formats) + return (top_n, remote, output, no_open, no_cache, clone, commits_filter, since, until, formats) def login_from_email(email): @@ -348,11 +343,11 @@ def _load_techdata(): _TECH = _load_techdata() _LANG = _TECH.get("lang", {}) -EXT_LANG = _LANG.get("ext", {}) # extension (no dot, lower) -> language +EXT_LANG = _LANG.get("ext", {}) # extension (no dot, lower) -> language FILENAME_LANG = 
_LANG.get("filename", {}) # lowercased filename -> language -NAME_COLOR = _LANG.get("color", {}) # language -> hex color -FW_DEPS = _TECH.get("fw_deps", {}) # {ecosystem: {dependency: framework}} -FW_SENTINELS_JS = _TECH.get("fw_sentinels_js", []) # [[basename, framework]] +NAME_COLOR = _LANG.get("color", {}) # language -> hex color +FW_DEPS = _TECH.get("fw_deps", {}) # {ecosystem: {dependency: framework}} +FW_SENTINELS_JS = _TECH.get("fw_sentinels_js", []) # [[basename, framework]] FW_SENTINELS_OTHER = _TECH.get("fw_sentinels_other", []) # [[path, framework, lang]] @@ -376,11 +371,22 @@ def _compile_vendor(patterns): # Lockfiles Linguist classifies as *generated* (handled in code, not vendor.yml) # — kept as a small supplement so they don't dominate the language bar. -NOISE_BASENAMES = frozenset({ - "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "npm-shrinkwrap.json", - "composer.lock", "cargo.lock", "gemfile.lock", "poetry.lock", "go.sum", - "pdm.lock", "uv.lock", "flake.lock", -}) +NOISE_BASENAMES = frozenset( + { + "package-lock.json", + "yarn.lock", + "pnpm-lock.yaml", + "npm-shrinkwrap.json", + "composer.lock", + "cargo.lock", + "gemfile.lock", + "poetry.lock", + "go.sum", + "pdm.lock", + "uv.lock", + "flake.lock", + } +) # Shebang interpreter → language, for extensionless scripts Linguist can't name # from a path alone (e.g. `bin/deploy` with `#!/usr/bin/env bash`). A small @@ -388,11 +394,24 @@ def _compile_vendor(patterns): # stripped (`python3` → `python`) before lookup. Names must be real Linguist # languages so they pick up a color. 
SHEBANG_LANG = { - "sh": "Shell", "bash": "Shell", "zsh": "Shell", "dash": "Shell", - "ksh": "Shell", "fish": "fish", "python": "Python", "ruby": "Ruby", - "node": "JavaScript", "perl": "Perl", "awk": "Awk", "gawk": "Awk", - "lua": "Lua", "php": "PHP", "rscript": "R", "tclsh": "Tcl", - "groovy": "Groovy", "osascript": "AppleScript", + "sh": "Shell", + "bash": "Shell", + "zsh": "Shell", + "dash": "Shell", + "ksh": "Shell", + "fish": "fish", + "python": "Python", + "ruby": "Ruby", + "node": "JavaScript", + "perl": "Perl", + "awk": "Awk", + "gawk": "Awk", + "lua": "Lua", + "php": "PHP", + "rscript": "R", + "tclsh": "Tcl", + "groovy": "Groovy", + "osascript": "AppleScript", } @@ -424,8 +443,8 @@ def numstat_newpath(field): lo = field.find("{") hi = field.find("}", lo) if lo != -1 else -1 if lo != -1 and hi != -1 and " => " in field[lo:hi]: - new = field[lo + 1:hi].split(" => ", 1)[1] - return field[:lo] + new + field[hi + 1:] + new = field[lo + 1 : hi].split(" => ", 1)[1] + return field[:lo] + new + field[hi + 1 :] return field.split(" => ", 1)[1] @@ -452,7 +471,7 @@ def classify_path(field, present=None, shebang=None): return FILENAME_LANG[base] dot = base.rfind(".") if dot > 0: - lang = EXT_LANG.get(base[dot + 1:]) + lang = EXT_LANG.get(base[dot + 1 :]) if lang: return lang if shebang and path in shebang: # extensionless/unknown but has a #! line @@ -488,13 +507,15 @@ def top_languages(langs, limit=10): existing["lines"] += overflow existing["pct"] = round(existing["lines"] * 100 / total, 1) else: - out.append({ - "name": OTHER_LANG, - "lines": overflow, - "files": 0, - "pct": round(overflow * 100 / total, 1), - "color": OTHER_COLOR, - }) + out.append( + { + "name": OTHER_LANG, + "lines": overflow, + "files": 0, + "pct": round(overflow * 100 / total, 1), + "color": OTHER_COLOR, + } + ) return out @@ -521,9 +542,7 @@ def _head_first_line(path, cwd=None): """First line of `path` at HEAD, decoded leniently, or "". 
Reads bytes so a stray binary doesn't crash the utf-8 decode `git(text=True)` would attempt.""" try: - out = subprocess.run( - ["git", "show", f"HEAD:{path}"], cwd=cwd, capture_output=True - ).stdout + out = subprocess.run(["git", "show", f"HEAD:{path}"], cwd=cwd, capture_output=True).stdout except OSError: return "" nl = out.find(b"\n") @@ -649,11 +668,13 @@ def gather(bases, requirements=False): groups = [] for lang in sorted(found, key=lambda L: (-len(found[L]), L)): - groups.append({ - "language": lang, - "color": NAME_COLOR.get(lang, OTHER_COLOR), - "names": found[lang][:15], - }) + groups.append( + { + "language": lang, + "color": NAME_COLOR.get(lang, OTHER_COLOR), + "names": found[lang][:15], + } + ) return groups @@ -693,9 +714,7 @@ def ensure_bare_clone(owner, repo, no_cache): ) elif not no_cache: print(" updating cached bare clone…", file=sys.stderr) - subprocess.run( - ["git", "fetch", "--quiet", "origin"], cwd=clone_dir, check=False - ) + subprocess.run(["git", "fetch", "--quiet", "origin"], cwd=clone_dir, check=False) _CLONE_REFRESHED.add(clone_dir) return clone_dir @@ -783,7 +802,9 @@ def head_file_sizes(cwd=None, limit=40): # the run — the trap _head_first_line documents and sidesteps the same way. out = subprocess.run( ["git", "-c", "core.quotePath=false", "ls-tree", "-r", "-l", "HEAD"], - cwd=cwd, capture_output=True, check=True, + cwd=cwd, + capture_output=True, + check=True, ).stdout.decode("utf-8", "replace") except (subprocess.CalledProcessError, OSError): return None @@ -816,7 +837,8 @@ def history_disk_by_path(cwd=None, limit=40): # once — this walks every reachable object and can be huge on big repos. revs = subprocess.Popen( ["git", "-c", "core.quotePath=false", "rev-list", "--objects", "--all"], - cwd=cwd, stdout=subprocess.PIPE, + cwd=cwd, + stdout=subprocess.PIPE, ) # `%(rest)` echoes the path rev-list appended after each blob's oid. 
# No text=True: read bytes and decode each line leniently, so a non-UTF-8 @@ -824,7 +846,9 @@ def history_disk_by_path(cwd=None, limit=40): # decode and abort the run — same trap _head_first_line sidesteps. cat = subprocess.Popen( ["git", "cat-file", "--batch-check=%(objecttype) %(objectsize:disk) %(rest)"], - cwd=cwd, stdin=revs.stdout, stdout=subprocess.PIPE, + cwd=cwd, + stdin=revs.stdout, + stdout=subprocess.PIPE, ) # Drop our handle so rev-list gets SIGPIPE if cat-file exits early. revs.stdout.close() @@ -872,8 +896,12 @@ def count_branches(cwd=None): the union covers both. None when no refs resolve.""" try: out = git( - "for-each-ref", "--format=%(refname)", - "refs/heads", "refs/remotes", cwd=cwd, quiet=True, + "for-each-ref", + "--format=%(refname)", + "refs/heads", + "refs/remotes", + cwd=cwd, + quiet=True, ) except subprocess.CalledProcessError: return None @@ -881,11 +909,11 @@ def count_branches(cwd=None): for line in out.splitlines(): ref = line.strip() if ref.startswith("refs/heads/"): - names.add(ref[len("refs/heads/"):]) + names.add(ref[len("refs/heads/") :]) elif ref.startswith("refs/remotes/"): # refs/remotes// — drop the remote and skip the # symbolic origin/HEAD pointer so it isn't counted as a branch. - _, _, branch = ref[len("refs/remotes/"):].partition("/") + _, _, branch = ref[len("refs/remotes/") :].partition("/") if branch and branch != "HEAD": names.add(branch) return len(names) or None @@ -944,16 +972,17 @@ def collect_local(cwd=None, suppress_current_user=False): current_email = "" if not suppress_current_user: - try: + with contextlib.suppress(subprocess.CalledProcessError): current_email = git("config", "user.email", cwd=cwd).strip().lower() - except subprocess.CalledProcessError: - pass log = git( # -c core.quotePath=false: keep non-ASCII paths raw so log paths match # the `present` set from the HEAD tree below (both feed classify_path). 
- "-c", "core.quotePath=false", - "log", "--no-merges", "-M", + "-c", + "core.quotePath=false", + "log", + "--no-merges", + "-M", "--format=%H\x1f%s\x1f%aE\x1f%aN\x1f%aI", "--numstat", cwd=cwd, @@ -1247,9 +1276,7 @@ def fetch_user_profiles(logins, token): "repositories(privacy: PUBLIC, ownerAffiliations: OWNER) { totalCount }" ) var_decls = ", ".join(f"$l{i}: String!" for i in range(len(unique))) - fragments = " ".join( - f"u{i}: user(login: $l{i}) {{ {fields} }}" for i in range(len(unique)) - ) + fragments = " ".join(f"u{i}: user(login: $l{i}) {{ {fields} }}" for i in range(len(unique))) query = f"query({var_decls}) {{ {fragments} }}" variables = {f"l{i}": login for i, login in enumerate(unique)} @@ -1333,12 +1360,14 @@ def fetch_remote_tags(owner, repo, token): continue if not oid or not date: continue - tags.append({ - "name": node.get("name") or "", - "oid": oid, - "date": date, - "message": (message.splitlines() or [""])[0], - }) + tags.append( + { + "name": node.get("name") or "", + "oid": oid, + "date": date, + "message": (message.splitlines() or [""])[0], + } + ) page = refs.get("pageInfo") or {} if not page.get("hasNextPage"): break @@ -1363,10 +1392,19 @@ def gh_rest_get(path, token): # Manifests _frameworks_from_files actually parses (so we only fetch those # blobs). tsconfig.json / sentinels are presence-only — covered by the tree. 
-_REMOTE_MANIFEST_BASES = frozenset({ - "package.json", "composer.json", "pyproject.toml", "pipfile", - "setup.py", "setup.cfg", "gemfile", "go.mod", "cargo.toml", -}) +_REMOTE_MANIFEST_BASES = frozenset( + { + "package.json", + "composer.json", + "pyproject.toml", + "pipfile", + "setup.py", + "setup.cfg", + "gemfile", + "go.mod", + "cargo.toml", + } +) def _remote_manifest_paths(paths): @@ -1385,7 +1423,7 @@ def fetch_blob_texts(owner, repo, paths, token): out = {} paths = list(paths) for start in range(0, len(paths), 50): - chunk = paths[start:start + 50] + chunk = paths[start : start + 50] var_decls = ", ".join(f"$p{i}: String!" for i in range(len(chunk))) frags = " ".join( f"b{i}: object(expression: $p{i}) {{ ... on Blob {{ text }} }}" @@ -1430,8 +1468,7 @@ def fetch_frameworks_remote(owner, repo, token): # GitHub caps the recursive tree at ~100k entries / 7MB; deep manifests # past the cap are dropped, so detection may miss frameworks silently. print( - " warning: repo tree truncated by GitHub — framework detection " - "may be incomplete", + " warning: repo tree truncated by GitHub — framework detection may be incomplete", file=sys.stderr, ) paths = [e["path"] for e in (tree.get("tree") or []) if e.get("type") == "blob"] @@ -1479,8 +1516,9 @@ def fetch_languages_remote(owner, repo, token): return top_languages(langs) -def _paginate_history(fetch_page, cached_oids, last_n, since, - have_count_baseline, label, skip_first=False): +def _paginate_history( + fetch_page, cached_oids, last_n, since, have_count_baseline, label, skip_first=False +): """Walk a Commit.history connection page by page. fetch_page(cursor) -> history dict, or None when the anchor object is gone. 
@@ -1636,8 +1674,12 @@ def bail_partial(nodes): sys.exit("error: GitHub fetch failed after repeated retries; aborting.") new_nodes, top_reason = _paginate_history( - top_fetch_page, cached_oids, last_n, since, - have_count_baseline=len(cached_nodes), label="new", + top_fetch_page, + cached_oids, + last_n, + since, + have_count_baseline=len(cached_nodes), + label="new", ) if top_reason == "fetch_failed": @@ -1661,12 +1703,15 @@ def bail_partial(nodes): bottom_reason = None have_count = len(new_nodes) + len(cached_nodes) cached_oldest_date = ( - ((cached_nodes[-1].get("author") or {}).get("date") or "")[:10] - if cached_nodes else "" + ((cached_nodes[-1].get("author") or {}).get("date") or "")[:10] if cached_nodes else "" ) if needs_older_fetch( - have_count, cached_oldest_date, loaded_complete, - commits_filter, since, until, + have_count, + cached_oldest_date, + loaded_complete, + commits_filter, + since, + until, ): anchor_oid = cached_nodes[-1]["oid"] @@ -1685,8 +1730,13 @@ def bottom_fetch_page(cursor): return obj.get("history") or {"nodes": [], "pageInfo": {}} older_nodes, bottom_reason = _paginate_history( - bottom_fetch_page, cached_oids, last_n, since, - have_count_baseline=have_count, label="older", skip_first=True, + bottom_fetch_page, + cached_oids, + last_n, + since, + have_count_baseline=have_count, + label="older", + skip_first=True, ) if bottom_reason == "anchor_null": print( @@ -1760,12 +1810,15 @@ def bottom_fetch_page(cursor): def apply_filters(commits_meta, line_stats, commits_filter, since, until): if since or until: + def in_range(m): d = (m.get("iso") or "")[:10] return bool(d) and (not since or d >= since) and (not until or d <= until) + commits_meta = {h: m for h, m in commits_meta.items() if in_range(m)} if commits_filter: epoch = datetime(1970, 1, 1, tzinfo=timezone.utc) + def _ts(h): iso = commits_meta[h].get("iso") or "" if not iso: @@ -1774,11 +1827,12 @@ def _ts(h): return datetime.fromisoformat(iso.replace("Z", "+00:00")) except 
ValueError: return epoch + ordered = sorted(commits_meta, key=_ts) if commits_filter[0] == "last": - keep = set(ordered[-commits_filter[1]:]) + keep = set(ordered[-commits_filter[1] :]) else: - keep = set(ordered[commits_filter[1]:commits_filter[2]]) + keep = set(ordered[commits_filter[1] : commits_filter[2]]) commits_meta = {h: m for h, m in commits_meta.items() if h in keep} line_stats = {h: line_stats[h] for h in commits_meta if h in line_stats} return commits_meta, line_stats @@ -1926,8 +1980,7 @@ def build_data( weeks_sorted = sorted(all_weeks) weekly_data = { - r["email"]: [weekly_by_author[r["email"]].get(w, 0) for w in weeks_sorted] - for r in top + r["email"]: [weekly_by_author[r["email"]].get(w, 0) for w in weeks_sorted] for r in top } daily_data = {r["email"]: dict(daily_by_author[r["email"]]) for r in top} @@ -1947,17 +2000,18 @@ def build_data( cl = lang_stats.get(h) if cl: ftypes = sorted( - ([name, NAME_COLOR.get(name, OTHER_COLOR), files] - for name, (_, _, files) in cl.items()), - key=lambda x: x[2], reverse=True, + ( + [name, NAME_COLOR.get(name, OTHER_COLOR), files] + for name, (_, _, files) in cl.items() + ), + key=lambda x: x[2], + reverse=True, ) entry["f"] = ftypes[:4] commits_list.append(entry) date_range = ( - {"start": min(all_dates), "end": max(all_dates)} - if all_dates - else {"start": "", "end": ""} + {"start": min(all_dates), "end": max(all_dates)} if all_dates else {"start": "", "end": ""} ) return { "repoName": repo_name, @@ -2025,9 +2079,7 @@ def enrich_contributor_profiles(contributors, commits_meta, github_base, token=N missing = [c for c in contributors if not c.get("login")] if missing: - sample = _sample_oids_per_email( - commits_meta, {c["email"] for c in missing} - ) + sample = _sample_oids_per_email(commits_meta, {c["email"] for c in missing}) resolved = fetch_logins_for_commits( origin.group("owner"), origin.group("repo"), sample, token ) @@ -2221,8 +2273,9 @@ def render_markdown(data): def main(): - (top_n, remote, 
output, no_open, no_cache, clone, - commits_filter, since, until, formats) = parse_args(sys.argv[1:]) + (top_n, remote, output, no_open, no_cache, clone, commits_filter, since, until, formats) = ( + parse_args(sys.argv[1:]) + ) token = None if remote: @@ -2278,13 +2331,9 @@ def main(): ) else: try: - subprocess.check_output( - ["git", "rev-parse", "--git-dir"], stderr=subprocess.DEVNULL - ) + subprocess.check_output(["git", "rev-parse", "--git-dir"], stderr=subprocess.DEVNULL) except subprocess.CalledProcessError: - sys.exit( - "error: not in a git repository (and no owner/repo argument given)" - ) + sys.exit("error: not in a git repository (and no owner/repo argument given)") ( repo_name, github_base, @@ -2307,9 +2356,7 @@ def main(): commits_meta, line_stats = apply_filters( commits_meta, line_stats, commits_filter, since, until ) - print( - f" filtered: {len(commits_meta)}/{total_before} commits", file=sys.stderr - ) + print(f" filtered: {len(commits_meta)}/{total_before} commits", file=sys.stderr) if not commits_meta: sys.exit("error: no commits match the given filters") # The merge tally is whole-history (collected before filtering, and diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000..90fcc39 --- /dev/null +++ b/ruff.toml @@ -0,0 +1,16 @@ +# Ruff config — Python linting + formatting. Dev/CI only; never shipped. +# repo-intel stays stdlib-only and single-file, so Ruff is not a runtime dep. +line-length = 100 # matches Prettier's printWidth on the web side +target-version = "py39" + +[lint] +select = ["E", "F", "W", "I", "B", "UP", "SIM", "C4"] +# Line length is enforced by `ruff format`, not the linter — the formatter +# can't wrap strings/comments, so E501 would only nag about lines it can't fix. +ignore = ["E501"] + +[lint.per-file-ignores] +# The HELP constant intentionally sits above the imports so the file opens +# like a man page (see the top of repo-intel.py). 
The imports are plain +# module-level imports, not lazy/conditional, so E402 here isn't a real smell. +"repo-intel.py" = ["E402"]