Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ flowchart TD
- An existing internal repo with an `origin` remote
- An existing public repo with at least one commit (e.g. a README created during repo setup)
- *(Optional)* [`gh` CLI](https://cli.github.com/) authenticated via `gh auth login`. Enables automatic PR creation for GitHub-hosted repos. Without it, pubgate logs the manual steps instead.
- *(Optional)* [Git LFS](https://git-lfs.com/), needed only if your repo uses LFS-tracked files. pubgate auto-detects LFS and handles pointer files automatically. If Git LFS is not installed, pubgate silently skips its LFS-specific operations (`git lfs fetch`/`push`).
- A clean worktree on `main`, synced with `origin` (no uncommitted changes, no unpushed commits)

### Setup
Expand Down Expand Up @@ -208,6 +209,7 @@ ignore = [
## Edge Cases

- **Binary files**: included as-is in staged snapshots (`BEGIN-INTERNAL` markers inside binaries are not processed); during absorb, binary modifications take the public version and are flagged for manual review.
- **Git LFS files**: LFS pointers pass through all pipelines without modification. LFS files are treated as binary (never merged, never scrubbed for internal markers). pubgate runs `git lfs fetch`/`push` automatically during absorb and publish. Use ignore patterns in `pubgate.toml` to exclude sensitive LFS files from publication. If LFS is not installed, these operations are silently skipped.
- **Renames on public repo**: the new path is copied in; the old file is kept locally and flagged for review.
- **Deletions on public repo**: deleted files are kept locally and flagged for review in the absorb PR.
- **Merge conflicts**: absorb uses three-way merge. Conflicts produce standard git conflict markers (`<<<<<<<`/`=======`/`>>>>>>>`) for manual resolution.
Expand Down
13 changes: 13 additions & 0 deletions SPEC.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,16 @@ The result is that divergence between the two repos is always controlled and bou
### State file conflicts on repeated publish without absorb

If the user publishes multiple times without running `absorb` between cycles, each publish PR is based on the same absorbed commit. The `.pubgate-staged` file will have different values on the PR branch versus `public-remote/main` (from the previous publish merge), and both appear as "added" relative to the absorbed base, producing a guaranteed merge conflict on this file. The conflict is trivially resolvable by taking the newer value. Running `absorb` between publish cycles advances the baseline and eliminates this.

### Git LFS support

pubgate supports repositories that use Git LFS. LFS support is auto-detected via `git lfs version` and requires no configuration.

**How it works:** LFS-tracked files are stored as pointer files in git. pubgate reads and writes these pointers as-is; they pass through the snapshot, stage, and publish pipelines without modification. When files are staged with `git add`, git's clean/smudge filters handle the LFS encoding automatically via `.gitattributes`.

**LFS object transfer:** pubgate runs `git lfs fetch` at the start of the `absorb` and `publish` commands to ensure LFS objects are cached locally, and runs `git lfs push` after pushing a branch to transfer its LFS objects to the destination remote's LFS server.

**Limitations:**
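- **LFS files are treated as binary**: they are never merged during `absorb` (a file changed on public overwrites the local copy, a file added on public is copied in, and a file added on public that already exists locally keeps the local version and is flagged for manual review) and never scrubbed for `BEGIN-INTERNAL`/`END-INTERNAL` markers during `stage`. Do not place internal markers inside LFS-tracked files; use ignore patterns in `pubgate.toml` to exclude sensitive LFS files from publication.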
- **`.gitattributes` is included as-is** in the public snapshot (with internal-block scrubbing if markers are present). If the internal `.gitattributes` contains LFS patterns for files excluded by pubgate's ignore rules, those orphaned patterns will appear in the public repo. This is harmless but may be confusing. Use `BEGIN-INTERNAL`/`END-INTERNAL` markers in `.gitattributes` to exclude internal-only LFS patterns.
- **When LFS is not installed**, pubgate's behavior is unchanged; LFS-specific operations (fetch, push) are silently skipped.
44 changes: 30 additions & 14 deletions pubgate/absorb.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from .config import Config
from .errors import GitError, PubGateError
from .filtering import scrub_internal_blocks
from .git import GitRepo
from .git import GitRepo, is_lfs_pointer
from .models import format_commit
from .state import AbsorbStatus, StateRef

Expand Down Expand Up @@ -88,6 +88,13 @@ def absorb_commit_message(
# ---------------------------------------------------------------------------


def _read_text_at_ref(git: GitRepo, ref: str, path: str) -> str | None:
    """Return the contents of *path* at *ref* decoded as UTF-8 text.

    The blob is fetched as raw bytes through ``git.read_file_at_ref_bytes``
    and decoded here, so the bytes are decoded exactly as stored (no
    newline translation is applied by this function).

    Returns:
        The decoded text, or ``None`` when the byte reader returns ``None``.

    Raises:
        UnicodeDecodeError: if the blob is not valid UTF-8.
    """
    raw = git.read_file_at_ref_bytes(ref, path)
    return None if raw is None else raw.decode("utf-8")


def _apply_absorb_changes(
git: GitRepo,
base_sha: str,
Expand All @@ -106,27 +113,29 @@ def _apply_absorb_changes(
if change.is_add:
local_path = git.repo_dir / change.path
if local_path.exists():
if git.is_binary_at_ref(public_ref, change.path):
actions.append(f" added on public (kept local version, review manually): {change.path}")
kind = git.classify_at_ref(public_ref, change.path)
if kind != "text":
label = "LFS file" if kind == "lfs" else "binary"
actions.append(f" {label} added on public (kept local version, review manually): {change.path}")
else:
theirs_content = git.read_file_at_ref(public_ref, change.path)
theirs_content = _read_text_at_ref(git, public_ref, change.path)
if theirs_content is None:
actions.append(f" added on public (kept local version, review manually): {change.path}")
continue
# Try to find the published base: the scrubbed version of
# the file at the internal commit that was staged.
published_base: str | None = None
if staged_sha is not None:
staged_content = git.read_file_at_ref(staged_sha, change.path)
staged_content = _read_text_at_ref(git, staged_sha, change.path)
if staged_content is not None:
published_base = scrub_internal_blocks(staged_content, path=change.path)
if published_base is not None:
# Three-way merge using the published version as base
with tempfile.TemporaryDirectory() as tmpdir:
base_tmp = Path(tmpdir) / "base"
theirs_tmp = Path(tmpdir) / "theirs"
base_tmp.write_text(published_base, encoding="utf-8")
theirs_tmp.write_text(theirs_content, encoding="utf-8")
base_tmp.write_text(published_base, encoding="utf-8", newline="")
theirs_tmp.write_text(theirs_content, encoding="utf-8", newline="")
clean = git.merge_file(local_path, base_tmp, theirs_tmp)
git.stage(change.path)
if clean:
Expand All @@ -138,7 +147,13 @@ def _apply_absorb_changes(
actions.append(f" added on public (kept local, review manually): {change.path}")
else:
is_binary = git.copy_file_from_ref(public_ref, change.path)
actions.append(f" add{' (binary)' if is_binary else ''}: {change.path}")
if is_binary:
with open(git.repo_dir / change.path, "rb") as f:
head = f.read(1024)
tag = " (LFS)" if is_lfs_pointer(head) else " (binary)"
else:
tag = ""
actions.append(f" add{tag}: {change.path}")

elif change.is_modify:
_merge_file(git, base_sha, public_ref, change.path, actions, staged_sha=staged_sha)
Expand Down Expand Up @@ -175,7 +190,8 @@ def _merge_file(
f"at {public_ref}. Repository may have corrupt objects.",
)
git.write_file_and_stage_bytes(path, theirs_bytes)
actions.append(f" binary changed on public (replaced locally, review manually): {path}")
label = "LFS file" if is_lfs_pointer(theirs_bytes) else "binary"
actions.append(f" {label} changed on public (replaced locally, review manually): {path}")
return

# Use the scrubbed staged content as merge base when available.
Expand All @@ -184,12 +200,12 @@ def _merge_file(
# old public content (which would cause false conflicts on internal blocks).
base_content: str | None = None
if staged_sha is not None:
staged_content = git.read_file_at_ref(staged_sha, path)
staged_content = _read_text_at_ref(git, staged_sha, path)
if staged_content is not None:
base_content = scrub_internal_blocks(staged_content, path=path)
if base_content is None:
base_content = git.read_file_at_ref(base_sha, path)
theirs_content = git.read_file_at_ref(public_ref, path)
base_content = _read_text_at_ref(git, base_sha, path)
theirs_content = _read_text_at_ref(git, public_ref, path)

if base_content is None or theirs_content is None:
missing_ref = base_sha if base_content is None else public_ref
Expand All @@ -210,8 +226,8 @@ def _merge_file(
with tempfile.TemporaryDirectory() as tmpdir:
base_tmp = Path(tmpdir) / "base"
theirs_tmp = Path(tmpdir) / "theirs"
base_tmp.write_text(base_content, encoding="utf-8")
theirs_tmp.write_text(theirs_content, encoding="utf-8")
base_tmp.write_text(base_content, encoding="utf-8", newline="")
theirs_tmp.write_text(theirs_content, encoding="utf-8", newline="")

clean = git.merge_file(ours_path, base_tmp, theirs_tmp)
git.stage(path)
Expand Down
5 changes: 5 additions & 0 deletions pubgate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def _bootstrap_work() -> bool:
work_fn=_bootstrap_work,
)
self._push_to_remote(cfg.internal_absorb_branch, "origin", cfg.internal_absorb_branch, force=force)
self.git.lfs_push("origin", cfg.internal_absorb_branch)
title = f"pubgate: initialize absorb tracking at {public_head[:7]}"
self._handle_pr(
remote="origin",
Expand Down Expand Up @@ -175,6 +176,7 @@ def _absorb_work() -> bool:
work_fn=_absorb_work,
)
self._push_to_remote(cfg.internal_absorb_branch, "origin", cfg.internal_absorb_branch, force=force)
self.git.lfs_push("origin", cfg.internal_absorb_branch)
full_msg = absorb_commit_message(git, last_absorbed, public_head)
title, body = _split_message(full_msg)
self._handle_pr(
Expand Down Expand Up @@ -400,6 +402,7 @@ def _publish_work() -> bool:

def _publish_push() -> None:
self._push_to_remote(cfg.public_publish_branch, cfg.public_remote, cfg.public_publish_branch, force=force)
self.git.lfs_push(cfg.public_remote, cfg.public_publish_branch)

committed = self._run_on_pr_branch(
branch=cfg.public_publish_branch,
Expand Down Expand Up @@ -515,6 +518,7 @@ def _absorb_startup(self) -> AbsorbResult:
logger.debug("Starting absorb startup")
self._require_on_main()
self.git.fetch(self.cfg.public_remote)
self.git.lfs_fetch(self.cfg.public_remote, self.cfg.public_main_branch)
self._prune_internal_pr_branches()
self._prune_public_publish_branch()
return check_absorb(self.cfg, self.git)
Expand All @@ -531,6 +535,7 @@ def _publish_startup(self) -> None:
git, cfg = self.git, self.cfg
git.ensure_clean_worktree()
git.fetch("origin")
git.lfs_fetch("origin", cfg.internal_approved_branch)
git.fetch(cfg.public_remote)
self._prune_public_publish_branch()

Expand Down
80 changes: 62 additions & 18 deletions pubgate/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,24 @@
_TIMEOUT_NETWORK = 300


# First line of every Git LFS v1 pointer file, including its trailing newline.
_LFS_POINTER_PREFIX = b"version https://git-lfs.github.com/spec/v1\n"
# Anything longer than this many bytes is never treated as a pointer.
_LFS_POINTER_MAX_LEN = 512


def is_lfs_pointer(data: str | bytes) -> bool:
    """Return True when *data* looks like a Git LFS pointer file.

    Text input is encoded as UTF-8 before inspection. Content qualifies only
    if it is at most ``_LFS_POINTER_MAX_LEN`` bytes long, begins with the LFS
    v1 version line, and contains both an ``oid sha256:`` line and a ``size``
    line.
    """
    raw = data.encode("utf-8") if isinstance(data, str) else data
    within_limit = len(raw) <= _LFS_POINTER_MAX_LEN
    if not (within_limit and raw.startswith(_LFS_POINTER_PREFIX)):
        return False
    # Both required keys must start at the beginning of a line.
    return all(marker in raw for marker in (b"\noid sha256:", b"\nsize "))


class GitRepo:
def __init__(self, repo_dir: Path) -> None:
self.repo_dir = repo_dir
self._lfs_available: bool | None = None

# ------------------------------------------------------------------
# Internal runners
Expand Down Expand Up @@ -309,18 +324,23 @@ def read_file_at_ref_bytes(self, ref: str, path: str) -> bytes | None:
)
return result.stdout

def classify_at_ref(self, ref: str, path: str) -> str:
    """Classify the blob at *ref*:*path* as ``"text"``, ``"lfs"`` or ``"binary"``.

    Returns:
        ``"text"`` when the file cannot be read (the byte reader returned
        ``None``) or when the sampled content is valid UTF-8 without NUL
        bytes; ``"lfs"`` when the content is a Git LFS pointer; ``"binary"``
        otherwise.
    """
    import codecs

    data = self.read_file_at_ref_bytes(ref, path)
    if data is None:
        return "text"
    if is_lfs_pointer(data):
        return "lfs"
    chunk = data[:8192]
    if b"\x00" in chunk:
        return "binary"
    # Only the first 8192 bytes are sampled. When the file is longer, the cut
    # may land in the middle of a multi-byte UTF-8 character; a strict decode
    # would then fail and mislabel a valid text file as binary. The
    # incremental decoder tolerates an incomplete trailing sequence when
    # final=False while still rejecting genuinely invalid bytes.
    truncated = len(data) > len(chunk)
    try:
        codecs.getincrementaldecoder("utf-8")().decode(chunk, final=not truncated)
    except UnicodeDecodeError:
        return "binary"
    return "text"


def is_binary_at_ref(self, ref: str, path: str) -> bool:
    """Return True when the blob at *ref*:*path* is not plain text.

    LFS pointers count as binary here; use ``classify_at_ref`` to tell the
    two apart.
    """
    return self.classify_at_ref(ref, path) != "text"

def read_file_auto(self, ref: str, path: str) -> str | bytes | None:
data = self.read_file_at_ref_bytes(ref, path)
Expand All @@ -345,7 +365,7 @@ def stage(self, path: str) -> None:
def write_file_and_stage(self, repo_relative_path: str, content: str) -> None:
full_path = self.repo_dir / repo_relative_path
full_path.parent.mkdir(parents=True, exist_ok=True)
full_path.write_text(content, encoding="utf-8")
full_path.write_text(content, encoding="utf-8", newline="")
self._run("add", repo_relative_path)

def write_file_and_stage_bytes(self, repo_relative_path: str, content: bytes) -> None:
Expand All @@ -370,19 +390,13 @@ def rm_all_tracked(self) -> None:
self._run("rm", "-rf", "--ignore-unmatch", ".", check=False)

def copy_file_from_ref(self, ref: str, path: str) -> bool:
if self.is_binary_at_ref(ref, path):
content = self.read_file_at_ref_bytes(ref, path)
if content is not None:
self.write_file_and_stage_bytes(path, content)
else:
logger.warning("Could not read binary file %s at %s (skipped)", path, ref)
return True
content = self.read_file_at_ref(ref, path)
if content is not None:
self.write_file_and_stage(path, content)
else:
content = self.read_file_auto(ref, path)
if content is None:
logger.warning("Could not read file %s at %s (skipped)", path, ref)
return False
return False
is_binary = isinstance(content, bytes) or is_lfs_pointer(content)
self.write_file_and_stage_auto(path, content)
return is_binary

# ------------------------------------------------------------------
# Commits
Expand All @@ -407,3 +421,33 @@ def merge_file(self, ours: Path, base: Path, theirs: Path) -> bool:
def is_ancestor(self, ancestor: str, descendant: str) -> bool:
result = self._run("merge-base", "--is-ancestor", ancestor, descendant, check=False)
return result.returncode == 0

# ------------------------------------------------------------------
# LFS operations
# ------------------------------------------------------------------

def is_lfs_available(self) -> bool:
    """Report whether ``git lfs`` can be run, probing at most once.

    The first call runs ``git lfs version`` and stores the outcome on the
    instance; later calls return the stored value without running git.
    """
    cached = self._lfs_available
    if cached is not None:
        return cached

    probe = self._run("lfs", "version", check=False)
    available = probe.returncode == 0
    self._lfs_available = available
    if available:
        logger.debug("Git LFS available: %s", probe.stdout.strip())
    else:
        logger.debug("Git LFS not available")
    return available

def lfs_fetch(self, remote: str, ref: str) -> None:
    """Download the LFS objects for *ref* from *remote* (best effort).

    Does nothing when Git LFS is unavailable. A non-zero exit from
    ``git lfs fetch`` is logged as a warning rather than raised.
    """
    if self.is_lfs_available():
        logger.info("Fetching LFS objects from %s for %s", remote, ref)
        outcome = self._run("lfs", "fetch", remote, ref, check=False, timeout=_TIMEOUT_NETWORK)
        failed = outcome.returncode != 0
        if failed:
            logger.warning("LFS fetch failed (exit %d): %s", outcome.returncode, outcome.stderr.strip())

def lfs_push(self, remote: str, branch: str) -> None:
    """Upload the LFS objects for *branch* to *remote* (best effort).

    Does nothing when Git LFS is unavailable. A non-zero exit from
    ``git lfs push`` is logged as a warning rather than raised.
    """
    if self.is_lfs_available():
        logger.info("Pushing LFS objects to %s for %s", remote, branch)
        outcome = self._run("lfs", "push", remote, branch, check=False, timeout=_TIMEOUT_NETWORK)
        failed = outcome.returncode != 0
        if failed:
            logger.warning("LFS push failed (exit %d): %s", outcome.returncode, outcome.stderr.strip())
11 changes: 10 additions & 1 deletion pubgate/stage_snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from .config import Config
from .errors import PubGateError
from .filtering import check_conflict_markers, check_residual_markers, is_ignored, scrub_internal_blocks
from .git import GitRepo
from .git import GitRepo, is_lfs_pointer
from .models import format_commit
from .state import StateRef

Expand All @@ -18,6 +18,7 @@ def build_stage_snapshot(
) -> dict[str, str | bytes]:
all_files = git.ls_tree(ref)
snapshot: dict[str, str | bytes] = {}
lfs_files: list[str] = []

for path in all_files:
if path in excluded:
Expand All @@ -33,6 +34,10 @@ def build_stage_snapshot(

if isinstance(content, bytes):
snapshot[path] = content
elif is_lfs_pointer(content):
logger.debug("LFS pointer (skipping scrub): %s", path)
lfs_files.append(path)
snapshot[path] = content
else:
try:
content = scrub_internal_blocks(content, path=path)
Expand All @@ -42,6 +47,10 @@ def build_stage_snapshot(
raise PubGateError(f"Error: {exc}") from exc
snapshot[path] = content

if lfs_files:
logger.info("Snapshot includes %d LFS-tracked %s", len(lfs_files), "file" if len(lfs_files) == 1 else "files")
for path in lfs_files:
logger.debug(" LFS: %s", path)
logger.debug("Snapshot contains %d files", len(snapshot))
return snapshot

Expand Down
Loading
Loading