diff --git a/README.md b/README.md index 32ce6dd..c8920c4 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,7 @@ flowchart TD - An existing internal repo with an `origin` remote - An existing public repo with at least one commit (e.g. a README created during repo setup) - *(Optional)* [`gh` CLI](https://cli.github.com/) authenticated via `gh auth login`. Enables automatic PR creation for GitHub-hosted repos. Without it, pubgate logs the manual steps instead. +- *(Optional)* [Git LFS](https://git-lfs.com/) if your repo uses LFS-tracked files. pubgate auto-detects LFS and handles pointer files automatically. Without it, LFS-specific operations are silently skipped. - A clean worktree on `main`, synced with `origin` (no uncommitted changes, no unpushed commits) ### Setup @@ -208,6 +209,7 @@ ignore = [ ## Edge Cases - **Binary files**: included as-is in staged snapshots (`BEGIN-INTERNAL` markers inside binaries are not processed); during absorb, binary modifications take the public version and are flagged for manual review. +- **Git LFS files**: LFS pointers pass through all pipelines without modification. LFS files are treated as binary (never merged, never scrubbed for internal markers). pubgate runs `git lfs fetch`/`push` automatically during absorb and publish. Use ignore patterns in `pubgate.toml` to exclude sensitive LFS files from publication. If LFS is not installed, these operations are silently skipped. - **Renames on public repo**: the new path is copied in; the old file is kept locally and flagged for review. - **Deletions on public repo**: deleted files are kept locally and flagged for review in the absorb PR. - **Merge conflicts**: absorb uses three-way merge. Conflicts produce standard git conflict markers (`<<<<<<<`/`=======`/`>>>>>>>`) for manual resolution. diff --git a/SPEC.md b/SPEC.md index de7781b..a05d074 100644 --- a/SPEC.md +++ b/SPEC.md @@ -143,3 +143,16 @@ The result is that divergence between the two repos is always controlled and bou ### State file conflicts on repeated publish without absorb If the user publishes multiple times without running `absorb` between cycles, each publish PR is based on the same absorbed commit. The `.pubgate-staged` file will have different values on the PR branch versus `public-remote/main` (from the previous publish merge), and both appear as "added" relative to the absorbed base, producing a guaranteed merge conflict on this file. The conflict is trivially resolvable by taking the newer value. Running `absorb` between publish cycles advances the baseline and eliminates this. + +### Git LFS support + +pubgate supports repositories that use Git LFS. LFS support is auto-detected via `git lfs version` and requires no configuration. + +**How it works:** LFS-tracked files are stored as pointer files in git. pubgate reads and writes these pointers as-is; they pass through the snapshot, stage, and publish pipelines without modification. When files are staged with `git add`, git's clean/smudge filters handle the LFS encoding automatically via `.gitattributes`. + +**LFS object transfer:** pubgate runs `git lfs fetch` during command startups (absorb, publish) to ensure LFS objects are locally cached, and `git lfs push` after pushing branches to transfer LFS objects to the destination remote's LFS server. + +**Limitations:** +- **LFS files are treated as binary**: they are never merged during `absorb` (copied/overwritten instead) and never scrubbed for `BEGIN-INTERNAL`/`END-INTERNAL` markers during `stage`. Do not place internal markers inside LFS-tracked files; use ignore patterns in `pubgate.toml` to exclude sensitive LFS files from publication. +- **`.gitattributes` is included as-is** in the public snapshot (with internal-block scrubbing if markers are present). If internal `.gitattributes` contains LFS patterns for files excluded by pubgate's ignore rules, those orphan patterns will appear in the public repo. This is harmless but may be confusing. Use `BEGIN-INTERNAL`/`END-INTERNAL` markers in `.gitattributes` to exclude internal-only LFS patterns. +- **When LFS is not installed**, pubgate's behavior is unchanged; LFS-specific operations (fetch, push) are silently skipped. diff --git a/pubgate/absorb.py b/pubgate/absorb.py index 0767e7e..14630dc 100644 --- a/pubgate/absorb.py +++ b/pubgate/absorb.py @@ -5,7 +5,7 @@ from .config import Config from .errors import GitError, PubGateError from .filtering import scrub_internal_blocks -from .git import GitRepo +from .git import GitRepo, is_lfs_pointer from .models import format_commit from .state import AbsorbStatus, StateRef @@ -88,6 +88,13 @@ def absorb_commit_message( # --------------------------------------------------------------------------- +def _read_text_at_ref(git: GitRepo, ref: str, path: str) -> str | None: + data = git.read_file_at_ref_bytes(ref, path) + if data is None: + return None + return data.decode("utf-8") + + def _apply_absorb_changes( git: GitRepo, base_sha: str, @@ -106,10 +113,12 @@ def _apply_absorb_changes( if change.is_add: local_path = git.repo_dir / change.path if local_path.exists(): - if git.is_binary_at_ref(public_ref, change.path): - actions.append(f" added on public (kept local version, review manually): {change.path}") + kind = git.classify_at_ref(public_ref, change.path) + if kind != "text": + label = "LFS file" if kind == "lfs" else "binary" + actions.append(f" {label} added on public (kept local version, review manually): {change.path}") else: - theirs_content = git.read_file_at_ref(public_ref, change.path) + theirs_content = _read_text_at_ref(git, public_ref, change.path) if theirs_content is None: actions.append(f" added on public (kept local version, review manually): {change.path}") continue @@ -117,7 +126,7 @@ def _apply_absorb_changes( # the file at the internal commit that was staged. published_base: str | None = None if staged_sha is not None: - staged_content = git.read_file_at_ref(staged_sha, change.path) + staged_content = _read_text_at_ref(git, staged_sha, change.path) if staged_content is not None: published_base = scrub_internal_blocks(staged_content, path=change.path) if published_base is not None: @@ -125,8 +134,8 @@ def _apply_absorb_changes( with tempfile.TemporaryDirectory() as tmpdir: base_tmp = Path(tmpdir) / "base" theirs_tmp = Path(tmpdir) / "theirs" - base_tmp.write_text(published_base, encoding="utf-8") - theirs_tmp.write_text(theirs_content, encoding="utf-8") + base_tmp.write_text(published_base, encoding="utf-8", newline="") + theirs_tmp.write_text(theirs_content, encoding="utf-8", newline="") clean = git.merge_file(local_path, base_tmp, theirs_tmp) git.stage(change.path) if clean: @@ -138,7 +147,13 @@ def _apply_absorb_changes( actions.append(f" added on public (kept local, review manually): {change.path}") else: is_binary = git.copy_file_from_ref(public_ref, change.path) - actions.append(f" add{' (binary)' if is_binary else ''}: {change.path}") + if is_binary: + with open(git.repo_dir / change.path, "rb") as f: + head = f.read(1024) + tag = " (LFS)" if is_lfs_pointer(head) else " (binary)" + else: + tag = "" + actions.append(f" add{tag}: {change.path}") elif change.is_modify: _merge_file(git, base_sha, public_ref, change.path, actions, staged_sha=staged_sha) @@ -175,7 +190,8 @@ def _merge_file( f"at {public_ref}. Repository may have corrupt objects.", ) git.write_file_and_stage_bytes(path, theirs_bytes) - actions.append(f" binary changed on public (replaced locally, review manually): {path}") + label = "LFS file" if is_lfs_pointer(theirs_bytes) else "binary" + actions.append(f" {label} changed on public (replaced locally, review manually): {path}") return # Use the scrubbed staged content as merge base when available. @@ -184,12 +200,12 @@ def _merge_file( # old public content (which would cause false conflicts on internal blocks). base_content: str | None = None if staged_sha is not None: - staged_content = git.read_file_at_ref(staged_sha, path) + staged_content = _read_text_at_ref(git, staged_sha, path) if staged_content is not None: base_content = scrub_internal_blocks(staged_content, path=path) if base_content is None: - base_content = git.read_file_at_ref(base_sha, path) - theirs_content = git.read_file_at_ref(public_ref, path) + base_content = _read_text_at_ref(git, base_sha, path) + theirs_content = _read_text_at_ref(git, public_ref, path) if base_content is None or theirs_content is None: missing_ref = base_sha if base_content is None else public_ref @@ -210,8 +226,8 @@ def _merge_file( with tempfile.TemporaryDirectory() as tmpdir: base_tmp = Path(tmpdir) / "base" theirs_tmp = Path(tmpdir) / "theirs" - base_tmp.write_text(base_content, encoding="utf-8") - theirs_tmp.write_text(theirs_content, encoding="utf-8") + base_tmp.write_text(base_content, encoding="utf-8", newline="") + theirs_tmp.write_text(theirs_content, encoding="utf-8", newline="") clean = git.merge_file(ours_path, base_tmp, theirs_tmp) git.stage(path) diff --git a/pubgate/core.py b/pubgate/core.py index cbe1549..e937cec 100644 --- a/pubgate/core.py +++ b/pubgate/core.py @@ -96,6 +96,7 @@ def _bootstrap_work() -> bool: work_fn=_bootstrap_work, ) self._push_to_remote(cfg.internal_absorb_branch, "origin", cfg.internal_absorb_branch, force=force) + self.git.lfs_push("origin", cfg.internal_absorb_branch) title = f"pubgate: initialize absorb tracking at {public_head[:7]}" self._handle_pr( remote="origin", @@ -175,6 +176,7 @@ def _absorb_work() -> bool: work_fn=_absorb_work, ) self._push_to_remote(cfg.internal_absorb_branch, "origin", cfg.internal_absorb_branch, force=force) + self.git.lfs_push("origin", cfg.internal_absorb_branch) full_msg = absorb_commit_message(git, last_absorbed, public_head) title, body = _split_message(full_msg) self._handle_pr( @@ -400,6 +402,7 @@ def _publish_work() -> bool: def _publish_push() -> None: self._push_to_remote(cfg.public_publish_branch, cfg.public_remote, cfg.public_publish_branch, force=force) + self.git.lfs_push(cfg.public_remote, cfg.public_publish_branch) committed = self._run_on_pr_branch( branch=cfg.public_publish_branch, @@ -515,6 +518,7 @@ def _absorb_startup(self) -> AbsorbResult: logger.debug("Starting absorb startup") self._require_on_main() self.git.fetch(self.cfg.public_remote) + self.git.lfs_fetch(self.cfg.public_remote, self.cfg.public_main_branch) self._prune_internal_pr_branches() self._prune_public_publish_branch() return check_absorb(self.cfg, self.git) @@ -531,6 +535,7 @@ def _publish_startup(self) -> None: git, cfg = self.git, self.cfg git.ensure_clean_worktree() git.fetch("origin") + git.lfs_fetch("origin", cfg.internal_approved_branch) git.fetch(cfg.public_remote) self._prune_public_publish_branch() diff --git a/pubgate/git.py b/pubgate/git.py index 3e5d5c1..1ac0d91 100644 --- a/pubgate/git.py +++ b/pubgate/git.py @@ -13,9 +13,24 @@ _TIMEOUT_NETWORK = 300 +_LFS_POINTER_PREFIX = b"version https://git-lfs.github.com/spec/v1\n" +_LFS_POINTER_MAX_LEN = 512 + + +def is_lfs_pointer(data: str | bytes) -> bool: + if isinstance(data, str): + data = data.encode("utf-8") + if len(data) > _LFS_POINTER_MAX_LEN: + return False + if not data.startswith(_LFS_POINTER_PREFIX): + return False + return b"\noid sha256:" in data and b"\nsize " in data + + class GitRepo: def __init__(self, repo_dir: Path) -> None: self.repo_dir = repo_dir + self._lfs_available: bool | None = None # ------------------------------------------------------------------ # Internal runners @@ -309,18 +324,23 @@ def read_file_at_ref_bytes(self, ref: str, path: str) -> bytes | None: ) return result.stdout - def is_binary_at_ref(self, ref: str, path: str) -> bool: + def classify_at_ref(self, ref: str, path: str) -> str: data = self.read_file_at_ref_bytes(ref, path) if data is None: - return False + return "text" + if is_lfs_pointer(data): + return "lfs" chunk = data[:8192] if b"\x00" in chunk: - return True + return "binary" try: chunk.decode("utf-8") except UnicodeDecodeError: - return True - return False + return "binary" + return "text" + + def is_binary_at_ref(self, ref: str, path: str) -> bool: + return self.classify_at_ref(ref, path) != "text" def read_file_auto(self, ref: str, path: str) -> str | bytes | None: data = self.read_file_at_ref_bytes(ref, path) @@ -345,7 +365,7 @@ def stage(self, path: str) -> None: def write_file_and_stage(self, repo_relative_path: str, content: str) -> None: full_path = self.repo_dir / repo_relative_path full_path.parent.mkdir(parents=True, exist_ok=True) - full_path.write_text(content, encoding="utf-8") + full_path.write_text(content, encoding="utf-8", newline="") self._run("add", repo_relative_path) def write_file_and_stage_bytes(self, repo_relative_path: str, content: bytes) -> None: @@ -370,19 +390,13 @@ def rm_all_tracked(self) -> None: self._run("rm", "-rf", "--ignore-unmatch", ".", check=False) def copy_file_from_ref(self, ref: str, path: str) -> bool: - if self.is_binary_at_ref(ref, path): - content = self.read_file_at_ref_bytes(ref, path) - if content is not None: - self.write_file_and_stage_bytes(path, content) - else: - logger.warning("Could not read binary file %s at %s (skipped)", path, ref) - return True - content = self.read_file_at_ref(ref, path) - if content is not None: - self.write_file_and_stage(path, content) - else: + content = self.read_file_auto(ref, path) + if content is None: logger.warning("Could not read file %s at %s (skipped)", path, ref) - return False + return False + is_binary = isinstance(content, bytes) or is_lfs_pointer(content) + self.write_file_and_stage_auto(path, content) + return is_binary # ------------------------------------------------------------------ # Commits @@ -407,3 +421,33 @@ def merge_file(self, ours: Path, base: Path, theirs: Path) -> bool: def is_ancestor(self, ancestor: str, descendant: str) -> bool: result = self._run("merge-base", "--is-ancestor", ancestor, descendant, check=False) return result.returncode == 0 + + # ------------------------------------------------------------------ + # LFS operations + # ------------------------------------------------------------------ + + def is_lfs_available(self) -> bool: + if self._lfs_available is None: + result = self._run("lfs", "version", check=False) + self._lfs_available = result.returncode == 0 + if self._lfs_available: + logger.debug("Git LFS available: %s", result.stdout.strip()) + else: + logger.debug("Git LFS not available") + return self._lfs_available + + def lfs_fetch(self, remote: str, ref: str) -> None: + if not self.is_lfs_available(): + return + logger.info("Fetching LFS objects from %s for %s", remote, ref) + result = self._run("lfs", "fetch", remote, ref, check=False, timeout=_TIMEOUT_NETWORK) + if result.returncode != 0: + logger.warning("LFS fetch failed (exit %d): %s", result.returncode, result.stderr.strip()) + + def lfs_push(self, remote: str, branch: str) -> None: + if not self.is_lfs_available(): + return + logger.info("Pushing LFS objects to %s for %s", remote, branch) + result = self._run("lfs", "push", remote, branch, check=False, timeout=_TIMEOUT_NETWORK) + if result.returncode != 0: + logger.warning("LFS push failed (exit %d): %s", result.returncode, result.stderr.strip()) diff --git a/pubgate/stage_snapshot.py b/pubgate/stage_snapshot.py index 275b28f..f825b49 100644 --- a/pubgate/stage_snapshot.py +++ b/pubgate/stage_snapshot.py @@ -3,7 +3,7 @@ from .config import Config from .errors import PubGateError from .filtering import check_conflict_markers, check_residual_markers, is_ignored, scrub_internal_blocks -from .git import GitRepo +from .git import GitRepo, is_lfs_pointer from .models import format_commit from .state import StateRef @@ -18,6 +18,7 @@ def build_stage_snapshot( ) -> dict[str, str | bytes]: all_files = git.ls_tree(ref) snapshot: dict[str, str | bytes] = {} + lfs_files: list[str] = [] for path in all_files: if path in excluded: @@ -33,6 +34,10 @@ def build_stage_snapshot( if isinstance(content, bytes): snapshot[path] = content + elif is_lfs_pointer(content): + logger.debug("LFS pointer (skipping scrub): %s", path) + lfs_files.append(path) + snapshot[path] = content else: try: content = scrub_internal_blocks(content, path=path) @@ -42,6 +47,10 @@ def build_stage_snapshot( raise PubGateError(f"Error: {exc}") from exc snapshot[path] = content + if lfs_files: + logger.info("Snapshot includes %d LFS-tracked %s", len(lfs_files), "file" if len(lfs_files) == 1 else "files") + for path in lfs_files: + logger.debug(" LFS: %s", path) logger.debug("Snapshot contains %d files", len(snapshot)) return snapshot diff --git a/tests/test_absorb.py b/tests/test_absorb.py index 1155719..1d90602 100644 --- a/tests/test_absorb.py +++ b/tests/test_absorb.py @@ -319,15 +319,15 @@ def test_unreadable_modified_file_raises(self, topo: Topology): # three-way merge path (not the "missing locally" shortcut) topo.commit_internal({"tracked.txt": "original\n"}, push=True) - # Patch read_file_at_ref to return None for tracked.txt only - original_read = GitRepo.read_file_at_ref + # Patch read_file_at_ref_bytes to return None for tracked.txt only + original_read = GitRepo.read_file_at_ref_bytes def fake_read(self, ref, path): if path == "tracked.txt": return None return original_read(self, ref, path) - with patch.object(GitRepo, "read_file_at_ref", fake_read): + with patch.object(GitRepo, "read_file_at_ref_bytes", fake_read): with pytest.raises(PubGateError, match="unreadable"): topo.pubgate.absorb() @@ -635,3 +635,46 @@ def test_warning_when_stage_state_unreadable(self, topo: Topology, caplog): topo.pubgate.absorb() assert "Could not read stage state" in caplog.text + + +class TestAbsorbCRLF: + def test_new_file_preserves_crlf(self, topo: Topology): + from conftest import _git + + topo.bootstrap_absorb() + + # Force a CRLF file into public repo (bypass autocrlf normalization) + ext_path = topo.external_contributor.path + (ext_path / "crlf.txt").write_bytes(b"line1\r\nline2\r\n") + _git(ext_path, "-c", "core.autocrlf=false", "add", "crlf.txt") + _git(ext_path, "commit", "-m", "add crlf file") + topo.external_contributor.push("origin", "main") + + topo.pubgate.absorb() + + raw = topo.work_dir.git.read_file_at_ref_bytes(topo.cfg.internal_absorb_branch, "crlf.txt") + assert raw == b"line1\r\nline2\r\n" + + def test_modified_file_merge_preserves_crlf(self, topo: Topology): + from conftest import _git + + # Set up a file with CRLF on public + ext_path = topo.external_contributor.path + (ext_path / "shared.txt").write_bytes(b"line1\r\nline2\r\n") + _git(ext_path, "-c", "core.autocrlf=false", "add", "shared.txt") + _git(ext_path, "commit", "-m", "add shared with crlf") + topo.external_contributor.push("origin", "main") + + topo.bootstrap_absorb() + + # Modify the file on public (keep CRLF) + (ext_path / "shared.txt").write_bytes(b"line1\r\nline2\r\nline3\r\n") + _git(ext_path, "-c", "core.autocrlf=false", "add", "shared.txt") + _git(ext_path, "commit", "-m", "modify shared with crlf") + topo.external_contributor.push("origin", "main") + + topo.pubgate.absorb() + + raw = topo.work_dir.git.read_file_at_ref_bytes(topo.cfg.internal_absorb_branch, "shared.txt") + assert raw is not None + assert b"\r\n" in raw diff --git a/tests/test_cli.py b/tests/test_cli.py index 5d7b307..d942100 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -251,8 +251,7 @@ def test_warning_on_unreadable_text_file(self, topo: Topology, caplog): git = topo.work_dir.git with ( - mock_patch.object(git, "is_binary_at_ref", return_value=False), - mock_patch.object(git, "read_file_at_ref", return_value=None), + mock_patch.object(git, "read_file_at_ref_bytes", return_value=None), caplog.at_level(logging.WARNING, logger="pubgate"), ): git.copy_file_from_ref("HEAD", "ghost.txt") @@ -263,12 +262,11 @@ def test_warning_on_unreadable_binary_file(self, topo: Topology, caplog): from unittest.mock import patch as mock_patch with ( - mock_patch.object(git, "is_binary_at_ref", return_value=True), mock_patch.object(git, "read_file_at_ref_bytes", return_value=None), caplog.at_level(logging.WARNING, logger="pubgate"), ): git.copy_file_from_ref("HEAD", "ghost.bin") - assert "Could not read binary file" in caplog.text + assert "Could not read file" in caplog.text class TestBranchSyncValidation: diff --git a/tests/test_lfs.py b/tests/test_lfs.py new file mode 100644 index 0000000..ab1a996 --- /dev/null +++ b/tests/test_lfs.py @@ -0,0 +1,322 @@ +import logging + +import pytest +from conftest import Topology + +from pubgate.config import DEFAULT_IGNORE_PATTERNS +from pubgate.filtering import is_ignored +from pubgate.git import is_lfs_pointer +from pubgate.stage_snapshot import build_stage_snapshot, snapshot_unchanged_ref + +# --------------------------------------------------------------------------- +# Sample LFS pointer data +# --------------------------------------------------------------------------- + +SAMPLE_LFS_POINTER = ( + b"version https://git-lfs.github.com/spec/v1\n" + b"oid sha256:4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17e2393\n" + b"size 12345\n" +) + +SAMPLE_LFS_POINTER_STR = SAMPLE_LFS_POINTER.decode("utf-8") + + +# --------------------------------------------------------------------------- +# is_lfs_pointer unit tests +# --------------------------------------------------------------------------- + + +class TestIsLfsPointer: + def test_valid_pointer(self): + assert is_lfs_pointer(SAMPLE_LFS_POINTER) is True + + def test_valid_pointer_with_extra_fields(self): + data = ( + b"version https://git-lfs.github.com/spec/v1\n" + b"oid sha256:abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890\n" + b"size 0\n" + ) + assert is_lfs_pointer(data) is True + + def test_wrong_version(self): + data = ( + b"version https://git-lfs.github.com/spec/v2\n" + b"oid sha256:4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17e2393\n" + b"size 12345\n" + ) + assert is_lfs_pointer(data) is False + + def test_missing_oid(self): + data = b"version https://git-lfs.github.com/spec/v1\nsize 12345\n" + assert is_lfs_pointer(data) is False + + def test_missing_size(self): + data = ( + b"version https://git-lfs.github.com/spec/v1\n" + b"oid sha256:4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17e2393\n" + ) + assert is_lfs_pointer(data) is False + + def test_regular_text(self): + assert is_lfs_pointer(b"just regular text\n") is False + + def test_empty_bytes(self): + assert is_lfs_pointer(b"") is False + + def test_binary_data(self): + assert is_lfs_pointer(b"\x89PNG\r\n\x1a\n\x00\x00") is False + + def test_too_large_to_be_pointer(self): + data = SAMPLE_LFS_POINTER + b"x" * 512 + assert is_lfs_pointer(data) is False + + +# --------------------------------------------------------------------------- +# is_binary_at_ref with LFS pointers +# --------------------------------------------------------------------------- + + +class TestIsBinaryAtRefLfs: + def test_lfs_pointer_detected_as_binary(self, topo: Topology): + # Commit an LFS pointer as a regular file (simulates what git stores for LFS files) + topo.commit_internal({"large.bin": SAMPLE_LFS_POINTER_STR}) + assert topo.work_dir.git.is_binary_at_ref("HEAD", "large.bin") + + +# --------------------------------------------------------------------------- +# Stage snapshot with LFS pointers +# --------------------------------------------------------------------------- + + +class TestStageSnapshotLfs: + def test_lfs_pointer_passes_through_without_scrub(self, topo: Topology): + topo.commit_internal({"data.bin": SAMPLE_LFS_POINTER_STR}) + snapshot = build_stage_snapshot( + topo.work_dir.git, + "HEAD", + ignore_patterns=[], + excluded=frozenset(), + ) + assert "data.bin" in snapshot + assert snapshot["data.bin"] == SAMPLE_LFS_POINTER_STR + + def test_lfs_pointer_not_scrubbed_for_internal_markers(self, topo: Topology): + # Even though the pointer text is valid UTF-8, it should not be + # passed through scrub_internal_blocks + topo.commit_internal({"model.bin": SAMPLE_LFS_POINTER_STR}) + snapshot = build_stage_snapshot( + topo.work_dir.git, + "HEAD", + ignore_patterns=[], + excluded=frozenset(), + ) + # Pointer text should be preserved exactly + assert snapshot["model.bin"] == SAMPLE_LFS_POINTER_STR + + +# --------------------------------------------------------------------------- +# .gitattributes not excluded by default ignore patterns +# --------------------------------------------------------------------------- + + +class TestGitattributesNotIgnored: + def test_gitattributes_not_matched_by_default_patterns(self): + assert not is_ignored(".gitattributes", DEFAULT_IGNORE_PATTERNS) + + def test_gitattributes_included_in_snapshot(self, topo: Topology): + topo.commit_internal({".gitattributes": "*.bin filter=lfs diff=lfs merge=lfs -text\n"}) + snapshot = build_stage_snapshot( + topo.work_dir.git, + "HEAD", + ignore_patterns=list(DEFAULT_IGNORE_PATTERNS), + excluded=frozenset(), + ) + assert ".gitattributes" in snapshot + attrs = snapshot[".gitattributes"] + assert isinstance(attrs, str) + assert "filter=lfs" in attrs + + +# --------------------------------------------------------------------------- +# is_lfs_available +# --------------------------------------------------------------------------- + + +class TestIsLfsAvailable: + def test_result_is_cached(self, topo: Topology): + git = topo.work_dir.git + result1 = git.is_lfs_available() + result2 = git.is_lfs_available() + assert result1 == result2 + # Verify caching happened + assert git._lfs_available is not None + + +# --------------------------------------------------------------------------- +# copy_file_from_ref with LFS pointers +# --------------------------------------------------------------------------- + + +class TestCopyFileFromRefLfs: + def test_returns_true_for_lfs_pointer(self, topo: Topology): + topo.commit_internal({"data.bin": SAMPLE_LFS_POINTER_STR}) + git = topo.work_dir.git + result = git.copy_file_from_ref("HEAD", "data.bin") + assert result is True + + def test_preserves_pointer_content(self, topo: Topology): + topo.commit_internal({"data.bin": SAMPLE_LFS_POINTER_STR}) + git = topo.work_dir.git + git.copy_file_from_ref("HEAD", "data.bin") + on_disk = (git.repo_dir / "data.bin").read_text(encoding="utf-8") + assert on_disk == SAMPLE_LFS_POINTER_STR + + +# --------------------------------------------------------------------------- +# snapshot_unchanged_ref with LFS pointers +# --------------------------------------------------------------------------- + + +class TestSnapshotUnchangedLfs: + def test_identical_lfs_pointer_detected_as_unchanged(self, topo: Topology): + topo.bootstrap_absorb() + topo.commit_internal({"model.bin": SAMPLE_LFS_POINTER_STR}) + topo.pubgate.stage() + topo.merge_internal_pr(topo.cfg.internal_stage_branch, topo.cfg.internal_approved_branch) + topo.work_dir.run("checkout", "main") + + # Build the same snapshot again (no changes) + snapshot = build_stage_snapshot( + topo.work_dir.git, + topo.cfg.internal_main_branch, + ignore_patterns=list(topo.cfg.ignore), + excluded=frozenset({"pubgate.toml"}), + ) + ref = snapshot_unchanged_ref(topo.cfg, topo.work_dir.git, snapshot) + assert ref is not None + + +# --------------------------------------------------------------------------- +# Absorb with LFS pointer files +# --------------------------------------------------------------------------- + + +class TestAbsorbLfsAdd: + def test_lfs_pointer_added_on_public_new_file(self, topo: Topology, caplog): + topo.bootstrap_absorb() + topo.commit_to_public({"model.bin": SAMPLE_LFS_POINTER_STR}) + with caplog.at_level(logging.INFO, logger="pubgate"): + topo.pubgate.absorb() + assert "LFS" in caplog.text + assert "model.bin" in caplog.text + + absorbed = topo.work_dir.read_file_at_ref(topo.cfg.internal_absorb_branch, "model.bin") + assert absorbed == SAMPLE_LFS_POINTER_STR + + def test_lfs_pointer_added_on_public_exists_locally(self, topo: Topology, caplog): + topo.bootstrap_absorb() + # File exists locally with different content + topo.commit_internal({"model.bin": "local version\n"}) + topo.commit_to_public({"model.bin": SAMPLE_LFS_POINTER_STR}) + with caplog.at_level(logging.WARNING, logger="pubgate"): + topo.pubgate.absorb() + assert "LFS file" in caplog.text + assert "kept local version" in caplog.text + + +class TestAbsorbLfsModify: + def test_lfs_pointer_modified_on_public(self, topo: Topology, caplog): + topo.bootstrap_absorb() + # Initial LFS pointer on public + topo.commit_to_public({"model.bin": SAMPLE_LFS_POINTER_STR}) + topo.pubgate.absorb() + topo.merge_internal_pr(topo.cfg.internal_absorb_branch, "main") + + # Modified LFS pointer (different sha) + new_pointer = ( + "version https://git-lfs.github.com/spec/v1\n" + "oid sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n" + "size 99999\n" + ) + topo.commit_to_public({"model.bin": new_pointer}) + with caplog.at_level(logging.WARNING, logger="pubgate"): + topo.pubgate.absorb() + assert "LFS file" in caplog.text + assert "changed on public" in caplog.text + + absorbed = topo.work_dir.read_file_at_ref(topo.cfg.internal_absorb_branch, "model.bin") + assert absorbed == new_pointer + + +# --------------------------------------------------------------------------- +# Stage with LFS pointer logging +# --------------------------------------------------------------------------- + + +class TestStageLfsLogging: + def test_stage_logs_lfs_file_count(self, topo: Topology, caplog): + topo.bootstrap_absorb() + topo.commit_internal({"model.bin": SAMPLE_LFS_POINTER_STR}) + with caplog.at_level(logging.INFO, logger="pubgate"): + topo.pubgate.stage() + assert "1 LFS-tracked file" in caplog.text + + def test_stage_logs_multiple_lfs_files(self, topo: Topology, caplog): + topo.bootstrap_absorb() + topo.commit_internal( + { + "a.bin": SAMPLE_LFS_POINTER_STR, + "b.bin": SAMPLE_LFS_POINTER_STR, + } + ) + with caplog.at_level(logging.INFO, logger="pubgate"): + topo.pubgate.stage() + assert "2 LFS-tracked files" in caplog.text + + +# --------------------------------------------------------------------------- +# Full stage + publish cycle with LFS pointer +# --------------------------------------------------------------------------- + + +class TestPublishLfs: + def test_lfs_pointer_survives_full_publish_cycle(self, topo: Topology): + topo.bootstrap_absorb() + topo.commit_internal( + { + "model.bin": SAMPLE_LFS_POINTER_STR, + "readme.txt": "hello\n", + } + ) + topo.do_full_publish_cycle() + + # Verify the public repo has the LFS pointer intact + pub_files = topo.external_contributor.list_files_at_ref("HEAD") + assert "model.bin" in pub_files + pub_content = topo.external_contributor.read_file_at_ref("HEAD", "model.bin") + assert pub_content == SAMPLE_LFS_POINTER_STR + + def test_lfs_pointer_with_gitattributes_in_publish(self, topo: Topology, monkeypatch: pytest.MonkeyPatch): + # Skip LFS smudge and push; no real LFS object store in tests. + monkeypatch.setenv("GIT_LFS_SKIP_SMUDGE", "1") + monkeypatch.setenv("GIT_LFS_SKIP_PUSH", "1") + topo.bootstrap_absorb() + topo.commit_internal( + { + ".gitattributes": "*.bin filter=lfs diff=lfs merge=lfs -text\n", + "data.bin": SAMPLE_LFS_POINTER_STR, + "readme.txt": "hello\n", + } + ) + topo.do_full_publish_cycle() + + pub_files = topo.external_contributor.list_files_at_ref("HEAD") + assert ".gitattributes" in pub_files + assert "data.bin" in pub_files + + attrs = topo.external_contributor.read_file_at_ref("HEAD", ".gitattributes") + assert attrs is not None + assert "filter=lfs" in attrs + + pub_content = topo.external_contributor.read_file_at_ref("HEAD", "data.bin") + assert pub_content == SAMPLE_LFS_POINTER_STR diff --git a/tests/test_publish.py b/tests/test_publish.py index 8b12250..46475e2 100644 --- a/tests/test_publish.py +++ b/tests/test_publish.py @@ -503,3 +503,27 @@ def test_external_changes_prevent_base_advancement(self, topo: Topology, caplog) topo.pubgate.publish() assert "Keeping publish base" in caplog.text + + +class TestPublishCRLF: + def test_publish_preserves_crlf(self, topo: Topology): + from conftest import _git + + # Force CRLF file into internal main (bypass autocrlf normalization) + work_path = topo.work_dir.path + (work_path / "crlf.txt").write_bytes(b"hello\r\nworld\r\n") + _git(work_path, "-c", "core.autocrlf=false", "add", "crlf.txt") + _git(work_path, "commit", "-m", "add crlf file") + topo.work_dir.push("origin", "main") + + topo.bootstrap_absorb() + topo.pubgate.stage() + topo.merge_internal_pr(topo.cfg.internal_stage_branch, topo.cfg.internal_approved_branch) + topo.work_dir.run("checkout", "main") + + topo.pubgate.publish() + topo.work_dir.run("fetch", "public-remote") + + pr_ref = f"public-remote/{topo.cfg.public_publish_branch}" + raw = topo.work_dir.git.read_file_at_ref_bytes(pr_ref, "crlf.txt") + assert raw == b"hello\r\nworld\r\n"