diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index db682aa..7ab9c26 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -77,6 +77,11 @@ jobs: tests/test_audit.py \ tests/test_personality.py \ tests/test_conftest_fixtures.py \ + tests/test_sync_config.py \ + tests/test_sync_filters.py \ + tests/test_sync_runner.py \ + tests/test_sync_cli.py \ + tests/test_sync_scheduler.py \ -q --tb=short \ --cov=utils.sanitizer \ --cov=utils.errors \ diff --git a/.gitignore b/.gitignore index 5fe05f6..c76ed63 100644 --- a/.gitignore +++ b/.gitignore @@ -61,3 +61,12 @@ desktop.ini # Codacy / CI artifacts results.sarif codacy-*.sarif + +# rclone / Dropbox sync (never commit credentials or local sync state) +rclone.conf +**/rclone.conf +.rclone-state/ + +# Coverage artifacts +.coverage +htmlcov/ diff --git a/Dropbox_Sync_Guide.md b/Dropbox_Sync_Guide.md new file mode 100644 index 0000000..b8d3019 --- /dev/null +++ b/Dropbox_Sync_Guide.md @@ -0,0 +1,449 @@ +# CyClaw Dropbox Sync — Setup & Usage Guide + +This guide walks you through enabling and running the CyClaw Dropbox corpus sync on **Windows** and **Linux** (macOS steps mirror Linux). It is written for end users, not developers — the internals live in `docs/SYNC_README.md`. + +**What it does:** mirrors a Dropbox folder into your local `data/corpus/` so your knowledge base stays current across machines, without weakening any of CyClaw's security properties. Sync runs as a separate process on a schedule — never inline with the gateway. + +**What it does NOT do:** touch `gate.py`, `graph.py`, or any live request path. No token is ever stored in CyClaw's config or logs. The soul layer (`data/personality/`) is excluded by default. 
+ +--- + +## Before you start + +You need: +- CyClaw cloned and working (Python 3.12, `pip install -r requirements.txt` done) +- A Dropbox account +- Internet access on the machine you're setting up (one time only for OAuth) + +--- + +## Part 1 — Install rclone + +CyClaw requires **rclone ≥ 1.68.2**. It will refuse to run with an older version (security floor for CVE-2024-52522). + +### Linux + +```bash +# Official install script (installs the latest stable release) +curl https://rclone.org/install.sh | sudo bash + +# Verify +rclone version +# Should show v1.68.2 or higher +``` + +If you prefer a package manager: +```bash +# Ubuntu / Debian +sudo apt install rclone # check version — distro packages often lag; prefer the official script + +# Fedora / RHEL +sudo dnf install rclone +``` + +If the distro package is older than 1.68.2, use the official script above instead. + +### Windows + +Open **PowerShell** (as a regular user — no admin needed for winget): + +```powershell +winget install Rclone.Rclone + +# Verify (open a new terminal after install) +rclone version +# Should show v1.68.2 or higher +``` + +Alternative: download the `rclone-*-windows-amd64.zip` from [rclone.org/downloads](https://rclone.org/downloads/), extract, and add the folder to your `PATH`. + +--- + +## Part 2 — Create the Dropbox remote + +Run this once, on the machine that has a browser. If you're on a headless Linux server, see the [remote auth section](#headless-linux-no-browser). + +### Step 1: Start rclone config + +```bash +rclone config +``` + +### Step 2: Walk through the prompts + +``` +n) New remote +name> dropbox_cyclaw +``` + +> The name `dropbox_cyclaw` must match `remote_name` in your `config.yaml`. Change both if you want a different name. + +``` +Storage> dropbox +``` + +At the client ID / secret prompts, press **Enter** to accept defaults (uses rclone's shared Dropbox app). + +``` +Edit advanced config? n +Use auto config? y +``` + +A browser window will open. 
Log into Dropbox and click **Allow**. Return to the terminal — it should print: + +``` +Success! +``` + +### Step 3: Verify the connection + +```bash +rclone lsd dropbox_cyclaw: +# Lists the root of your Dropbox App Folder — should show folders (or be empty) +``` + +Create the Dropbox folder that CyClaw will sync from: + +```bash +rclone mkdir dropbox_cyclaw:CyClaw/corpus +``` + +### Step 4: Protect the token file + +The Dropbox refresh token lives only in `rclone.conf`, not in CyClaw. + +**Linux:** +```bash +chmod 600 ~/.config/rclone/rclone.conf +``` + +**Windows** (PowerShell): +```powershell +# View the file location +rclone config file + +# Restrict access to your user only (`rclone config file` prints a label line, then the path — keep only the last line) +$conf = (rclone config file | Select-Object -Last 1) +icacls $conf /inheritance:r /grant:r "$($env:USERNAME):(R,W)" +``` + +### Headless Linux (no browser) + +On a headless server, use another machine to generate the token and copy the config: + +```bash +# On your laptop / desktop (any machine with a browser and rclone installed): +rclone config +# Create a remote named dropbox_cyclaw, complete the OAuth + +# Find the config file +rclone config file +# Copy it to the server: +scp ~/.config/rclone/rclone.conf user@yourserver:~/.config/rclone/rclone.conf +``` + +--- + +## Part 3 — Edit `config.yaml` + +Open `config.yaml` in the CyClaw project root. Find the `sync:` block (it's near the bottom). 
Set **`enabled: true`** and confirm that **`remote_name`** matches the remote you created in Part 2; every other value can stay at its default: + +```yaml +sync: + enabled: true # was: false — change this to enable + local_path: "data/corpus" # leave as-is unless you moved the corpus + remote_name: "dropbox_cyclaw" # must match the name you gave in Part 2, Step 2 + remote_path: "CyClaw/corpus" # folder inside your Dropbox App Folder + direction: "pull" # pull = Dropbox→local only (safe default) + include_soul: false # leave false — soul is governed separately + reindex_on_change: true # exit with code 10 when corpus files change (signals that a reindex is needed) + checksum: true + max_delete: 20 # safety: abort if >20 files would be deleted + max_transfer: "1G" # safety: abort if run would transfer >1 GB + schedule_hour: 2 # daily run at 02:00 local time + schedule_min: 0 +``` + +No Dropbox credentials go in this file — only paths and behaviour settings. + +--- + +## Part 4 — Validate the setup + +From the CyClaw directory: + +```bash +cd /path/to/CyClaw + +# Run the pre-flight self-test +python -m sync.cli test +``` + +Expected output: +``` +Self-test: 5/5 passed +``` + +If rclone is missing or too old, the test prints the error and exits 3. If your config.yaml is wrong, it names the failing field. + +Also run a dry-run sync (reads Dropbox but writes nothing locally): + +```bash +python -m sync.cli sync --dry-run +``` + +--- + +## Part 5 — Run a sync manually + +```bash +# From the CyClaw directory: + +# Preview — shows what would change, changes nothing +python -m sync.cli sync --dry-run + +# Live sync +python -m sync.cli sync +``` + +### Reading the output + +``` +Sync complete + direction............... pull + exit_code............... 0 + duration_sec............ 2.41 + added................... 3 + modified................ 1 + deleted................. 0 + corpus_changed.......... 
True +``` + +### Exit codes + +| Code | Meaning | What to do | +|------|---------|------------| +| `0` | Success, nothing changed | Nothing | +| `10` | Success, corpus changed | Run `python -m retrieval.indexer`, then restart the gateway | +| `1` | Safety fuse tripped | Check if many files were genuinely deleted upstream; don't blindly raise the fuse | +| `2` | Sync failed | Check `logs/audit.jsonl` and the rclone log in `~/.config/rclone/logs/` | +| `3` | Config or environment problem | Read the error message — usually rclone missing, too old, or `config.yaml` bad | + +### After a sync that changed files + +When exit code is `10`, rebuild the index so the gateway sees the new content: + +```bash +python -m retrieval.indexer +``` + +Then restart CyClaw's gateway (the index is loaded at startup). + +--- + +## Part 6 — Set up scheduled (daily) sync + +### Linux (cron) + +```bash +python -m sync.cli schedule +``` + +This adds exactly one tagged line to your crontab. Default time is 02:00 (change `schedule_hour`/`schedule_min` in `config.yaml` first if needed). 
The line looks like: + +``` +0 2 * * * cd "/path/to/CyClaw" && "/usr/bin/python3.12" -m sync.cli sync # CYCLAW_DROPBOX_SYNC +``` + +To see the current schedule: +```bash +python -m sync.cli status +``` + +To remove the schedule: +```bash +python -m sync.cli unschedule +``` + +**For a more reliable setup on Linux**, use a systemd user timer instead of cron (journald logging, catches up on a run that was missed while the machine was powered off thanks to `Persistent=true`, no cron daemon needed): + +```bash +mkdir -p ~/.config/systemd/user + +# Create the service unit +cat > ~/.config/systemd/user/cyclaw-sync.service <<'EOF' +[Unit] +Description=CyClaw Dropbox corpus sync + +[Service] +Type=oneshot +WorkingDirectory=/path/to/CyClaw +ExecStart=/path/to/CyClaw/.venv/bin/python -m sync.cli sync +EOF + +# Create the timer unit (runs daily at 02:00) +cat > ~/.config/systemd/user/cyclaw-sync.timer <<'EOF' +[Unit] +Description=Daily CyClaw Dropbox sync + +[Timer] +OnCalendar=*-*-* 02:00:00 +Persistent=true + +[Install] +WantedBy=timers.target +EOF + +# Enable and start +systemctl --user daemon-reload +systemctl --user enable --now cyclaw-sync.timer + +# Check status +systemctl --user status cyclaw-sync.timer +systemctl --user list-timers cyclaw-sync.timer +``` + +> Replace `/path/to/CyClaw` with the full absolute path to your checkout (e.g. `/home/you/CyClaw` — systemd does not expand `~` in `ExecStart`), and adjust the Python path to your venv or system Python. The timer's `OnCalendar` time is set here, independently of `schedule_hour`/`schedule_min` in `config.yaml`. A user timer only fires while you have a login session; on a headless server run `loginctl enable-linger "$USER"` once so it also runs when you are logged out. + +### Windows (Task Scheduler) + +```powershell +python -m sync.cli schedule +``` + +This registers a daily Task Scheduler job named **"CyClaw Dropbox Sync"** running at the time set in `config.yaml` (`schedule_hour`:`schedule_min`). A small launcher batch file (`cyclaw_sync.bat`) is written to the rclone log folder — the task points at this file to avoid Windows quoting issues with paths that contain spaces. 
+ +To verify it was created: +```powershell +python -m sync.cli status +# or +schtasks /Query /TN "CyClaw Dropbox Sync" /FO LIST +``` + +To remove it: +```powershell +python -m sync.cli unschedule +``` + +To run it immediately (without waiting for the schedule): +```powershell +schtasks /Run /TN "CyClaw Dropbox Sync" +``` + +**Important on Windows:** The task runs under your user account with standard, non-elevated privileges (`/RL LIMITED`). Rclone reads `%APPDATA%\rclone\rclone.conf` for your token. If the task fails, open Event Viewer → Windows Logs → Application and look for rclone errors, or check the rclone log at `%APPDATA%\rclone\logs\rclone_cyclaw.log`. + +--- + +## Part 7 — One-shot full setup + +The `setup` subcommand combines validation (Part 4) with filter-file generation and, optionally, scheduling (Part 6): + +```bash +# Validate + write filters + print OAuth reminder +python -m sync.cli setup + +# Validate + write filters + register the daily schedule +python -m sync.cli setup --schedule +``` + +--- + +## Part 8 — Check status + +```bash +python -m sync.cli status +``` + +Shows enabled state, local path, remote, sync direction, schedule time, filter file path, rclone version, and whether the daily job is registered. + +--- + +## Part 9 — Advanced: cron-friendly reindex automation + +For a fully automated pipeline (sync → reindex → restart) on Linux, wrap sync in a shell script: + +```bash +#!/usr/bin/env bash +# /path/to/CyClaw/scripts/sync_and_reindex.sh +set -euo pipefail + +REPO="/path/to/CyClaw" +cd "$REPO" + +rc=0 # exit 10 is a success signal, so capture the code instead of letting 'set -e' abort the script +python -m sync.cli sync || rc=$? + +if [ "$rc" -eq 10 ]; then + echo "[sync] corpus changed — rebuilding index" + python -m retrieval.indexer + echo "[sync] index rebuilt — restart the gateway to pick up changes" + # Add your gateway restart command here, e.g.: + # systemctl --user restart cyclaw-gateway +elif [ "$rc" -eq 0 ]; then + echo "[sync] no corpus changes" +else + echo "[sync] failed with exit code $rc" >&2 + exit "$rc" +fi +``` + +Register this script in cron or systemd instead of the bare `sync.cli sync`. 
+ +--- + +## Troubleshooting + +| Symptom | Fix | +|---------|-----| +| `RCLONE_NOT_INSTALLED` | Install rclone (Part 1) | +| `RCLONE_VERSION_TOO_OLD` | Upgrade to rclone ≥ 1.68.2 | +| `SYNC_CONFIG_INVALID: sync.enabled` | Set `enabled: true` in `config.yaml` | +| `SYNC_CONFIG_INVALID: local_path` | `local_path` must point inside `data/corpus/` | +| `rclone lsd dropbox_cyclaw:` fails | Re-run `rclone config` and redo the OAuth flow | +| Safety fuse tripped (exit 1, `aborted_for_safety: true`) | More than `max_delete` files would be deleted, or the run would transfer more than `max_transfer`. Check Dropbox for unexpected deletions or unusually large changes. Raise `max_delete` / `max_transfer` only when the change is intentional. | +| Stale answers after sync | After exit 10 + reindex, **restart the CyClaw gateway** — the index is loaded at startup, not hot-reloaded. | +| Schedule doesn't fire on Windows | Check Task Scheduler → "CyClaw Dropbox Sync" → Last Run Result. Code `0x1` usually means the batch file path has changed; run `python -m sync.cli unschedule` then `python -m sync.cli schedule` to re-register. | +| Another sync is running (error `SYNC_RUNTIME`) | A previous run is still in progress, or it crashed and left a lock behind. The lock in `~/.config/rclone/logs/sync.lock.d` (Linux) or `%APPDATA%\rclone\logs\sync.lock.d` (Windows) is auto-cleaned after 3 hours; once you have confirmed that no sync is running, you can also remove it manually. | +| Soul changed unexpectedly | `include_soul` was set to `true`. Set it back to `false`, and use `POST /soul/apply` to manage `data/personality/` intentionally. 
| + +--- + +## What is never synced + +These are excluded by the built-in filter rules and cannot be re-included through `extra_excludes`. The single exception is `data/personality/**`, which is synced only if you explicitly set `include_soul: true` (discouraged): + +- `data/personality/**` — soul layer; excluded unless `include_soul: true`, and governed via `POST /soul/apply` only +- AI model weights (`.gguf`, `.bin`, `.safetensors`, `.onnx`, `.pt`, `.pth`) +- ChromaDB / BM25 indices (`index/**`, `.chroma/**`) — rebuild with `python -m retrieval.indexer` +- Embeddings cache (`.emb_cache/**`) +- Virtual environments +- Logs and audit events (`logs/**`, `*.jsonl`) +- Secrets (`.env`, `*.pem`, `*.key`, `*_secret*`, `credentials*`) +- Soul DB (`*.db`, `*.db-wal`, `*.db-shm`) +- Git state (`.git/**`) +- rclone's own state files + +To add further exclusions, use `extra_excludes:` in your `config.yaml`: + +```yaml +sync: + ... + extra_excludes: + - "scratch/**" + - "*.tmp" +``` + +--- + +## Quick reference + +```bash +python -m sync.cli setup # validate + write filters +python -m sync.cli setup --schedule # validate + write filters + schedule +python -m sync.cli sync --dry-run # preview only +python -m sync.cli sync # live sync +python -m sync.cli status # current state + schedule +python -m sync.cli schedule # (re-)register daily job +python -m sync.cli unschedule # remove daily job +python -m sync.cli test # pre-flight self-test +``` diff --git a/README.md b/README.md index 810ce6a..b0ab68f 100644 --- a/README.md +++ b/README.md @@ -277,7 +277,7 @@ The MCP server exposes a single `hybrid_search` tool. 
It has **no sampling capab - Browser UI via `static/terminal.html` **v1.4.0 targets:** -- Dropbox/cloud corpus sync +- Dropbox/cloud corpus sync — **delivered** (out-of-band `rclone` wrapper; see [docs/SYNC_README.md](docs/SYNC_README.md)) - `plan_node` for multi-step query decomposition - BM25 index SHA-256 integrity check on load - General-purpose agent (tool invocation from corpus context) diff --git a/config.yaml b/config.yaml index a12282a..25c3c37 100644 --- a/config.yaml +++ b/config.yaml @@ -181,4 +181,28 @@ security: - "http://10.0.0.112:8787" # DevSkim: ignore DS137138 - with port # NOTE: null removed — curl/MCP clients send no Origin header and are not # subject to CORS filtering. null in allow_origins is invalid for starlette - # CORSMiddleware and would cause a runtime error or silent rejection. \ No newline at end of file + # CORSMiddleware and would cause a runtime error or silent rejection. + +# =========================== +# sync: Dropbox corpus sync (out-of-band, rclone-based) +# =========================== +# Absence of this block disables sync entirely. Sync runs strictly via +# `python -m sync.cli` — never imported by gate.py, graph.py, or the MCP server. +# The Dropbox refresh token lives ONLY in rclone.conf, never here. 
+sync: + enabled: false # off by default; `sync.cli sync` no-ops (exit 0) while false + local_path: "data/corpus" # validated: absolute, under repo data/corpus, a dir + remote_name: "dropbox_cyclaw" # must match `rclone listremotes` + remote_path: "CyClaw/corpus" # folder inside the Dropbox App Folder + direction: "pull" # "pull" (safe default) | "bisync" (opt-in, discouraged) + include_soul: false # leave false — data/personality/ is NOT sync-safe + reindex_on_change: true # exit 10 when data/corpus/** changed + checksum: true # rclone --checksum (hash compare, not mtime) + max_delete: 20 # safety fuse: abort if > N deletions + max_transfer: "1G" # safety fuse: abort if run would move > this + schedule_hour: 2 # 24h local time (cron/timer/Task Scheduler) + schedule_min: 0 + conflict_resolve: "newer" # bisync-only: newer modtime wins + conflict_loser: "rename" # bisync-only: loser saved as .conflict1 (never deleted) + # extra_excludes: # optional, appended AFTER hardened block + # - "scratch/**" \ No newline at end of file diff --git a/docs/DROPBOX_SYNC_IMPLEMENTATION_PLAN.md b/docs/DROPBOX_SYNC_IMPLEMENTATION_PLAN.md new file mode 100644 index 0000000..35fc9a8 --- /dev/null +++ b/docs/DROPBOX_SYNC_IMPLEMENTATION_PLAN.md @@ -0,0 +1,557 @@ +# CyClaw Dropbox Corpus Sync — Implementation Planning Guide + +**Status:** Proposed (planning only — no code in this PR) +**Target:** `main` (via feature branch → reviewed PR) +**Author:** Planning synthesis (Claude) from the PsyClaw `sync/` prior art + two research passes +**Scope item:** README roadmap v1.4.0 ("Dropbox/cloud corpus sync") / v1.5.0 ("test Dropbox corpus sync integration") +**Date:** 2026-06-20 + +--- + +## 0. 
TL;DR + +Add an **out-of-band Dropbox → local corpus sync** as a standalone `sync/` Python package that is a **thin wrapper around the `rclone` binary** (`subprocess`, no `shell=True`), invoked only via `python -m sync.cli` from cron / systemd timer / launchd / Task Scheduler — **never imported by `gate.py`, `graph.py`, or `mcp_hybrid_server.py`.** + +It: +- adds **zero** new Python dependencies (stdlib + existing `pyyaml`, `utils.logger`, `utils.errors`), +- adds **no** FastAPI endpoint and **no** LangGraph node/edge, +- writes only to the local filesystem (default: `data/corpus/`) and appends to `logs/audit.jsonl` via the existing `utils.logger.audit_log()`, +- defaults to **one-way pull** (`rclone copy`, which never deletes), with `bisync` available but discouraged, +- keeps the Dropbox refresh token **entirely inside `rclone.conf`** — CyClaw's process never sees it, +- signals "corpus changed → reindex" via a **dedicated exit code** so a wrapper script can conditionally run `python -m retrieval.indexer`. + +Every one of CyClaw's existing security invariants is preserved **by construction**, because the sync code never touches the request path. + +This guide is the direct CyClaw port of the validated PsyClaw `sync/` v1.0 module. The mapping is ~1:1 (identical `audit_log()` signature, identical `RAGError(message, code, details)` base, identical `data/corpus`, `data/personality/soul.md`, `index/`, `.emb_cache`, `retrieval.indexer`, and `POST /soul/apply` soul-governance gate), which makes this a **low-risk** addition. + +--- + +## 1. Background & motivation + +CyClaw is an offline-first, RAG-first personal AI gateway. Its knowledge base lives in `data/corpus/` (`.md` / `.txt`), is indexed into ChromaDB + BM25 by `retrieval.indexer`, and is retrieved hybrid-style at query time. Today the corpus is populated and edited locally. 
The goal of this feature is to let a user **mirror a Dropbox folder into `data/corpus/`** so the knowledge base can be maintained from multiple machines / mobile, while **not weakening any security property** and **not coupling the sync to the live request path**. + +An older sibling build (**PsyClaw Sync v1.0**, May 2026) already solved this exact problem with an `rclone`-wrapper `sync/` package. This plan adapts that design to CyClaw's current `main` and re-validates every decision against (a) Dropbox/rclone best practices as of late 2025/2026 and (b) CyClaw's actual code, CI, and security tooling. + +### 1.1 What CyClaw already provides that we reuse (do NOT reimplement) + +| Primitive | Location | Reuse for sync | +|---|---|---| +| `audit_log(event: dict, config_path="config.yaml")` | `utils/logger.py:106` | All sync audit events terminate here (same JSONL, same PII redaction) | +| `redact_sensitive(text, cfg)` / `hash_query(query)` | `utils/logger.py:93` / `:66` | Backstop redaction; `hash_query` only if we ever hash a sensitive string | +| `_get_config()` / `reset_config_cache()` | `utils/logger.py:55` / `:62` | Cached config load; tests reset the cache | +| `RAGError(message, code, details)` | `utils/errors.py:10` | Base class for new `SyncError` / `RcloneError` | +| `retrieval.indexer` full rebuild | `retrieval/indexer.py` (`build_index`, `__main__`) | Reindex trigger, invoked as `python -m retrieval.indexer` | +| `config.yaml` single source of truth | repo root | New additive `sync:` block | +| `.gitignore` never-commit list | repo root | Mirror into the rclone filter file | + +--- + +## 2. Non-negotiable design constraints (the invariant contract) + +These are derived from CyClaw's README "five security invariants" + security model, re-mapped to this feature. **Each must be demonstrably true in the PR.** + +1. **RAG-First preserved.** Sync only changes files on disk that are later consumed at index time. The `retrieve` node remains the graph entry point. 
No change to `graph.py`. +2. **Topology = Policy preserved.** Sync adds **no** StateGraph node and **no** edge. `graph.py` is not modified. Corpus mutation — a side-effecting operation — stays *outside* the graph, exactly as soul-evolution is kept out of the graph today. +3. **Triple-Gated External preserved.** Sync is unrelated to the Grok fallback path. The only outbound network call is `rclone` → Dropbox, **operator-initiated, out-of-band**, never triggered by a user query. No new inbound listener. +4. **Audit Convergence preserved.** Sync emits its own events through the *same* `audit_log()` into the *same* `logs/audit.jsonl`. It does not alter or bypass the graph's `audit_logger` node. +5. **Soul Governance preserved.** `data/personality/**` is **excluded by default** from sync. A synced file can never overwrite `soul.md` or `cyclaw_soul.db` and thereby bypass the `apply_evolution()` injection scan behind `POST /soul/apply`. **This is the single most important path-safety rule.** Opt-in (`include_soul: true`) is loud and discouraged. +6. **Loopback/no-listener posture preserved.** Sync adds no socket, no `Depends`, no route. `rclone` makes an *outbound* HTTPS call only; there is no inbound surface. +7. **Zero-telemetry posture preserved.** `rclone` has no LangSmith/Chroma/OTel surface, but the subprocess must run with no remote-control (`--rc`), no usage reporting, and inherit CyClaw's clean env. (`gate.py`'s telemetry-kill block is request-path only; sync sets its own minimal clean env for the child.) +8. **Minimal-deps posture preserved.** **Zero** new entries in `requirements.txt` / `constraints.txt` / `pyproject.toml` dependencies. `rclone` is an external binary installed out-of-band (just like LM Studio at `127.0.0.1:1234` already is). + +--- + +## 3. Transport decision (validated) + +**Decision: rclone binary wrapper, invoked out-of-band. 
Confirmed.** + +| Option | Verdict for CyClaw | +|---|---| +| **rclone** ✅ | Zero Python deps; no daemon; refresh token stays in `rclone.conf` (never in CyClaw's process); single static binary on Linux/macOS/WSL/**Windows**; very mature (stable 1.72.1, Dec 2025). The CLI surface is the security boundary. | +| Maestral ❌ | Always-on **daemon** (violates "no daemons"); **no native Windows** (CyClaw targets Windows); inflates the venv. | +| `dropbox` Python SDK ❌ | Forces CyClaw to **hold the refresh token**; drags `requests`/`urllib3` into a deliberately minimal, CVE-annotated tree; you reimplement delta/retry/auth. | +| `dbxcli` ❌ | Effectively abandoned. | + +**Mandatory rclone hardening (from research):** +- **Pin `rclone ≥ 1.68.2`** as a hard floor — fixes **CVE-2024-52522** (insecure symlink handling with `--links`/`--metadata`). Assert version at startup; refuse to run if older. (PsyClaw used ≥1.65; **raise CyClaw's floor to 1.68.2** for the CVE fix. If `bisync` is ever enabled, the practical floor is ≥1.66 for the snapshot-model redesign — 1.68.2 covers both.) +- **Never** pass `--links` or `--metadata` (not needed for a text corpus; avoids the CVE class entirely). +- Use an **App Folder**-scoped Dropbox app (least privilege), not Full Dropbox. +- `chmod 600 ~/.config/rclone/rclone.conf` (restrict ACLs on Windows). Document rclone config-encryption as the higher-security option (trade-off: needs a password per unattended run). + +--- + +## 4. Sync semantics + +### 4.1 Direction — default **pull** (one-way), `bisync` discouraged + +- **Default `direction: pull`** → `rclone copy remote:path data/corpus`. `rclone copy` **never deletes** at the destination — safest possible default for an RAG corpus. +- `direction: bisync` is **opt-in and discouraged.** Rationale: bidirectional sync creates a second, uncontrolled write path into governed state and is an advanced rclone command with known rough edges (e.g. `--max-delete` evaluated before `--track-renames`). 
If ever adopted, require rclone ≥1.66, `--resync` baseline, `--check-access`, `--conflict-resolve newer`, `--conflict-loser rename` (never silent delete), and `--max-lock`. + +### 4.2 Safety fuses (applied every run) + +- `--max-delete N` (default **20**) — abort if more than N deletions would occur. (Only meaningful for `rclone sync`/`bisync`; harmless under `copy`.) +- `--max-transfer SIZE` (default **1G**) — stop if the run would move more than expected (returns exit 8). +- `--check-first` — do all comparisons before any transfer (deterministic ordering; no interleaved deletes-before-copies). +- `--checksum` — compare by hash+size, not mtime (immune to clock skew; Dropbox stores a content hash so this is cheap correctness). +- `--dropbox-batch-mode sync` (rclone default) — stay within Dropbox rate limits. +- `--use-json-log --stats-one-line` — machine-readable output for the wrapper to parse "what changed". +- Let rclone own transport retries via `--retries` / `--low-level-retries`. The wrapper retries only the *whole run*, bounded, with backoff, then surfaces a `SyncError` (no crash-loop — there's no daemon). +- **`--dry-run` discipline:** `sync --dry-run` previews and changes nothing. + +### 4.3 Filtering — hardened exclude list (denylist), corpus-scoped + +Two valid styles surfaced in research: +- **Denylist** (PsyClaw style): exclude the dangerous/rebuildable categories, sync the rest. +- **Allowlist** (corpus-only): include only `+ /*.md` and `+ /*.txt`, then exclude everything else with a final `- *` (rclone applies the first matching rule, so the includes must come first). + +**Recommendation: ship the denylist as the hardened baseline** (it's battle-tested in PsyClaw and fails safe even if the remote layout changes), and document the allowlist as an optional tightening via `extra_excludes`. The filter file is generated by `sync/filters.py` to the rclone state dir and is `--filter-from`'d on every run. First-match-wins, most-restrictive-first. 
+ +Built-in hardened excludes (mirror of `.gitignore` never-commit categories), in order: + +``` +- data/personality/** # SOUL LAYER — governed via POST /soul/apply only +- *.gguf, *.bin, *.safetensors, *.onnx, *.pt, *.pth # model weights +- index/**, .emb_cache/**, .chroma/** # rebuildable indices/caches +- venv/**, .venv/**, env/**, __pycache__/**, *.pyc, *.pyo, *.egg-info/** +- logs/**, *.log, *.jsonl # local forensic data incl. audit.jsonl — never share +- .env, .env.*, *.env, *.pem, *.key, *_secret*, credentials* # secrets +- *.db, *.db-wal, *.db-shm # governed soul DB state +- .git/**, .gitignore # use git separately +- .DS_Store, Thumbs.db, desktop.ini, *.swp, .idea/**, .vscode/** # OS/editor noise +- .rclone-state/**, *.rclone.lst* # rclone's own state +``` + +`extra_excludes:` from config is appended **after** the hardened block (users can tighten further, never accidentally re-include something the hardened rules already excluded — first-match-wins guarantees this). + +### 4.4 Reindex trigger — only on actual corpus change + +`retrieval.indexer.build_index()` is a **full rebuild** (deletes + recreates the Chroma collection, overwrites `bm25.json`) — it re-embeds the entire corpus and is expensive. Therefore: +- The wrapper parses rclone's JSON stats / log to detect whether any file under `data/corpus/**` was added/modified/deleted. +- If yes → CLI exits **10** ("corpus changed → reindex recommended"); a cron wrapper conditionally runs `python -m retrieval.indexer` as a **separate process** (never inline, never in the gateway event loop). +- If no change → exit 0; no reindex. +- A gateway restart is required to pick up the rebuilt index (the retriever is constructed at import time in `gate.py`). + +--- + +## 5. Proposed file layout + +All new code lives under `sync/` plus two small additive edits to existing files. 
+ +``` +CyClaw/ +├── sync/ # NEW package — never imported by gate/graph/mcp +│ ├── __init__.py # public API re-exports + __version__ +│ ├── config.py # RcloneConfig dataclass + validating loader of config.yaml `sync:` block +│ ├── filters.py # hardened filter-file generator (denylist above) +│ ├── runner.py # rclone subprocess + JSON-log parse + SHA-256 audit + reindex exit code +│ ├── scheduler.py # cron / systemd-timer / launchd / Task Scheduler abstraction (idempotent install/remove) +│ ├── selftest.py # pre-flight self-test (version, config, filter, remote reachability dry-run) +│ └── cli.py # `python -m sync.cli {setup,sync,test,schedule,unschedule,status}` +├── tests/ +│ └── test_sync.py # NEW — fully mocked (patch subprocess.run + shutil.which); no network +├── utils/errors.py # EDIT (additive): add SyncError + subclasses subclassing RAGError +├── config.yaml # EDIT (additive): append `sync:` block (no secrets) +├── pyproject.toml # EDIT (additive): add "sync" to [tool.coverage.run] source +├── .gitignore # EDIT (additive): ignore any local rclone.conf copy / sync state +└── docs/ + └── SYNC_README.md # NEW — operator guide (Step 1 install rclone … troubleshooting) +``` + +### 5.1 Module responsibilities + +**`sync/config.py`** — `RcloneConfig` dataclass + `load_sync_config()`. +- Reads the `sync:` block via `utils.logger._get_config()` (consistent caching + test reset). +- **Validates** (raises `SyncConfigError`): + - `local_path` → expand user/vars, must be **absolute**, must resolve to a directory **under the repo's `data/corpus`** (reject `..`, reject symlink escape via `Path.resolve()`). + - `remote_name` / `remote_path` → strict whitelist regex (e.g. `^[A-Za-z0-9_.-]+$` for name, no shell metacharacters, no leading `-` so it can't be parsed as a flag). + - `direction ∈ {pull, bisync}`; `max_delete ≥ 0`; `schedule_hour ∈ 0..23`; `schedule_min ∈ 0..59`; `conflict_resolve ∈ {newer,older,larger,smaller,none}`. + - `include_soul` bool (default False). 
+- Computes default state paths under `XDG_CONFIG_HOME`/`~/.config/rclone` (filter file, log dir, bisync workdir) — all overridable. +- **Never** holds any secret. Unknown keys are collected and surfaced as a non-fatal warning (typo visibility). + +**`sync/filters.py`** — `generate_filters(cfg)` / `write_filter_file(cfg)`. +- Emits the §4.3 hardened denylist; conditionally drops the `data/personality/**` line **only** if `include_soul=true` (and writes a loud WARNING header into the file when it does). + +**`sync/runner.py`** — `run_sync(cfg, dry_run=False, resync=False)` → `SyncResult`. +- `rclone_bin = shutil.which("rclone")`; `None` → `RcloneNotInstalledError` (resolving the full path avoids the partial-executable-path finding — Bandit B607 / Ruff S607). +- `check_rclone_version()` asserts ≥ 1.68.2 → `RcloneVersionError` if older. +- Builds an **argv list** (never a string, never `shell=True`); all argv elements come from **validated config** + a fixed flag list (no taint → satisfies CodeQL/DevSkim/Fortify). +- `subprocess.run(argv, capture_output=True, text=True, timeout=..., check=False)`; inspects `returncode` (no `check=True`). +- Parses the JSON log → per-file `FileEvent(kind, path, sha256)`; hashes added/modified files under `data/corpus/` with stdlib `hashlib.sha256` (streamed, 64 KiB chunks). +- Emits audit events (see §6). Maps nonzero exit → `SyncResult(success=False)` / `RcloneError` with `details={"rclone_exit": N}`. +- `reindex_exit_code_for(result, cfg)` → 0 / 10 / 1 (safety abort) / 2 (other failure). + +**`sync/scheduler.py`** — `get_scheduler(cfg)` factory → platform impl with idempotent `install()` / `remove()` / `status()`. +- **Linux:** prefer a **systemd `--user` timer + `Type=oneshot` service** (inherent overlap protection, journald logging, `Persistent=true` catch-up). Provide a **cron fallback** (tagged line, `flock -n` lockfile to prevent overlap). +- **macOS:** launchd `LaunchAgent` plist (`StartCalendarInterval`; won't relaunch a running job). 
+- **Windows:** `schtasks` Task Scheduler with "do not start a new instance" policy. +- Every registered job is tagged (`CYCLAW_DROPBOX_SYNC`) so install/remove only ever touches our own entry. +- **Wrapper-level lockfile** (`os.O_CREAT|os.O_EXCL` / `flock`) as belt-and-suspenders so a manual run can't collide with a scheduled one (rclone has no built-in single-instance guard). + +**`sync/cli.py`** — `python -m sync.cli SUBCOMMAND`. argparse; subcommands `setup [--schedule]`, `sync [--dry-run] [--resync]`, `test`, `schedule`, `unschedule`, `status`. No import of `gate`/`graph`/`mcp_hybrid_server`. Exit codes per §7. + +**`sync/selftest.py`** — pre-flight checks (rclone present + version, config valid, filter file writable, `rclone lsd` reachability dry-run, soul-exclusion asserted). Drives `cli test`. + +--- + +## 6. Audit event schema + +All through `utils.logger.audit_log()`. **Key rules** (from logger behavior at `utils/logger.py:106-119`): +- Do **not** name any field `query` (it would be SHA-256-hashed and lose readability). Use `file` / `local_path`. +- String fields (except `event`/`timestamp`/`query_hash`) pass through `redact_sensitive` — emails/IPs/secret-regex scrubbed automatically; path text otherwise preserved. +- Counts/booleans are ints/bools (not redacted) — keep them typed. +- **Never** place a refresh token or raw rclone stderr that could echo a secret into any field. Log only metadata. + +| Event | When | Key fields | +|---|---|---| +| `sync_started` | run begins | `direction`, `dry_run`, `remote`, `local_path`, `include_soul` | +| `sync_file_added` | per file | `file`, `sha256` | +| `sync_file_modified` | per file | `file`, `sha256` | +| `sync_file_deleted` | per file | `file` (no hash — bytes gone) | +| `sync_completed` | success | `direction`, `duration_sec`, `rclone_exit_code`, `counts`, `corpus_changed`, `dry_run` | +| `sync_failed` | failure | `direction`, `rclone_exit_code`, `errors_n`, `aborted_for_safety` | + +--- + +## 7. 
Exit-code contract (CLI) + +| Code | Meaning | Caller action | +|---|---|---| +| 0 | Success, no corpus change | none | +| **10** | Success **and `data/corpus/**` changed** | run `python -m retrieval.indexer`, then restart gateway | +| 1 | Aborted by safety fuse (`--max-delete`/`--max-transfer`) | investigate the remote; do **not** blindly raise the fuse | +| 2 | Sync failed (other) | inspect audit log / rclone log | +| 3 | Config/environment problem (rclone missing/old, config invalid) | fix env per error details | + +Cron-friendly chain: +```bash +python -m sync.cli sync; rc=$? +if [ "$rc" -eq 10 ]; then python -m retrieval.indexer; fi +``` + +--- + +## 8. `config.yaml` additive block + +Append a new top-level block — **touch no existing keys, no secrets**: + +```yaml +# =========================== +# sync: Dropbox corpus sync (out-of-band, rclone-based) +# =========================== +# Absence of this block disables sync entirely. Sync runs strictly via +# `python -m sync.cli` — never imported by gate.py, graph.py, or the MCP server. +# The Dropbox refresh token lives ONLY in rclone.conf, never here. 
+sync: + enabled: false # off by default; out-of-band only + local_path: "data/corpus" # validated: absolute, under repo data/corpus, a dir + remote_name: "dropbox_cyclaw" # must match `rclone listremotes` + remote_path: "CyClaw/corpus" # folder inside the Dropbox App Folder + direction: "pull" # "pull" (safe default) | "bisync" (opt-in, discouraged) + include_soul: false # leave false — data/personality/ is NOT sync-safe + reindex_on_change: true # exit 10 when data/corpus/** changed + checksum: true # rclone --checksum (hash compare, not mtime) + max_delete: 20 # safety fuse: abort if > N deletions + max_transfer: "1G" # safety fuse: abort if run would move > this + schedule_hour: 2 # 24h local time (cron/timer/Task Scheduler) + schedule_min: 0 + conflict_resolve: "newer" # bisync-only: newer modtime wins + conflict_loser: "rename" # bisync-only: loser saved as .conflict1 (never deleted) + # extra_excludes: # optional, appended AFTER hardened block + # - "scratch/**" +``` + +Loader note: `utils/logger._get_config()` caches `config.yaml`; `sync/` should load through it (and tests call `reset_config_cache()`). `gate.py` and `retrieval.indexer` each `yaml.safe_load` independently — they are not cache-coupled, so adding this block cannot perturb them. + +--- + +## 9. 
Errors — additive to `utils/errors.py` + +Follow the existing subclass pattern exactly (each subclass calls `super().__init__(message, code=..., details=details)`): + +```python +class SyncError(RAGError): + def __init__(self, message, details=None): + super().__init__(message, code="SYNC_ERROR", details=details) + +class RcloneNotInstalledError(SyncError): # code="RCLONE_NOT_INSTALLED" +class RcloneVersionError(SyncError): # code="RCLONE_VERSION_TOO_OLD" +class SyncConfigError(SyncError): # code="SYNC_CONFIG_INVALID" +class SchedulerError(SyncError): # code="SYNC_SCHEDULER_ERROR" +class SyncRuntimeError(SyncError): # code="SYNC_RUNTIME_ERROR" +``` + +Centralizing in `utils/errors.py` keeps the typed-error hierarchy uniform (matches `IndexNotFoundError`, `CorpusEmptyError`, etc.). These are never raised in the request path. + +--- + +## 10. Security & CI compliance checklist + +CyClaw runs **Ruff (incl. Bandit `S`), mypy strict, CodeQL, DevSkim, Fortify, OSV-Scanner, pip-audit**. The `sync/` package must pass all of them. + +| Gate | Requirement for `sync/` | +|---|---| +| **Ruff `select = [E,F,I,B,C4,UP,S]`** (`pyproject.toml:73`) | The `S` (Bandit) rules are the main concern: **`S602`** (`shell=True`) — never use it; **`S603`** (subprocess call) — argv list only, justify any residual with a targeted `# noqa: S603` + comment; **`S607`** (partial exe path) — resolve via `shutil.which("rclone")` to an absolute path. `sync/*` is **not** under the `tests/*` per-file-ignore, so it must be clean or explicitly suppressed with justification. Line length 120 (E501 ignored). | +| **mypy strict** (`:84`) | Full type annotations on every function/param/return, matching `utils/logger.py` style. | +| **CodeQL** (python, build-mode none) | No **tainted** value into argv. All argv elements come from validated/whitelisted config — not env, not network, not file contents. Static flag list + validated `remote`/`local_path` ⇒ no command-injection taint path. 
| +| **DevSkim** | Scans Python (ignores `*.md`/`*.json`). Add justified inline `# DevSkim: ignore DSxxxxxx` (the specific rule ID) only where provably safe (mirrors existing `config.yaml` usage), with a loopback/offline rationale. | +| **Fortify** | Only runs with `FOD_TENANT`/`SSC_URL`; same subprocess hygiene. Keep `SSC_TOKEN` out of logs (already redacted by `gate._sanitize_error`, but sync should never log it). | +| **OSV-Scanner + pip-audit** | Trigger on `requirements.txt`/`constraints.txt`/manifests. **Zero new deps ⇒ these files unchanged ⇒ nothing new to flag.** Do not add to `.osv-scanner.toml` (it ignores only the two accepted CVEs: chromadb, nltk). | +| **Coverage `fail_under = 80`** (`pyproject.toml:95`) | Add `"sync"` to `[tool.coverage.run] source` (`:91`) **and** ship `tests/test_sync.py` with adequate coverage. (Alternatively keep `sync` out of `source` to stay outside the gate — but adding it is the honest choice and PsyClaw shipped 25 mocked tests.) | + +**Secrets discipline (structural, not just scanner-satisfying):** +- Refresh token only in user-owned `rclone.conf` (mode 600 / restricted ACL). Never in repo, config.yaml, logs, audit events, or argv. +- `.gitignore`: add patterns for any local `rclone.conf` copy and sync state so they can never be committed. +- Sync logs rclone *metadata* only — never raw stderr that could echo a token. + +**Subprocess env hygiene:** spawn rclone with a minimal clean env; do **not** enable `--rc` (remote control) or any rclone usage reporting; keep its network strictly Dropbox-API-only. + +--- + +## 11. Test plan (`tests/test_sync.py`) + +Follow CyClaw conventions (`tests/conftest.py`, `test_audit.py`, `test_gate.py`): **fully mocked, no network, plain sync tests** (the suite avoids `pytest.mark.asyncio`; sync is synchronous CLI code). + +- `autouse` fixture calling `reset_config_cache()` before/after each test (sync exercises `audit_log`/`_get_config`). 
+- `tmp_path` fixture writing a temp `config.yaml` with a `sync:` block and `logging.audit_file` → `tmp_path`. +- **Mock the boundary:** `patch("sync.runner.subprocess.run", return_value=MagicMock(returncode=0, stdout="...", stderr=""))` and `patch("sync.runner.shutil.which", return_value="/usr/bin/rclone")`. No rclone binary required. + +Coverage targets (port PsyClaw's 25-test suite intent): +1. **argv is a list**, contains no `shell=True`, no untrusted interpolation; binary path absolute. +2. Version gate: `< 1.68.2` raises `RcloneVersionError`; missing binary raises `RcloneNotInstalledError`. +3. Config validation: rejects relative `local_path`, `..` escape, path outside `data/corpus`, bad `direction`, bad `remote_name` (metacharacters / leading `-`), out-of-range schedule. +4. Filter generation: `data/personality/**` present by default; **removed** only when `include_soul=true` (and WARNING header emitted); `extra_excludes` appended after hardened block. +5. Log parsing → `FileEvent`s; `corpus_changed` true only when a `data/corpus/**` path appears; exit code 10 wiring. +6. Audit events: correct event names, `sha256` populated for added/modified, **no secret-bearing fields**, no field named `query`. +7. Safety: simulated `--max-delete` abort → `success=False`, `aborted_for_safety=True`, exit 1. +8. Scheduler: idempotent install/remove with mocked `subprocess`/crontab/schtasks; tagged-line add/replace/remove; no touching of unrelated entries. +9. CLI: each subcommand returns the documented exit code (mock the runner). + +--- + +## 12. Documentation deliverables + +- **`docs/SYNC_README.md`** — operator guide: install rclone (≥1.68.2), App-Folder OAuth via `rclone config`, edit the `sync:` block, `setup`/`test`/`sync --dry-run`/`schedule`, exit-code chain, troubleshooting table, and an explicit "why pull / why soul is excluded" security section. 
+- **`README.md`** — one paragraph under the roadmap marking v1.4.0 "Dropbox corpus sync" delivered + link to `docs/SYNC_README.md`. (Additive; no security-claim changes.) +- **`docs/SETUP.md`** — mention rclone as an optional external binary (like LM Studio). + +--- + +## 13. Phased delivery (recommended PR sequencing) + +To keep review tractable, land in small reviewed PRs on the feature branch: + +1. **PR-1 — Scaffolding & errors:** `utils/errors.py` additions, `sync/__init__.py`, `sync/config.py` + `sync/filters.py`, `config.yaml` block, `tests/test_sync.py` (config + filter tests only), coverage source update. *No subprocess yet.* Easiest to review; establishes the contract. +2. **PR-2 — Runner:** `sync/runner.py` (version check, argv builders, run, log parse, hashing, audit), runner tests (mocked subprocess). Security-tooling review focus. +3. **PR-3 — CLI & self-test:** `sync/cli.py`, `sync/selftest.py`, CLI tests. +4. **PR-4 — Scheduler:** `sync/scheduler.py` (systemd/cron/launchd/schtasks), scheduler tests. +5. **PR-5 — Docs:** `docs/SYNC_README.md`, README/SETUP touch-ups. + +(If preferred, ship as one larger PR mirroring PsyClaw v1.0 — but the phased route gives the security scanners a smaller surface per review.) + +--- + +## 14. 
Risk register + +| Risk | Likelihood | Mitigation | +|---|---|---| +| Synced file overwrites `soul.md`/soul DB, bypassing governance | Low (excluded by default) | Hardened filter excludes `data/personality/**` + `*.db*`; `include_soul` opt-in is loud + discouraged; test asserts exclusion | +| Command injection via config into argv | Low | Whitelist-validate `remote_name`/`local_path`; argv list; no `shell=True`; CodeQL/DevSkim gates | +| Refresh token leaks into repo/logs | Low | Token only in `rclone.conf` (600); sync logs metadata only; `.gitignore` guards | +| rclone CVE (symlink) | Mitigated | Pin ≥1.68.2; never pass `--links`/`--metadata` | +| Expensive needless reindex | Medium | Reindex only on `data/corpus/**` change (exit 10); never inline/in-process | +| Overlapping scheduled + manual run | Low | `Type=oneshot`/launchd/Task "no new instance" + wrapper lockfile | +| Accidental mass deletion from remote | Low | Default `pull`/`copy` never deletes; `--max-delete`/`--max-transfer`/`--check-first` fuses | +| Stale index served after sync | Medium | Document required gateway restart after reindex | + +--- + +## 15. Definition of done + +- [ ] `sync/` package added; **no** import of it in `gate.py`/`graph.py`/`mcp_hybrid_server.py` (grep-verified in CI or review). +- [ ] No FastAPI route, no listener, no graph node/edge added. +- [ ] Zero new entries in `requirements.txt` / `constraints.txt` / `pyproject` dependencies. +- [ ] `config.yaml` `sync:` block additive; no existing key changed; no secret in it. +- [ ] `data/personality/**` excluded by default; `include_soul` opt-in is loud. +- [ ] All audit events flow through `utils.logger.audit_log`; no field named `query`; no secret fields. +- [ ] rclone invoked via argv list, `shutil.which`, version ≥1.68.2 asserted, no `--links`/`--metadata`, no `shell=True`. +- [ ] Ruff(S)/mypy-strict/CodeQL/DevSkim/Fortify clean (justified inline suppressions only); OSV/pip-audit unaffected. 
+- [ ] `tests/test_sync.py` fully mocked (no network), `reset_config_cache()` hygiene, ≥80% coverage with `"sync"` in coverage source. +- [ ] `docs/SYNC_README.md` ships; README roadmap updated. +- [ ] Each of the 8 invariants in §2 demonstrably preserved in the PR description. + +--- + +## 16. Three-subagent implementation delegation + +The implementation is split across **three subagent roles** working against a **frozen interface contract** (Appendix C). Because Roles B and C both import the types Role A owns (`RcloneConfig`, the `SyncError` hierarchy), the safe ordering is **A first (solo), then B + C in parallel**. This is the only hard dependency edge; once A's contract is on disk, B and C touch **disjoint files** and never import each other's *implementations* (only the frozen signatures), so they parallelize cleanly. + +### 16.1 Role A — Foundation (runs first, solo) + +Owns the data contract everything else codes against. + +| Deliverable | Notes | +|---|---| +| `utils/errors.py` (additive edit) | Add `SyncError` + 5 subclasses (Appendix C-1). No change to existing classes. | +| `sync/__init__.py` | Public re-exports + `__version__`. | +| `sync/config.py` | `RcloneConfig` dataclass (Appendix C-2) + `load_sync_config()`; full validation; loads via `utils.logger._get_config()`. | +| `sync/filters.py` | `generate_filters(cfg)` / `write_filter_file(cfg)` / `filter_summary(cfg)` — hardened denylist (§4.3), conditional soul line. | +| `config.yaml` (additive edit) | Append the `sync:` block (§8). Touch no existing keys. | +| `.gitignore` (additive edit) | Ignore local `rclone.conf` copies + `.rclone-state/`. | +| `pyproject.toml` (additive edit) | Add `"sync"` to `[tool.coverage.run] source`. | +| `tests/test_sync_config.py`, `tests/test_sync_filters.py` | Self-contained; `--noconftest`-runnable. 
| + +**Exit criterion for A:** `python3.12 -c "from sync.config import RcloneConfig, load_sync_config; from sync.filters import generate_filters; from utils.errors import SyncError"` succeeds, and A's tests pass under `pytest --noconftest`. + +### 16.2 Role B — Runner + CLI (after A; parallel with C) + +| Deliverable | Notes | +|---|---| +| `sync/runner.py` | `check_rclone_version()` (floor **1.68.2**), argv builders (argv list, no `shell=True`), `run_sync()`, log parse → `FileEvent`, `hashlib` SHA-256, audit emit, `reindex_exit_code_for()`. | +| `sync/selftest.py` | `run_self_test()` pre-flight (drives `cli test`). | +| `sync/cli.py` | argparse entry `python -m sync.cli {setup,sync,test,schedule,unschedule,status}`. **Imports `sync.scheduler.get_scheduler` lazily inside the `schedule`/`unschedule`/`setup --schedule` handlers** so B is testable with the scheduler mocked and B↔C stay decoupled. | +| `tests/test_sync_runner.py`, `tests/test_sync_cli.py` | Patch `sync.runner.subprocess.run` + `sync.runner.shutil.which`; assert argv is a list, no `shell=True`, version gate, exit codes, audit fields (no `query` key, no secrets). | + +### 16.3 Role C — Scheduler + Docs (after A; parallel with B) + +| Deliverable | Notes | +|---|---| +| `sync/scheduler.py` | `ScheduleEntry` dataclass + `get_scheduler(cfg)` factory → `CronScheduler` (Linux/macOS) and `WindowsTaskScheduler`; idempotent `install`/`remove`/`status`; tagged `CYCLAW_DROPBOX_SYNC`. (systemd-timer note documented; cron is the portable baseline with overlap caveat.) | +| `tests/test_sync_scheduler.py` | Patch `subprocess.run`/`crontab`/`schtasks`; assert tagged add/replace/remove, no touching of unrelated entries. | +| `docs/SYNC_README.md` | Operator guide (install rclone ≥1.68.2, App-Folder OAuth, config, usage, exit codes, troubleshooting, security rationale). | +| `README.md` (additive touch) | Mark roadmap item delivered + link to `docs/SYNC_README.md`. 
| + +### 16.4 Sequencing & "use 1 subagent until you can parallelize" rule + +1. **Launch Role A alone.** Wait for completion + verify the exit criterion myself. +2. **If A's contract matches Appendix C**, launch **Role B and Role C in parallel** (single message, two agents, disjoint files). +3. **If A deviated** from the contract (renamed a field, changed a signature), I reconcile it myself first (or re-task A) so B and C build against a stable interface — *only then* parallelize. Any ambiguity → fall back to a single sequential subagent rather than risk a parallel interface clash. +4. **Integration verification is mine, not the subagents'** (§16.5). Subagents do **not** run `git` and do **not** commit. + +### 16.5 Verification protocol (orchestrator-owned, this environment) + +- **Import/runtime under real Python 3.12:** `/usr/bin/python3.12` (has pyyaml) must import the whole `sync` package and run `python3.12 -m sync.cli status`/`test` against a temp config — with rclone absent, the expected, *clean* outcome is `RCLONE_NOT_INSTALLED` / exit 3 (this itself verifies the not-installed path). +- **Test suite under Python 3.12:** `/tmp/py312venv/bin/python -m pytest tests/test_sync_*.py --noconftest -q`. (`--noconftest` is required because the repo's `tests/conftest.py` imports `chromadb`, which isn't installed in this sandbox; sync tests are deliberately self-contained so they don't need it.) +- **Lint/type gates:** `ruff check sync tests/test_sync_*.py` (must be clean on the `S`/`B`/`UP`/`I` rules; justified inline `# noqa: S603` only) and `mypy sync` (strict-compatible annotations). +- **Invariant grep:** confirm no `import sync` / `from sync` appears in `gate.py`, `graph.py`, `mcp_hybrid_server.py`. 
+- **No-auth guarantee:** confirm no Dropbox token/key/secret is written anywhere; rclone is never actually authenticated or invoked against the network during verification (all subprocess calls are mocked in tests; live CLI runs stop at the rclone-missing/version gate). + +--- + +## Appendix A — PsyClaw → CyClaw mapping (why this is low-risk) + +| PsyClaw | CyClaw | Same? | +|---|---|---| +| `utils.logger.audit_log(event, config_path)` | identical | ✅ | +| `utils.errors.RAGError(message, code, details)` | identical | ✅ | +| `data/corpus/` | `data/corpus/` | ✅ | +| `data/personality/soul.md` + `psyclaw_soul.db` | `soul.md` + `cyclaw_soul.db` | ✅ (rename) | +| `index/chroma_db`, `index/bm25.json` | identical | ✅ | +| `.emb_cache/` | identical | ✅ | +| `python -m retrieval.indexer` | identical | ✅ | +| `POST /soul/apply` governance | identical | ✅ | +| config.yaml single source | identical | ✅ | +| offline-first / minimal-deps / no telemetry | identical | ✅ | + +The PsyClaw `sync/` v1.0 module (config/filters/runner/scheduler/cli/tests) ports almost verbatim; the substantive **net-new** decisions for CyClaw are: **rclone floor raised to 1.68.2** (CVE-2024-52522), **systemd `--user` oneshot timer preferred** over bare cron on Linux, **`*.db*` added** to the hardened excludes (soul DB), and **`"sync"` added to coverage source** to satisfy CI's 80% gate. + +## Appendix C — Frozen interface contract (so Roles B & C parallelize safely) + +Role A produces exactly these public surfaces; Roles B and C import only these. 
+ +**C-1 — `utils/errors.py` additions** (each subclass sets its own `code`): + +```python +class SyncError(RAGError): # code="SYNC_ERROR" +class RcloneNotInstalledError(SyncError): # code="RCLONE_NOT_INSTALLED" +class RcloneVersionError(SyncError): # code="RCLONE_VERSION_TOO_OLD" +class SyncConfigError(SyncError): # code="SYNC_CONFIG_INVALID" +class SchedulerError(SyncError): # code="SYNC_SCHEDULER_ERROR" +class SyncRuntimeError(SyncError): # code="SYNC_RUNTIME_ERROR" +# signature for all: __init__(self, message: str, details: Optional[dict] = None) +``` + +**C-2 — `sync/config.py` public surface:** + +```python +@dataclass +class RcloneConfig: + local_path: str # validated absolute, under repo data/corpus, a dir + remote_name: str = "dropbox_cyclaw" + remote_path: str = "CyClaw/corpus" + direction: str = "pull" # "pull" | "bisync" + include_soul: bool = False + reindex_on_change: bool = True + checksum: bool = True + max_delete: int = 20 + max_transfer: str = "1G" + conflict_resolve: str = "newer" + conflict_loser: str = "rename" + schedule_hour: int = 2 + schedule_min: int = 0 + workdir: Optional[str] = None # bisync state dir (default under rclone state dir) + filter_file: Optional[str] = None + log_dir: Optional[str] = None + extra_excludes: List[str] = field(default_factory=list) + REINDEX_EXIT_CODE: int = 10 + # properties: + @property + def remote(self) -> str # f"{remote_name}:{remote_path}" + @property + def log_path(self) -> str # os.path.join(log_dir, "rclone_cyclaw.log") + @property + def is_windows(self) -> bool + +def load_sync_config(config_path: str = "config.yaml") -> RcloneConfig: ... +``` + +**C-3 — `sync/filters.py` public surface:** + +```python +def generate_filters(cfg: RcloneConfig) -> str: ... +def write_filter_file(cfg: RcloneConfig) -> str: ... # returns abs path written +def filter_summary(cfg: RcloneConfig) -> dict: ... 
+``` + +**C-4 — `sync/scheduler.py` public surface (Role C; consumed lazily by Role B's cli):** + +```python +@dataclass +class ScheduleEntry: + platform_name: str + command: str + cron_or_time: str + raw: str + +def get_scheduler(cfg: RcloneConfig): ... # -> CronScheduler | WindowsTaskScheduler +# scheduler objects expose: install() -> ScheduleEntry +# remove() -> bool +# status() -> Optional[ScheduleEntry] +TASK_TAG = "CYCLAW_DROPBOX_SYNC" +WINDOWS_TASK_NAME = "CyClaw Dropbox Sync" +``` + +**C-5 — `sync/runner.py` public surface (Role B):** + +```python +MIN_RCLONE_MAJOR, MIN_RCLONE_MINOR, MIN_RCLONE_PATCH = 1, 68, 2 +def check_rclone_version(rclone_bin: str = "rclone") -> Tuple[int, int, int]: ... +@dataclass +class FileEvent: kind: str; path: str; sha256: Optional[str] = None +@dataclass +class SyncResult: success: bool; direction: str; ...; corpus_changed: bool +def run_sync(cfg, dry_run=False, resync=False, rclone_bin="rclone") -> SyncResult: ... +def reindex_exit_code_for(result: SyncResult, cfg: RcloneConfig) -> int: ... +``` + +## Appendix D — Test isolation in CI vs. this sandbox + +The repo's `tests/conftest.py` imports `retrieval.hybrid_search` (→ `chromadb`). In CI those deps are installed, so the full suite (incl. `tests/test_sync_*.py`) collects normally. In a minimal sandbox without chromadb, run sync tests with **`pytest --noconftest tests/test_sync_*.py`**. To keep that possible, **sync tests must be self-contained**: they may import `reset_config_cache` directly and use the builtin `tmp_path` fixture, but must **not** depend on `conftest.py` fixtures. (`tests/*` are exempt from Bandit `S101/S603/S108` via `pyproject.toml` per-file-ignores, so `assert` and mocked subprocess in tests are fine.) 
+ +## Appendix B — Key source references + +- rclone: filtering, bisync, docs (max-delete/max-transfer/check-first/exit codes), Dropbox backend — rclone.org +- CVE-2024-52522 (GHSA-hrxh-9w67-g4cv) — fixed in rclone 1.68.2 +- Dropbox OAuth (offline access, scoped apps, App Folder, content_hash) — developers.dropbox.com / dropbox.tech +- CyClaw code: `gate.py`, `graph.py`, `utils/logger.py:106`, `utils/errors.py:10`, `config.yaml`, `retrieval/indexer.py`, `pyproject.toml:73,91,95`, `.gitignore` +- Prior art: PsyClaw Sync v1.0 (`sync/` package + `SYNC_README.md`) diff --git a/docs/SYNC_README.md b/docs/SYNC_README.md new file mode 100644 index 0000000..7996b11 --- /dev/null +++ b/docs/SYNC_README.md @@ -0,0 +1,330 @@ +# CyClaw Sync — Rclone Dropbox Corpus Integration + +**Module:** `sync/` (Dropbox corpus sync, v1.4.0 cycle) +**Status:** Out-of-band, audit-logged, **zero new Python dependencies**, no FastAPI surface. + +CyClaw's sync module mirrors a Dropbox folder into your local `data/corpus/` +without weakening any of CyClaw's security invariants. It is a thin Python +wrapper around the `rclone` binary, runs as a **separate process** (cron / +systemd timer / launchd / Task Scheduler), and emits per-file audit events into +the same `logs/audit.jsonl` the gateway uses. + +> **Sync is NOT a graph node and NOT a FastAPI endpoint.** It is invoked **only** +> via `python -m sync.cli`. CyClaw's request path — `gate.py`, `graph.py`, +> `mcp_hybrid_server.py` — never imports anything from `sync/`. There is no new +> listener, no new route, and no new dependency. 
+ +--- + +## File layout + +``` +CyClaw/ +├── sync/ +│ ├── __init__.py public API re-exports + __version__ +│ ├── config.py RcloneConfig dataclass + validating YAML loader +│ ├── filters.py cyclaw_filters.txt generator (hardened denylist) +│ ├── runner.py rclone subprocess + JSON-log parser + SHA-256 audit +│ ├── scheduler.py cron + Task Scheduler abstraction (idempotent) +│ ├── selftest.py pre-flight self-test +│ └── cli.py python -m sync.cli entry point +├── tests/ +│ ├── test_sync_config.py +│ ├── test_sync_filters.py +│ ├── test_sync_runner.py +│ ├── test_sync_cli.py +│ └── test_sync_scheduler.py fully mocked, no network +└── docs/ + └── SYNC_README.md this guide +``` + +The Dropbox refresh token lives **only** in `rclone.conf`, owned by your user. +CyClaw's process never sees it; it is never written to `config.yaml`, the repo, +the audit log, or any argv. + +--- + +## Security posture (read this before you sync anything) + +| Default | Why | +|---|---| +| **`direction: pull`** | `rclone copy` never deletes at the destination. One-way pull is the safest default for an RAG corpus. Bidirectional `bisync` is a silent-rewrite path into governed state. | +| **`include_soul: false`** | `data/personality/` is governed via `POST /soul/apply` with a human reason string and an injection scan. Replicating it via cron bypasses that gate. **This is the single most important path-safety rule.** | +| **Hardened exclude list** | Model weights, indices, caches, venvs, logs, secrets, `.git`, and the soul DB (`*.db*`) are all excluded by default. See `sync/filters.py`. | +| **`max_delete: 20`** | rclone aborts the run if more than 20 deletions would occur. Tune up only when you understand exactly why. | +| **Per-file SHA-256 audit** | Every added/modified file under `data/corpus/` gets a SHA-256 hash logged in `logs/audit.jsonl`. | +| **No gateway surface** | No FastAPI endpoint, no socket, no listener, no graph node/edge. 
The only outbound call is `rclone` → Dropbox, operator-initiated, out-of-band. | +| **Zero new deps** | stdlib + existing `pyyaml`/`utils.*` only. `rclone` is an external binary, installed out-of-band like LM Studio. | + +If you flip `include_soul: true`, `python -m sync.cli setup` prints a loud +`[WARN]` and the generated filter file carries a warning header — so it is never +an accident. + +### Why these invariants hold + +1. **RAG-First.** Sync only changes files on disk that are later consumed at + index time. The retrieve node remains the graph entry point. `graph.py` is + not modified. +2. **Topology = Policy.** Sync adds no StateGraph node and no edge. Corpus + mutation stays *outside* the graph, exactly as soul evolution does. +3. **Triple-Gated External.** Dropbox is contacted only by the `rclone` + subprocess you explicitly invoke — never triggered by a user query, never an + inbound surface. +4. **Audit Convergence.** Sync events terminate in the same `audit.jsonl` with + the same PII redaction and timestamp handling as the gateway. +5. **Soul Governance.** `data/personality/**` and `*.db*` are excluded by + default; a synced file can never overwrite `soul.md` / `cyclaw_soul.db` and + bypass the `apply_evolution()` injection scan behind `POST /soul/apply`. + +--- + +## Step 1 — Install rclone (**≥ 1.68.2**) + +A hard floor of **rclone ≥ 1.68.2** is enforced at runtime. This version fixes +**CVE-2024-52522** (insecure symlink handling with `--links`/`--metadata`). +CyClaw never passes `--links` or `--metadata`, but the floor is asserted anyway +as defense in depth. Older rclone → `RCLONE_VERSION_TOO_OLD` (exit 3). 
+ +### Linux / macOS / WSL +```bash +curl https://rclone.org/install.sh | sudo bash +``` + +### Windows +```powershell +winget install Rclone.Rclone +``` + +Verify: +```bash +rclone version # must show v1.68.2 or higher +``` + +After install, restrict the config file once you've authenticated: +```bash +chmod 600 ~/.config/rclone/rclone.conf # Linux/macOS; restrict ACLs on Windows +``` + +--- + +## Step 2 — Create an App-Folder-scoped Dropbox remote + +Use an **App Folder**-scoped Dropbox app (least privilege), **not** Full +Dropbox. Run rclone's interactive config and complete the browser OAuth flow: + +```bash +rclone config +# n) New remote +# name> dropbox_cyclaw (must match config.yaml remote_name) +# Storage> dropbox +# Use App Folder access (scoped), not Full Dropbox. +# Complete the browser OAuth — choose "offline access". +``` + +The resulting **refresh token lives only in `rclone.conf`** +(`~/.config/rclone/rclone.conf`, or `%APPDATA%\rclone\rclone.conf` on Windows), +owned by your user, managed entirely by rclone. CyClaw never holds it. + +> Higher-security option: enable rclone config encryption +> (`rclone config` → `s) Set configuration password`). Trade-off: an unattended +> scheduled run then needs the password supplied (e.g. via +> `RCLONE_CONFIG_PASS`), which is its own secret to manage. + +--- + +## Step 3 — Edit the `sync:` block in `config.yaml` + +The additive `sync:` block was appended to `config.yaml`. No secrets go here — +only metadata. Absence of the block disables sync entirely. 
+ +```yaml +sync: + enabled: false # off by default; `sync.cli sync` no-ops (exit 0) while false + local_path: "data/corpus" # validated: absolute, under repo data/corpus, a dir + remote_name: "dropbox_cyclaw" # must match `rclone listremotes` + remote_path: "CyClaw/corpus" # folder inside the Dropbox App Folder + direction: "pull" # "pull" (safe default) | "bisync" (opt-in, discouraged) + include_soul: false # leave false — data/personality/ is NOT sync-safe + reindex_on_change: true # exit 10 when data/corpus/** changed + checksum: true # rclone --checksum (hash compare, not mtime) + max_delete: 20 # safety fuse: abort if > N deletions + max_transfer: "1G" # safety fuse: abort if run would move > this + schedule_hour: 2 # 24h local time (cron / Task Scheduler) + schedule_min: 0 + conflict_resolve: "newer" # bisync-only: newer modtime wins + conflict_loser: "rename" # bisync-only: loser saved as .conflict1 (never deleted) + # extra_excludes: # optional, appended AFTER the hardened block + # - "scratch/**" +``` + +--- + +## Step 4 — Validate + +```bash +cd /path/to/CyClaw + +# Pre-flight self-test (rclone present + version, config valid, +# filter file writable, remote reachability dry-run, soul-exclusion asserted) +python -m sync.cli test + +# The pytest unit suite (mocked, no network) +pytest tests/test_sync_*.py +``` + +With rclone absent, `python -m sync.cli test` / `status` exits **3** with +`RCLONE_NOT_INSTALLED` — that clean failure is itself the not-installed path +working as intended. 
+ +--- + +## Daily usage + +```bash +python -m sync.cli setup # validate config + write filter file + print OAuth steps +python -m sync.cli setup --schedule # also register the daily job in one shot +python -m sync.cli sync --dry-run # preview — nothing changes +python -m sync.cli sync # live sync (pull by default) +python -m sync.cli status # current state + last schedule +python -m sync.cli schedule # (re-)register the daily job +python -m sync.cli unschedule # remove the daily job +``` + +Sync is invoked **only** via `python -m sync.cli`; it is never imported by the +gateway or the graph. + +### Exit codes + +| Code | Meaning | Caller action | +|---|---|---| +| 0 | Success, no corpus change | none | +| **10** | Success **and `data/corpus/**` changed** | run `python -m retrieval.indexer`, then **restart the gateway** | +| 1 | Aborted by a safety fuse (`--max-delete` / `--max-transfer`) | investigate the remote; do **not** blindly raise the fuse | +| 2 | Sync failed (other) | inspect the audit log / rclone log | +| 3 | Config / environment problem (rclone missing or old, config invalid) | fix per the error details | + +The retriever is constructed at import time in `gate.py`, so a **gateway +restart is required** to pick up the rebuilt index after a reindex. + +### Cron-friendly reindex chain + +```bash +python -m sync.cli sync; rc=$? +if [ "$rc" -eq 10 ]; then + python -m retrieval.indexer # full rebuild: Chroma + BM25 + # then restart the gateway so it loads the new index +fi +``` + +`retrieval.indexer` runs as a **separate process** — never inline, never in the +gateway event loop. + +--- + +## Scheduling + +`python -m sync.cli schedule` registers a single tagged daily job and is +idempotent (re-running replaces our own entry, never touches yours). 
+ +| Platform | Mechanism | Tag | +|---|---|---| +| Linux / macOS / WSL | `crontab` — one line `MIN HOUR * * * # CYCLAW_DROPBOX_SYNC` (via `crontab -l` / `crontab -`, never `crontab -e`) | comment `CYCLAW_DROPBOX_SYNC` | +| Windows | `schtasks /Create /SC DAILY /ST HH:MM /RL LIMITED /F` | task name `CyClaw Dropbox Sync` | + +The scheduled command `cd`s into the repo root (so `config.yaml` resolves) and +runs `python -m sync.cli sync`. On Windows the task points at a generated +`cyclaw_sync.bat` launcher (written next to the rclone logs) rather than an +inline `cmd /c` string — this avoids the quote fragility of passing a full +command with a space-containing repo path through `schtasks /TR`. + +> **Overlap protection:** `run_sync` holds a single-instance lock (an atomically +> created lock directory under the rclone log dir) for the duration of a run, so +> a scheduled run and a manual run cannot drive rclone concurrently — the second +> exits with `SYNC_RUNTIME` rather than racing. A lock left behind by a crashed +> run is reclaimed automatically after 3 hours. +> +> **More robust Linux option:** a systemd `--user` `Type=oneshot` service driven +> by a timer unit additionally gives journald logging and `Persistent=true` +> catch-up after downtime. Cron is the implemented portable baseline (it works on +> macOS/WSL/BSD too). 
+ +--- + +## Conflict resolution (bisync mode only) + +`direction: bisync` is **opt-in and discouraged.** If you enable it: + +| Scenario | Behavior | +|---|---| +| Both sides changed the same file | rclone detects via its `.lst` baseline | +| Winner | Newer `modtime` wins (`conflict_resolve: newer`) | +| Loser | Renamed `.conflict1` — **never silently deleted** | +| Your job | Review `.conflict*` files, delete the unwanted version | +| Next sync | Your choice propagates to both sides | + +--- + +## What is NOT synced (filter exclusions) + +| Pattern | Reason | +|---|---| +| `data/personality/**` | Soul layer — governed via `POST /soul/apply`, never via file replication | +| `*.gguf`, `*.bin`, `*.safetensors`, `*.onnx`, `*.pt`, `*.pth` | AI model weights — managed locally | +| `index/**`, `.chroma/**` | ChromaDB + BM25 — rebuildable via `python -m retrieval.indexer` | +| `.emb_cache/**` | embeddings cache — auto-downloads | +| `venv/**`, `.venv/**`, `env/**` | virtualenvs — rebuild via `pip install -r requirements.txt` | +| `__pycache__/**`, `*.pyc`, `*.pyo` | bytecode | +| `logs/**`, `*.log`, `*.jsonl` | local forensic data incl. `audit.jsonl` — never share across machines | +| `.env`, `*.pem`, `*.key`, `*_secret*`, `credentials*` | secrets | +| `*.db`, `*.db-wal`, `*.db-shm` | governed soul DB state (`cyclaw_soul.db`) | +| `.git/**`, `.gitignore` | use `git push/pull` separately | +| `.DS_Store`, `Thumbs.db`, `desktop.ini`, `.idea/**`, `.vscode/**` | OS/editor noise | +| `.rclone-state/**`, `*.rclone.lst*` | rclone's own state | + +`extra_excludes:` in the `sync:` block is appended **after** the hardened block. +First-match-wins, most-restrictive-first — you can tighten further but cannot +accidentally re-include something the hardened rules already excluded. + +--- + +## Audit events + +Every sync run emits these into `logs/audit.jsonl` via `utils.logger.audit_log()` +— the same path the gateway uses, with the same PII redaction. 
Only metadata is +logged; never a token and never raw rclone stderr that could echo a secret. + +| Event | When | Key fields | +|---|---|---| +| `sync_started` | run begins | `direction`, `dry_run`, `remote`, `local_path`, `include_soul` | +| `sync_file_added` | per file | `file`, `sha256` | +| `sync_file_modified` | per file | `file`, `sha256` | +| `sync_file_deleted` | per file | `file` (no hash — bytes gone) | +| `sync_completed` | success | `direction`, `duration_sec`, `rclone_exit_code`, `counts`, `corpus_changed`, `dry_run` | +| `sync_failed` | failure | `direction`, `rclone_exit_code`, `errors_n`, `aborted_for_safety` | + +No field is ever named `query` (that would be SHA-256-hashed by the logger). + +--- + +## Troubleshooting + +| Symptom | Likely cause / fix | +|---|---| +| `RCLONE_NOT_INSTALLED` (exit 3) | Install rclone (Step 1) | +| `RCLONE_VERSION_TOO_OLD` (exit 3) | Upgrade to **v1.68.2** or higher (CVE-2024-52522 fix) | +| `SYNC_CONFIG_INVALID` (exit 3) | Check the `sync:` block in `config.yaml` — error details name the failing field | +| `aborted_for_safety: true` (exit 1) | A safety fuse tripped (`--max-delete`/`--max-transfer`). Either many files were genuinely changed upstream (raise the fuse only if intentional) or the remote is wrong — investigate, don't blindly raise it. | +| `unknown_keys` warning at setup | Typo in `config.yaml` — extra `sync:` keys are ignored but flagged | +| `SYNC_SCHEDULER_ERROR` | `crontab`/`schtasks` not on PATH (e.g. running schtasks under WSL), or the scheduler write failed — see the error details | +| Soul changed on a second machine | `include_soul` was set to true. Set it back to false and rebuild soul from the canonical machine's `data/personality/` via `POST /soul/apply`. | +| Stale answers after a sync | The gateway caches the index at import time. After exit 10 + reindex, **restart the gateway**. 
| + +--- + +## Why rclone (and not Maestral or the Dropbox SDK) + +| Tool | Verdict | +|---|---| +| **rclone** ✅ | Battle-tested transport, **zero Python deps**, runs as a separate process, refresh token stays in `rclone.conf`. The CLI surface is the security boundary. Native Windows support. | +| Maestral | Always-on **daemon** — wrong shape for an offline-first agent; no native Windows. | +| `dropbox` Python SDK | Forces CyClaw to **hold the refresh token** and drags `requests`/`urllib3` into a deliberately minimal dependency tree. | diff --git a/pyproject.toml b/pyproject.toml index 474c4d6..5aa17b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,7 +88,7 @@ exclude_dirs = ["tests"] skips = ["B101"] [tool.coverage.run] -source = ["gate", "graph", "mcp_hybrid_server", "metrics", "llm", "retrieval", "utils"] +source = ["gate", "graph", "mcp_hybrid_server", "metrics", "llm", "retrieval", "utils", "sync"] omit = ["tests/*", "*/test_*", "*/__pycache__/*"] [tool.coverage.report] diff --git a/sync/__init__.py b/sync/__init__.py new file mode 100644 index 0000000..9388c2d --- /dev/null +++ b/sync/__init__.py @@ -0,0 +1,45 @@ +"""CyClaw Dropbox corpus sync package. + +Out-of-band, rclone-based mirror of a Dropbox folder into the local corpus. +Runs strictly as a separate process (cron / systemd timer / launchd / Task +Scheduler), never imported by gate.py, graph.py, or mcp_hybrid_server.py -- +which preserves CyClaw's RAG-first and topology-as-policy invariants by +construction. 
+ +Public API: + from sync import RcloneConfig, load_sync_config, generate_filters, SyncError + +Usage from the CLI: + python -m sync.cli setup + python -m sync.cli sync [--dry-run] + python -m sync.cli test + python -m sync.cli schedule + python -m sync.cli unschedule + python -m sync.cli status +""" + +from sync.config import RcloneConfig, load_sync_config +from sync.filters import generate_filters, write_filter_file +from utils.errors import ( + RcloneNotInstalledError, + RcloneVersionError, + SchedulerError, + SyncConfigError, + SyncError, + SyncRuntimeError, +) + +__all__ = [ + "RcloneConfig", + "load_sync_config", + "generate_filters", + "write_filter_file", + "SyncError", + "RcloneNotInstalledError", + "RcloneVersionError", + "SyncConfigError", + "SchedulerError", + "SyncRuntimeError", +] + +__version__ = "1.0.0" diff --git a/sync/cli.py b/sync/cli.py new file mode 100644 index 0000000..7d50f10 --- /dev/null +++ b/sync/cli.py @@ -0,0 +1,323 @@ +"""Command-line entry point: ``python -m sync.cli ``. + +Subcommands: + + setup Verify rclone, load+show config, write filters, print the + Dropbox OAuth hint, optionally schedule the daily job. + sync Run one sync now. ``--dry-run`` previews; ``--resync`` rebuilds + the bisync baseline. + test Run the pre-flight self-test. + schedule Register the daily job (cron / launchd / Task Scheduler). + unschedule Remove the daily job. + status Print current sync + schedule status. + +Exit codes (see §7 of the implementation plan): + 0 success, no corpus change + 10 success and data/corpus/** changed -- run `python -m retrieval.indexer` + 1 aborted by a safety fuse (--max-delete / --max-transfer) + 2 sync failed (other) + 3 config / environment problem (rclone missing or too old, config invalid) + +This module never imports gate.py, graph.py, or mcp_hybrid_server.py. 
# Process exit codes returned by the subcommand handlers below.
EXIT_OK = 0        # success, no corpus change
EXIT_SAFETY = 1    # aborted by a safety fuse
EXIT_FAIL = 2      # sync (or self-test) failed
EXIT_ENV = 3       # config / environment problem
EXIT_REINDEX = 10  # success and the corpus changed


def get_scheduler(cfg: Any) -> Any:
    """Lazy proxy to ``sync.scheduler.get_scheduler``.

    The import happens inside the function so ``sync.cli`` imports even if the
    scheduler module is absent, and so the import is deferred until a
    scheduling subcommand actually runs. Tests patch this name directly.

    Raises:
        ImportError: if ``sync.scheduler`` cannot be imported. Every handler
            that calls this proxy turns that into a clean ``EXIT_ENV`` (or a
            warning, for ``status``) instead of a traceback.
    """
    from sync.scheduler import get_scheduler as _get_scheduler

    return _get_scheduler(cfg)


# ---------------------------------------------------------------------------
# Pretty-printing helpers (stdlib only).
# ---------------------------------------------------------------------------

def _heading(text: str) -> None:
    """Print *text* as a section heading underlined with dashes (stdout)."""
    print(f"\n{text}\n{'-' * len(text)}")


def _kv(key: str, value: object) -> None:
    """Print one dot-padded ``key value`` row (stdout)."""
    print(f" {key:.<24} {value}")


def _warn(text: str) -> None:
    """Print a ``[WARN]`` line to stderr."""
    print(f" [WARN] {text}", file=sys.stderr)


def _err(text: str) -> None:
    """Print an ``[ERR ]`` line to stderr."""
    print(f" [ERR ] {text}", file=sys.stderr)


def _ok(text: str) -> None:
    """Print an ``[OK ]`` line to stdout."""
    print(f" [OK ] {text}")


def _err_details(exc: Any) -> None:
    """Print every entry of ``exc.details`` (if any) as its own ``[ERR ]`` line."""
    for key, value in (getattr(exc, "details", None) or {}).items():
        _err(f" {key}: {value}")


# ---------------------------------------------------------------------------
# Subcommand handlers
# ---------------------------------------------------------------------------

def cmd_setup(args: argparse.Namespace) -> int:
    """First-time setup: config, version, filters, OAuth hint, optional schedule.

    Returns ``EXIT_OK`` on success, or ``EXIT_ENV`` when the config is invalid,
    rclone is missing / too old, the filter file cannot be written, or the
    requested scheduling step fails.
    """
    _heading("CyClaw Dropbox Sync -- Setup")

    try:
        cfg = load_sync_config(args.config)
    except SyncConfigError as exc:
        _err(f"Config error: {exc.message}")
        _err_details(exc)
        return EXIT_ENV
    _ok(f"Loaded sync config from {args.config}")
    _kv("local_path", cfg.local_path)
    _kv("remote", cfg.remote)
    _kv("direction", cfg.direction)
    _kv("include_soul", cfg.include_soul)

    # The loader collects unrecognised sync: keys without failing; surface them
    # here so a typo in config.yaml is visible instead of silently ignored.
    for key in getattr(cfg, "_unknown_keys", None) or ():
        _warn(f"unknown_keys: sync.{key} is not a recognised setting and was ignored (typo?)")

    if cfg.include_soul:
        _warn("include_soul=true -- soul.md and cyclaw_soul.db WILL be mirrored.")
        _warn("This bypasses the POST /soul/apply governance path. Confirm this is intentional.")

    try:
        v = check_rclone_version()
    except (RcloneNotInstalledError, RcloneVersionError) as exc:
        _err(exc.message)
        _err_details(exc)
        return EXIT_ENV
    _ok(f"rclone {v[0]}.{v[1]}.{v[2]} installed")

    try:
        path = write_filter_file(cfg)
    except OSError as exc:
        _err(f"Could not write filter file: {exc}")
        return EXIT_ENV
    _ok(f"Wrote filter file: {path}")

    _heading("Next: configure the Dropbox remote")
    print(textwrap.dedent(f"""
        1. Run: rclone config
        2. Choose: n (new remote)
        3. Name: {cfg.remote_name}
        4. Storage: dropbox
        5. Accept defaults; complete the browser OAuth flow.
        6. Verify: rclone lsd {cfg.remote_name}:
    """).strip())

    if args.schedule:
        try:
            entry = get_scheduler(cfg).install()
        except SchedulerError as exc:
            _err(f"Scheduling failed: {exc.message}")
            return EXIT_ENV
        except ImportError as exc:
            # Lazy scheduler import failed -- report it like any other
            # environment problem rather than crashing with a traceback.
            _err(f"Scheduling failed: scheduler module unavailable ({exc})")
            return EXIT_ENV
        _ok(f"Scheduled daily sync ({entry.cron_or_time})")

    return EXIT_OK


def cmd_sync(args: argparse.Namespace) -> int:
    """Run one sync now.

    Returns ``EXIT_ENV`` for config / rclone-environment problems, ``EXIT_FAIL``
    for any other ``SyncError``, ``EXIT_OK`` when sync is disabled, and
    otherwise whatever ``reindex_exit_code_for`` derives from the run result.
    """
    try:
        cfg = load_sync_config(args.config)
    except SyncConfigError as exc:
        _err(f"Config error: {exc.message}")
        _err_details(exc)  # same detail lines cmd_setup prints
        return EXIT_ENV

    # Honour the config toggle: `sync.enabled: false` is an intentional "off",
    # not an error -- a scheduled run should no-op cleanly (exit 0), never fail.
    if not getattr(cfg, "enabled", True):
        _heading("Sync disabled")
        print(" sync.enabled is false in config.yaml; nothing to do.")
        print(" Set sync.enabled: true (or remove the key) to run sync.")
        return EXIT_OK

    try:
        result = run_sync(cfg, dry_run=args.dry_run, resync=args.resync)
    except (RcloneNotInstalledError, RcloneVersionError) as exc:
        _err(exc.message)
        _err_details(exc)
        return EXIT_ENV
    except SyncError as exc:
        _err(f"Sync error: {exc.message}")
        return EXIT_FAIL

    counts = result.event_counts()
    _heading("Sync complete" if result.success else "Sync FAILED")
    _kv("direction", result.direction)
    _kv("exit_code", result.rclone_exit_code)
    _kv("duration_sec", f"{result.duration_sec:.2f}")
    _kv("added", counts["added"])
    _kv("modified", counts["modified"])
    _kv("deleted", counts["deleted"])
    _kv("corpus_changed", result.corpus_changed)
    if result.errors:
        _kv("errors_n", len(result.errors))
        # Show at most five error lines, each capped at 200 characters.
        for line in result.errors[:5]:
            _err(line[:200])

    return reindex_exit_code_for(result, cfg)


def cmd_test(args: argparse.Namespace) -> int:
    """Run the pre-flight self-test; ``EXIT_OK`` only if every check passed."""
    from sync.selftest import run_self_test

    passed, total, lines = run_self_test(args.config, dry_run=True)
    _heading(f"Self-test: {passed}/{total} passed")
    for line in lines:
        print(line)
    return EXIT_OK if passed == total else EXIT_FAIL


def cmd_schedule(args: argparse.Namespace) -> int:
    """Register the daily scheduled job; ``EXIT_ENV`` on any failure."""
    try:
        cfg = load_sync_config(args.config)
        entry = get_scheduler(cfg).install()
    except (SyncConfigError, SchedulerError) as exc:
        _err(exc.message)
        return EXIT_ENV
    except ImportError as exc:
        _err(f"Scheduler module unavailable: {exc}")
        return EXIT_ENV
    _ok(f"Scheduled: {entry.cron_or_time} on {entry.platform_name}")
    return EXIT_OK


def cmd_unschedule(args: argparse.Namespace) -> int:
    """Remove the daily scheduled job; ``EXIT_ENV`` on any failure."""
    try:
        cfg = load_sync_config(args.config)
        removed = get_scheduler(cfg).remove()
    except (SyncConfigError, SchedulerError) as exc:
        _err(exc.message)
        return EXIT_ENV
    except ImportError as exc:
        _err(f"Scheduler module unavailable: {exc}")
        return EXIT_ENV
    if removed:
        _ok("Scheduled job removed.")
    else:
        _ok("No CyClaw scheduled job was registered.")
    return EXIT_OK


def cmd_status(args: argparse.Namespace) -> int:
    """Print the current sync configuration, rclone version and schedule state.

    Only an invalid config is fatal (``EXIT_ENV``). A missing / old rclone or an
    unreadable scheduler is reported on stderr and the command still returns
    ``EXIT_OK``.
    """
    try:
        cfg = load_sync_config(args.config)
    except SyncConfigError as exc:
        _err(exc.message)
        return EXIT_ENV

    _heading("CyClaw Sync Status")
    _kv("enabled", getattr(cfg, "enabled", True))
    _kv("local_path", cfg.local_path)
    _kv("remote", cfg.remote)
    _kv("direction", cfg.direction)
    _kv("include_soul", cfg.include_soul)
    _kv("schedule", f"{cfg.schedule_hour:02d}:{cfg.schedule_min:02d}")
    _kv("filter_file", cfg.filter_file)
    _kv("log_dir", cfg.log_dir)
    _kv("platform", platform.system())

    try:
        v = check_rclone_version()
        _ok(f"rclone {v[0]}.{v[1]}.{v[2]}")
    except (RcloneNotInstalledError, RcloneVersionError) as exc:
        _err(exc.message)

    try:
        entry = get_scheduler(cfg).status()
        if entry:
            _ok(f"Scheduled: {entry.cron_or_time}")
        else:
            print(" [-] Not scheduled.")
    except SchedulerError as exc:
        _warn(f"Could not read scheduler state: {exc.message}")
    except ImportError:
        # Scheduler module not available in this environment -- non-fatal for status.
        _warn("Scheduler module unavailable; schedule state not read.")

    fsummary = filter_summary(cfg)
    print()
    print(" Filter summary:")
    print(f" soul excluded: {fsummary['soul_excluded']}")
    print(f" total rules: {fsummary['total_rules']}")
    print(f" extra excludes: {len(fsummary['extra_excludes'])}")

    return EXIT_OK


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

def build_parser() -> argparse.ArgumentParser:
    """Build the ``python -m sync.cli`` parser with one sub-parser per command.

    Each sub-parser stores its handler in ``func`` so ``main`` can dispatch
    without a lookup table. ``--config`` is a top-level option and must precede
    the subcommand on the command line.
    """
    parser = argparse.ArgumentParser(
        prog="python -m sync.cli",
        description="CyClaw Dropbox sync -- rclone-based, audit-logged, out-of-band.",
    )
    parser.add_argument(
        "--config", default="config.yaml",
        help="Path to CyClaw config.yaml (default: %(default)s)",
    )

    sub = parser.add_subparsers(dest="cmd", required=True)

    p_setup = sub.add_parser("setup", help="Bootstrap: verify env, write filters, optional schedule.")
    p_setup.add_argument("--schedule", action="store_true", help="Also register the daily scheduled job.")
    p_setup.set_defaults(func=cmd_setup)

    p_sync = sub.add_parser("sync", help="Run one sync now.")
    p_sync.add_argument("--dry-run", action="store_true", help="Preview only; modify nothing.")
    p_sync.add_argument("--resync", action="store_true", help="bisync only: rebuild baseline state.")
    p_sync.set_defaults(func=cmd_sync)

    p_test = sub.add_parser("test", help="Run the pre-flight self-test.")
    p_test.set_defaults(func=cmd_test)

    p_sched = sub.add_parser("schedule", help="Register the daily scheduled job.")
    p_sched.set_defaults(func=cmd_schedule)

    p_unsched = sub.add_parser("unschedule", help="Remove the daily scheduled job.")
    p_unsched.set_defaults(func=cmd_unschedule)

    p_status = sub.add_parser("status", help="Print sync + schedule status.")
    p_status.set_defaults(func=cmd_status)

    return parser


def main(argv: list[str] | None = None) -> int:
    """Parse *argv* (``sys.argv[1:]`` when ``None``) and run the chosen handler.

    Returns the handler's exit code as an ``int``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    func = args.func
    return int(func(args))


if __name__ == "__main__":
    sys.exit(main())
DEFAULT_REMOTE_NAME = "dropbox_cyclaw"
DEFAULT_REMOTE_PATH = "CyClaw/corpus"
DEFAULT_DIRECTION = "pull"  # "pull" (safe default) | "bisync" (opt-in)
DEFAULT_SCHEDULE_HOUR = 2
DEFAULT_SCHEDULE_MIN = 0
DEFAULT_MAX_DELETE = 20
DEFAULT_MAX_TRANSFER = "1G"
DEFAULT_CONFLICT_RESOLVE = "newer"
DEFAULT_CONFLICT_LOSER = "rename"
DEFAULT_INCLUDE_SOUL = False
DEFAULT_REINDEX_ON_CHANGE = True  # exit 10 if corpus files changed
DEFAULT_CHECKSUM = True

# Validation constants.
_REMOTE_NAME_RE = re.compile(r"^[A-Za-z0-9_.-]+$")
# Shell metacharacters that must never appear in remote_path (defense in depth;
# we never use shell=True, but reject taint at the boundary anyway).
_SHELL_METACHARS = set(";|&$`<>(){}[]!*?\"'\\\n\r\t ")
_VALID_DIRECTIONS = ("pull", "bisync")
_VALID_CONFLICT_RESOLVE = ("newer", "older", "larger", "smaller", "none")
# Fields whose YAML value must be a real bool / int. A quoted "false" is a
# truthy string, and a float hour cannot be formatted with ``:02d``.
_BOOL_FIELDS = ("include_soul", "reindex_on_change", "checksum")
_INT_FIELDS = ("max_delete", "schedule_hour", "schedule_min")


def _default_rclone_state_dir() -> Path:
    """Return rclone's state directory, honouring XDG_CONFIG_HOME.

    ``$XDG_CONFIG_HOME/rclone`` when the variable is set and non-empty,
    otherwise ``~/.config/rclone``.
    """
    base = os.environ.get("XDG_CONFIG_HOME")
    if base:
        return Path(base) / "rclone"
    return Path.home() / ".config" / "rclone"


@dataclass
class RcloneConfig:
    """Parsed and validated sync: block from config.yaml.

    All validation runs in ``__post_init__`` and reports problems as
    ``SyncConfigError``, so constructing an instance either yields a usable
    config or fails with a message naming the offending field.
    """

    # Required (validated: resolved to an absolute path inside the repo's
    # data/corpus tree).
    local_path: str

    # Remote identity (rclone remote name and path inside Dropbox).
    remote_name: str = DEFAULT_REMOTE_NAME
    remote_path: str = DEFAULT_REMOTE_PATH

    # Sync behaviour.
    direction: str = DEFAULT_DIRECTION  # "pull" | "bisync"
    include_soul: bool = DEFAULT_INCLUDE_SOUL
    reindex_on_change: bool = DEFAULT_REINDEX_ON_CHANGE
    checksum: bool = DEFAULT_CHECKSUM

    # Safety fuses.
    max_delete: int = DEFAULT_MAX_DELETE
    max_transfer: str = DEFAULT_MAX_TRANSFER

    # bisync-only knobs (ignored when direction == "pull").
    conflict_resolve: str = DEFAULT_CONFLICT_RESOLVE
    conflict_loser: str = DEFAULT_CONFLICT_LOSER

    # Scheduling (cron / systemd / launchd / Task Scheduler).
    schedule_hour: int = DEFAULT_SCHEDULE_HOUR
    schedule_min: int = DEFAULT_SCHEDULE_MIN

    # File locations (defaults computed at load time, all overridable).
    workdir: str | None = None  # bisync state dir
    filter_file: str | None = None  # cyclaw_filters.txt path
    log_dir: str | None = None  # rclone log dir

    # Extra exclusions appended AFTER the built-in hardened defaults.
    extra_excludes: list[str] = field(default_factory=list)

    # Reindex exit code -- caller uses this to detect "corpus changed".
    REINDEX_EXIT_CODE: int = 10

    # --- Validation -------------------------------------------------------

    def __post_init__(self) -> None:
        self._validate_local_path()
        self._validate_remote_name()
        self._validate_remote_path()
        self._validate_scalar_types()
        self._normalise_extra_excludes()

        if self.direction not in _VALID_DIRECTIONS:
            raise SyncConfigError(
                f"sync.direction must be 'pull' or 'bisync', got: {self.direction!r}",
                details={"received": self.direction},
            )

        if self.max_delete < 0:
            raise SyncConfigError(
                f"sync.max_delete must be >= 0, got: {self.max_delete}",
                details={"received": self.max_delete},
            )

        if not 0 <= self.schedule_hour <= 23:
            raise SyncConfigError(
                f"sync.schedule_hour must be 0-23, got: {self.schedule_hour}",
                details={"received": self.schedule_hour},
            )

        if not 0 <= self.schedule_min <= 59:
            raise SyncConfigError(
                f"sync.schedule_min must be 0-59, got: {self.schedule_min}",
                details={"received": self.schedule_min},
            )

        if self.conflict_resolve not in _VALID_CONFLICT_RESOLVE:
            raise SyncConfigError(
                f"sync.conflict_resolve invalid: {self.conflict_resolve!r}",
                details={"valid": list(_VALID_CONFLICT_RESOLVE)},
            )

        self._fill_default_paths()

    def _validate_scalar_types(self) -> None:
        """Reject wrongly-typed bool / int settings before range checks run."""
        for name in _BOOL_FIELDS:
            value = getattr(self, name)
            if not isinstance(value, bool):
                raise SyncConfigError(
                    f"sync.{name} must be true or false, got: {value!r}",
                    details={"received": value},
                )
        for name in _INT_FIELDS:
            value = getattr(self, name)
            # bool is a subclass of int; `schedule_hour: true` is a typo, not 1.
            if isinstance(value, bool) or not isinstance(value, int):
                raise SyncConfigError(
                    f"sync.{name} must be an integer, got: {value!r}",
                    details={"received": value},
                )

    def _normalise_extra_excludes(self) -> None:
        """Coerce ``extra_excludes`` to a list of strings.

        A bare ``extra_excludes:`` key in YAML loads as ``None`` and a single
        scalar loads as ``str``; both are accepted and normalised so consumers
        can always iterate a list of patterns.
        """
        extras = self.extra_excludes
        if extras is None:
            extras = []
        elif isinstance(extras, str):
            extras = [extras]
        elif not isinstance(extras, (list, tuple)):
            raise SyncConfigError(
                f"sync.extra_excludes must be a list of patterns, got: {type(extras).__name__}",
                details={"received_type": type(extras).__name__},
            )
        not_text = [item for item in extras if not isinstance(item, str)]
        if not_text:
            raise SyncConfigError(
                "sync.extra_excludes entries must all be strings",
                details={"received": [repr(item) for item in not_text]},
            )
        self.extra_excludes = list(extras)

    def _validate_local_path(self) -> None:
        if not self.local_path:
            raise SyncConfigError(
                "sync.local_path is required",
                details={"hint": "Set sync.local_path to a path under the repo's data/corpus tree."},
            )
        try:
            raw = os.fspath(self.local_path)
        except TypeError:
            raw = None
        if not isinstance(raw, str):
            raise SyncConfigError(
                f"sync.local_path must be a path string, got: {self.local_path!r}",
                details={"received": repr(self.local_path)},
            )

        # Expand ~ and env vars early so downstream code does not have to.
        expanded = os.path.expanduser(os.path.expandvars(raw))

        # A relative value like "data/corpus" is resolved against the current
        # working directory (the scheduled job cd's into the repo root first);
        # an already-absolute path is kept.
        resolved = Path(expanded).resolve()

        # Repo root is two levels up from this file: sync/config.py -> repo/.
        repo_root = Path(__file__).resolve().parent.parent
        corpus_root = (repo_root / "data" / "corpus").resolve()

        # Must resolve to corpus_root itself or a path inside it. resolve()
        # collapses ".." and follows symlinks, so an escape via ".." or a
        # symlink cannot land inside corpus_root.
        if resolved != corpus_root and corpus_root not in resolved.parents:
            raise SyncConfigError(
                "sync.local_path must resolve to a path inside the repo's data/corpus tree",
                details={"resolved": str(resolved), "corpus_root": str(corpus_root)},
            )

        # After resolution, store an absolute path so callers never see a
        # relative value or an unresolved "..".
        self.local_path = str(resolved)

    def _validate_remote_name(self) -> None:
        if not isinstance(self.remote_name, str) or not _REMOTE_NAME_RE.match(self.remote_name):
            raise SyncConfigError(
                f"sync.remote_name must match ^[A-Za-z0-9_.-]+$, got: {self.remote_name!r}",
                details={"received": self.remote_name},
            )

    def _validate_remote_path(self) -> None:
        # YAML happily yields ints / None here; fail as a config error instead
        # of an AttributeError on .startswith().
        if not isinstance(self.remote_path, str):
            raise SyncConfigError(
                f"sync.remote_path must be a string, got: {self.remote_path!r}",
                details={"received": repr(self.remote_path)},
            )
        if self.remote_path.startswith("-"):
            raise SyncConfigError(
                f"sync.remote_path must not start with '-' (would be parsed as a flag): {self.remote_path!r}",
                details={"received": self.remote_path},
            )
        bad = sorted(_SHELL_METACHARS & set(self.remote_path))
        if bad:
            raise SyncConfigError(
                f"sync.remote_path contains forbidden characters: {bad!r}",
                details={"received": self.remote_path, "forbidden": bad},
            )

    def _fill_default_paths(self) -> None:
        """Default the three file locations under rclone's state dir, then expand ~ / $VARS."""
        state_dir = _default_rclone_state_dir()
        if self.workdir is None:
            self.workdir = str(state_dir / "bisync_state")
        if self.filter_file is None:
            self.filter_file = str(state_dir / "cyclaw_filters.txt")
        if self.log_dir is None:
            self.log_dir = str(state_dir / "logs")

        self.workdir = os.path.expanduser(os.path.expandvars(self.workdir))
        self.filter_file = os.path.expanduser(os.path.expandvars(self.filter_file))
        self.log_dir = os.path.expanduser(os.path.expandvars(self.log_dir))

    # --- Computed properties ---------------------------------------------

    @property
    def remote(self) -> str:
        """Combined remote spec for rclone, e.g. 'dropbox_cyclaw:CyClaw/corpus'."""
        return f"{self.remote_name}:{self.remote_path}"

    @property
    def log_path(self) -> str:
        """Full path to the active rclone log file."""
        return os.path.join(self.log_dir or "", "rclone_cyclaw.log")

    @property
    def is_windows(self) -> bool:
        """True when ``platform.system()`` reports Windows."""
        return platform.system() == "Windows"

    # --- Serialization ---------------------------------------------------

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict (``dataclasses.asdict``)."""
        return asdict(self)


def load_sync_config(config_path: str = "config.yaml") -> RcloneConfig:
    """Read config.yaml's sync: block and return a validated RcloneConfig.

    Loads through ``utils.logger._get_config`` (cached; tests reset via
    ``reset_config_cache``). Raises ``SyncConfigError`` if the block is absent,
    malformed, or any value fails validation. Unknown keys are collected on a
    non-fatal ``_unknown_keys`` attribute for typo visibility, and the
    ``enabled`` toggle is exposed as a plain ``enabled`` attribute (default
    ``True`` when the key is absent).
    """
    cfg = _get_config(config_path) or {}

    section = cfg.get("sync")
    if not section:
        raise SyncConfigError(
            "sync: block missing from config.yaml",
            details={
                "hint": "Append the sync: block to config.yaml. "
                "See docs/SYNC_README.md or sync/config.py for the schema."
            },
        )

    if not isinstance(section, dict):
        raise SyncConfigError(
            f"sync: block must be a mapping, got {type(section).__name__}",
            details={"received_type": type(section).__name__},
        )

    # Pass through only fields RcloneConfig knows about (excluding the constant
    # REINDEX_EXIT_CODE). Unknown keys are collected, not fatal.
    known_fields = {f for f in RcloneConfig.__dataclass_fields__ if f != "REINDEX_EXIT_CODE"}
    unknown = set(section.keys()) - known_fields
    # "enabled" is CyClaw's own on/off toggle, not an rclone parameter, so it is
    # not an RcloneConfig field and not a typo. It is read out here and enforced
    # by the CLI (``cmd_sync`` no-ops when false); drop it from the unknown set.
    unknown.discard("enabled")
    kwargs = {k: v for k, v in section.items() if k in known_fields}

    try:
        rc = RcloneConfig(**kwargs)
    except TypeError as exc:
        raise SyncConfigError(
            f"sync: block invalid: {exc}",
            details={"unknown_keys": sorted(unknown, key=str)},
        ) from exc

    # Default to enabled when the key is absent (a present sync: block is opt-in
    # already). Stored as a plain attribute, not a dataclass field, to keep it
    # out of the rclone-parameter surface (to_dict / argv).
    rc.enabled = bool(section.get("enabled", True))  # type: ignore[attr-defined]
    rc._unknown_keys = sorted(unknown, key=str)  # type: ignore[attr-defined]
    return rc
# The single soul-layer rule -- conditionally dropped when include_soul=true.
_SOUL_RULE = "- data/personality/**"

# Built-in hardened exclusions. Order matters -- most-specific first.
_HARDENED_EXCLUDES: list[str] = [
    # 1. Soul layer -- governed via POST /soul/apply only (opt-in to sync).
    _SOUL_RULE,
    # 2. AI model weights.
    "- *.gguf",
    "- *.bin",
    "- *.safetensors",
    "- *.onnx",
    "- *.pt",
    "- *.pth",
    # 3. Vector store + keyword index + embedding cache (rebuildable).
    "- index/**",
    "- .emb_cache/**",
    "- .chroma/**",
    # 4. Python and virtualenv noise.
    "- venv/**",
    "- .venv/**",
    "- env/**",
    "- __pycache__/**",
    "- *.pyc",
    "- *.pyo",
    "- *.egg-info/**",
    # 5. Local forensic data -- never share across machines.
    "- logs/**",
    "- *.log",
    "- *.jsonl",
    # 6. Per-host secrets and credentials -- defense in depth.
    "- .env",
    "- .env.*",
    "- *.env",
    "- *.pem",
    "- *.key",
    "- *_secret*",
    "- credentials*",
    # 7. Governed soul DB state.
    "- *.db",
    "- *.db-wal",
    "- *.db-shm",
    # 8. VCS metadata (use git push/pull).
    "- .git/**",
    "- .gitignore",
    # 9. OS / editor metadata.
    "- .DS_Store",
    "- Thumbs.db",
    "- desktop.ini",
    "- *.swp",
    "- .idea/**",
    "- .vscode/**",
    # 10. rclone state (avoid recursive sync of state into Dropbox).
    "- .rclone-state/**",
    "- *.rclone.lst",
    "- *.rclone.lst-new",
    "- *.rclone.lst-prev",
]

_HEADER = """# CyClaw rclone filter file
# Auto-generated by sync.filters -- DO NOT EDIT BY HAND.
# To customise: add `extra_excludes` to the sync: block in config.yaml.
# Format: rclone filter rules -- '-' excludes, '+' includes, first match wins.
# https://rclone.org/filtering/
"""

_SOUL_WARNING = [
    "# WARNING: include_soul=true -- data/personality/ IS being synced.",
    "# This bypasses POST /soul/apply governance. Stale soul.md on a",
    "# second machine can silently clobber an evolved one. See config.yaml.",
]

# Written in place of a user-supplied "!" entry (see generate_filters).
_RESET_IGNORED_NOTE = (
    "# (ignored '!' entry from extra_excludes: it would clear the hardened rules above)"
)


def generate_filters(cfg: RcloneConfig) -> str:
    """Return the full text content of cyclaw_filters.txt for the given config.

    Layout: header, optional include_soul warning, the hardened exclusions
    (minus the soul rule when ``cfg.include_soul`` is true), then any
    ``cfg.extra_excludes`` entries.

    Extra entries are sanitised before being written:
      * each entry is split on line breaks, so one entry can never smuggle
        additional rule lines into the file;
      * a bare ``!`` is dropped (a comment is written instead) because rclone
        treats it as "clear all previous rules", which would discard the
        hardened block;
      * anything not already starting with ``- `` or ``+ `` is turned into an
        exclude rule by prefixing ``- ``.
    """
    lines: list[str] = [_HEADER.rstrip(), ""]

    excludes = list(_HARDENED_EXCLUDES)
    if cfg.include_soul:
        # Drop the data/personality/** rule (and only that one) so the soul
        # layer is mirrored. Loud, opt-in, discouraged.
        excludes = [rule for rule in excludes if rule != _SOUL_RULE]
        lines.extend(_SOUL_WARNING)
        lines.append("")

    lines.extend(excludes)

    # User-supplied extras come AFTER the hardened block so users can tighten
    # further but cannot re-include something already excluded (first match
    # wins, and the hardened rule has already matched).
    if cfg.extra_excludes:
        lines.append("")
        lines.append("# --- extra_excludes from config.yaml ---")
        for raw in cfg.extra_excludes:
            for piece in raw.splitlines():
                pat = piece.strip()
                if not pat:
                    continue
                if pat == "!":
                    # A reset directive would void every rule written above.
                    lines.append(_RESET_IGNORED_NOTE)
                    continue
                # Tolerate entries written without a leading '- '.
                if not pat.startswith(("- ", "+ ")):
                    pat = f"- {pat}"
                lines.append(pat)

    return "\n".join(lines) + "\n"


def write_filter_file(cfg: RcloneConfig) -> str:
    """Generate cyclaw_filters.txt at cfg.filter_file; return the absolute path.

    Parent directories are created as needed and the file is overwritten on
    every call. ``OSError`` from the filesystem propagates to the caller.
    """
    target = Path(cfg.filter_file or "")
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(generate_filters(cfg), encoding="utf-8")
    return str(target.resolve())


def filter_summary(cfg: RcloneConfig) -> dict:
    """Return a small dict describing the active filter posture (for status/test).

    ``total_rules`` counts the non-blank, non-comment lines of the generated
    filter text, i.e. the rules rclone will actually evaluate.
    """
    rules = [
        line
        for line in generate_filters(cfg).splitlines()
        if line and not line.startswith("#")
    ]
    return {
        "soul_excluded": not cfg.include_soul,
        "include_soul": cfg.include_soul,
        "total_rules": len(rules),
        "filter_file": cfg.filter_file,
        "extra_excludes": list(cfg.extra_excludes),
    }
# ---------------------------------------------------------------------------
# Version handling
# ---------------------------------------------------------------------------

MIN_RCLONE_MAJOR, MIN_RCLONE_MINOR, MIN_RCLONE_PATCH = 1, 68, 2

_RCLONE_VERSION_RE = re.compile(r"rclone\s+v?(\d+)\.(\d+)(?:\.(\d+))?", re.IGNORECASE)


def check_rclone_version(rclone_bin: str = "rclone") -> tuple[int, int, int]:
    """Confirm rclone is installed and >= 1.68.2.

    Returns the parsed ``(major, minor, patch)`` tuple; a missing patch
    component is treated as 0.

    Raises:
        RcloneNotInstalledError: the binary is not on PATH, the version probe
            timed out, or the binary could not be executed at all.
        RcloneVersionError: the version output could not be parsed, or the
            version is below the CVE-2024-52522 floor.
    """
    binary = shutil.which(rclone_bin)
    if binary is None:
        raise RcloneNotInstalledError(
            "rclone binary not found on PATH",
            details={
                "looked_for": rclone_bin,
                "install_hint_linux": "curl https://rclone.org/install.sh | sudo bash",
                "install_hint_windows": "winget install Rclone.Rclone",
            },
        )

    try:
        # argv list; binary is an absolute path from shutil.which; no shell.
        result = subprocess.run(  # noqa: S603 -- argv list, absolute binary, no shell
            [binary, "version"],
            capture_output=True,
            text=True,
            timeout=10,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        raise RcloneNotInstalledError(
            f"rclone version check timed out: {exc}",
            details={"binary": binary},
        ) from exc
    except OSError as exc:
        # The file exists but cannot be launched (permissions, wrong
        # architecture, removed since the lookup). Keep the documented
        # exception contract instead of leaking a raw OSError.
        raise RcloneNotInstalledError(
            f"rclone binary could not be executed: {type(exc).__name__}",
            details={"binary": binary},
        ) from exc

    # rclone prints the version on stdout; stderr is scanned too as a fallback.
    output = (result.stdout or "") + (result.stderr or "")
    match = _RCLONE_VERSION_RE.search(output)
    if not match:
        raise RcloneVersionError(
            "Could not parse rclone version output",
            details={"binary": binary, "output": output[:500]},
        )

    major = int(match.group(1))
    minor = int(match.group(2))
    patch = int(match.group(3)) if match.group(3) else 0

    # Tuple comparison is lexicographic, which is exactly semver ordering here.
    if (major, minor, patch) < (MIN_RCLONE_MAJOR, MIN_RCLONE_MINOR, MIN_RCLONE_PATCH):
        raise RcloneVersionError(
            f"rclone {major}.{minor}.{patch} is too old; need >= "
            f"{MIN_RCLONE_MAJOR}.{MIN_RCLONE_MINOR}.{MIN_RCLONE_PATCH} "
            "(CVE-2024-52522 fix)",
            details={
                "found": f"{major}.{minor}.{patch}",
                "required": f">={MIN_RCLONE_MAJOR}.{MIN_RCLONE_MINOR}.{MIN_RCLONE_PATCH}",
                "binary": binary,
            },
        )

    return (major, minor, patch)


# ---------------------------------------------------------------------------
# Log parsing
# ---------------------------------------------------------------------------

# rclone's default log verbs are stable; we anchor on the trailing verb only:
#   2026/05/21 02:10:01 INFO : file.md: Copied (new)
#   2026/05/21 02:10:01 INFO : file.md: Copied (replaced existing)
#   2026/05/21 02:10:01 INFO : file.md: Deleted
# Group 1 of each file pattern is the logged path (it may not contain ':').
_LOG_ADDED_RE = re.compile(r":\s*([^:]+?):\s*Copied\s*\(new\)", re.IGNORECASE)
_LOG_MODIFIED_RE = re.compile(
    r":\s*([^:]+?):\s*(?:Copied \(replaced existing\)|Updated modification time)",
    re.IGNORECASE,
)
_LOG_DELETED_RE = re.compile(r":\s*([^:]+?):\s*Deleted(?:\s|$)", re.IGNORECASE)
_LOG_ERROR_RE = re.compile(r"\bERROR\b\s*:\s*(.+)", re.IGNORECASE)
# rclone's own scratch / state artifacts that may appear in a log line but are
# NOT corpus content. They should never trip the "corpus changed -> reindex"
# signal even if one ever leaks past the filter file.
_RCLONE_INTERNAL_PREFIXES = (".rclone-", "bisync-", ".tmp-", "RCLONE_TEST")


@dataclass
class FileEvent:
    """A single per-file event derived from the rclone log."""

    kind: str  # "added" | "modified" | "deleted"
    path: str  # path as logged by rclone, i.e. relative to the sync destination
    sha256: str | None = None  # populated by hash_changed_files(); None for deletions

    def to_audit_dict(self, base: str) -> dict:
        """Return the audit-log row for this event; ``base`` is the sync root."""
        # Use a "file" key, never "query": audit_log() SHA-256-hashes any field
        # named "query" and we want this path to stay human-readable.
        return {
            "event": f"sync_file_{self.kind}",
            "file": self.path,
            "sha256": self.sha256 or "",
            "base": base,
        }


@dataclass
class SyncResult:
    """Outcome of a single sync run."""

    success: bool
    direction: str  # "pull" | "bisync" | "dry-run"
    started_at: float  # epoch seconds
    finished_at: float  # epoch seconds
    rclone_exit_code: int
    events: list[FileEvent] = field(default_factory=list)
    errors: list[str] = field(default_factory=list)
    log_path: str | None = None
    aborted_for_safety: bool = False  # True if --max-delete / --max-transfer tripped
    dry_run: bool = False
    corpus_changed: bool = False  # True if any non-internal file event was parsed

    @property
    def duration_sec(self) -> float:
        """Wall-clock duration in seconds, clamped at zero."""
        return max(0.0, self.finished_at - self.started_at)

    def event_counts(self) -> dict:
        """Return per-kind event counts; the three known kinds are always present."""
        counts = {"added": 0, "modified": 0, "deleted": 0}
        for ev in self.events:
            counts[ev.kind] = counts.get(ev.kind, 0) + 1
        return counts

    def to_audit_dict(self) -> dict:
        """Return the summary audit row (``sync_completed`` or ``sync_failed``)."""
        return {
            "event": "sync_completed" if self.success else "sync_failed",
            "direction": self.direction,
            "duration_sec": round(self.duration_sec, 3),
            "rclone_exit_code": self.rclone_exit_code,
            "counts": self.event_counts(),
            "errors_n": len(self.errors),
            "aborted_for_safety": self.aborted_for_safety,
            "dry_run": self.dry_run,
            "corpus_changed": self.corpus_changed,
        }


def parse_log(log_path: str) -> tuple[list[FileEvent], list[str]]:
    """Parse an rclone log file into ``(events, errors)``.

    Tolerant: any line that does not match a known pattern is ignored. Errors
    are captured as raw strings. A missing log file yields two empty lists.

    Scope note: the regexes target the ``rclone copy`` execution verbs
    (``Copied (new)``, ``Copied (replaced existing)``, ``Deleted``). ``rclone
    bisync`` also emits these verbs during its execution phase, so deletions and
    copies are still captured for bisync. Its *pre-sync* structured-diff lines
    (``- Path1 File is new - file.md``) are intentionally not parsed: bisync
    is opt-in/discouraged here, and the execution-phase verbs are sufficient to
    derive ``corpus_changed``. Per-file event counts for bisync may therefore be
    a lower bound; rely on the run's exit status, not the count, for bisync.
    """
    events: list[FileEvent] = []
    errors: list[str] = []

    # Checked in this order; the first pattern that matches a line wins.
    classifiers = (
        ("added", _LOG_ADDED_RE),
        ("modified", _LOG_MODIFIED_RE),
        ("deleted", _LOG_DELETED_RE),
    )

    try:
        with open(log_path, encoding="utf-8", errors="replace") as f:
            for line in f:
                for kind, pattern in classifiers:
                    m = pattern.search(line)
                    if m:
                        events.append(FileEvent(kind=kind, path=m.group(1).strip()))
                        break
                else:
                    # Not a file event -- see whether it is an ERROR line.
                    m = _LOG_ERROR_RE.search(line)
                    if m:
                        errors.append(m.group(1).strip())
    except FileNotFoundError:
        # No log file == nothing happened. Caller decides whether that is an error.
        pass

    return events, errors


def hash_changed_files(events: Sequence[FileEvent], local_root: str) -> list[FileEvent]:
    """Populate ``FileEvent.sha256`` for added/modified entries still on disk.

    Deleted files keep ``sha256=None`` (the bytes are gone -- nothing to hash).
    Files that cannot be opened or read are also left as ``None``. Hashing is
    streamed in 64 KiB chunks with stdlib ``hashlib.sha256``. Returns a NEW
    list; the input events are not mutated.
    """
    out: list[FileEvent] = []
    for ev in events:
        if ev.kind == "deleted":
            out.append(ev)
            continue
        abs_path = os.path.join(local_root, ev.path)
        digest = hashlib.sha256()
        try:
            with open(abs_path, "rb") as f:
                while chunk := f.read(65536):
                    digest.update(chunk)
        except OSError:
            out.append(ev)
        else:
            out.append(dataclasses.replace(ev, sha256=digest.hexdigest()))
    return out


# ---------------------------------------------------------------------------
# Argv builders -- always lists; every element is a fixed flag or validated cfg.
# ---------------------------------------------------------------------------

def _common_args(cfg: RcloneConfig, log_path: str) -> list[str]:
    """Args shared between ``copy`` and ``bisync``.

    Note: ``--max-delete`` is intentionally excluded here. ``rclone copy``
    never deletes destination files, so the flag would be a no-op for pull
    mode. It is added only in ``build_bisync_argv`` where deletions can occur.
    """
    args: list[str] = [
        "--filter-from", cfg.filter_file or "",
        f"--max-transfer={cfg.max_transfer}",
        "--check-first",
        "--log-file", log_path,
        "--log-level", "INFO",
    ]
    if cfg.checksum:
        args.append("--checksum")
    return args


def build_pull_argv(
    cfg: RcloneConfig,
    dry_run: bool,
    log_path: str,
    rclone_bin: str = "rclone",
) -> list[str]:
    """Argv for one-way pull (Dropbox -> local). ``rclone copy`` never deletes."""
    argv = [
        rclone_bin, "copy",
        cfg.remote, cfg.local_path,
        *_common_args(cfg, log_path),
    ]
    if dry_run:
        argv.append("--dry-run")
    return argv


def build_bisync_argv(
    cfg: RcloneConfig,
    dry_run: bool,
    log_path: str,
    resync: bool = False,
    rclone_bin: str = "rclone",
) -> list[str]:
    """Argv for bidirectional ``bisync`` (opt-in, discouraged)."""
    argv = [
        rclone_bin, "bisync",
        cfg.remote, cfg.local_path,
        f"--conflict-resolve={cfg.conflict_resolve}",
        f"--conflict-loser={cfg.conflict_loser}",
        "--workdir", cfg.workdir or "",
        f"--max-delete={cfg.max_delete}",  # only meaningful where deletions occur
        *_common_args(cfg, log_path),
    ]
    if resync:
        argv.append("--resync")
    if dry_run:
        argv.append("--dry-run")
    return argv


# ---------------------------------------------------------------------------
# Public entry point
# ---------------------------------------------------------------------------

# Lower-cased substrings that identify a tripped safety fuse. The first two are
# the flag names rclone echoes in its copy/sync messages; "too many deletes" is
# the wording of bisync's --max-delete safety abort, which names no flag.
_SAFETY_ABORT_MARKERS = (
    "max-delete",
    "max-transfer",
    "max transfer limit reached",
    "too many deletes",
)


def _detect_safety_abort(errors: Sequence[str], stderr: str) -> bool:
    """True if rclone tripped the --max-delete or --max-transfer safety fuse.

    Scans the parsed log errors and the captured stderr (``None`` tolerated)
    case-insensitively for any of ``_SAFETY_ABORT_MARKERS``.
    """
    haystacks = [*errors, stderr or ""]
    return any(
        marker in text.lower()
        for text in haystacks
        for marker in _SAFETY_ABORT_MARKERS
    )


# A daily scheduled run and a manual run must never drive rclone against the
# same remote/destination at once: their log writes would interleave and corrupt
# parsing, and concurrent filter-file writes would race. ``os.mkdir`` is atomic
# on every platform, so it doubles as a zero-dependency, cross-platform lock --
# no fcntl/msvcrt branching, no third-party dep.
_LOCK_STALE_SEC = 3 * 60 * 60  # reclaim a lock left by a crashed run after 3h


def _acquire_sync_lock(lock_dir: str) -> None:
    """Acquire the single-instance lock, or raise ``SyncRuntimeError``.

    Reclaims a lock older than ``_LOCK_STALE_SEC`` (a prior run that crashed
    without releasing it) so a stale directory can never wedge sync forever.
    """
    try:
        os.mkdir(lock_dir)
        return
    except FileExistsError:
        pass
    try:
        age = time.time() - os.path.getmtime(lock_dir)
    except OSError:
        # Cannot stat the lock: treat it as fresh, i.e. as held by another run.
        age = 0.0
    if age > _LOCK_STALE_SEC:
        try:
            os.rmdir(lock_dir)
            os.mkdir(lock_dir)
            return
        except OSError:
            # Lost the reclaim race to another process; fall through and report.
            pass
    raise SyncRuntimeError(
        "Another CyClaw sync appears to be running",
        details={
            "lock_dir": lock_dir,
            "hint": "Wait for the other run to finish, or remove the lock dir if it is stale.",
        },
    )


def _release_sync_lock(lock_dir: str) -> None:
    """Release the single-instance lock; tolerant if it is already gone."""
    try:
        os.rmdir(lock_dir)
    except OSError:
        pass


def run_sync(
    cfg: RcloneConfig,
    dry_run: bool = False,
    resync: bool = False,
    rclone_bin: str = "rclone",
) -> SyncResult:
    """Execute one sync run end-to-end.

    1. Confirm rclone is installed and recent enough (>= 1.68.2).
    2. (Re)write the rclone filter file.
    3. Build argv for pull or bisync.
    4. Run rclone, capturing exit code and the log file.
    5. Parse the log into FileEvents.
    6. Hash any added/modified files under data/corpus/ for the audit row.
    7. Emit audit events: sync_started, sync_file_*, sync_completed | sync_failed.
    8. Return a SyncResult.

    Raises ``RcloneNotInstalledError`` / ``RcloneVersionError`` on environment
    failure, ``SyncRuntimeError`` on unexpected subprocess failure. Normal rclone
    non-zero exits (e.g. a tripped safety fuse) do NOT raise -- they return a
    ``SyncResult`` with ``success=False`` and ``aborted_for_safety=True`` so the
    caller can react cleanly.

    Security: argv is a list, the binary is absolute, ``shell`` is never set, and
    only metadata is logged -- never raw stderr that could echo a token.

    Concurrency: a process-wide single-instance lock (an atomically created lock
    directory under ``log_dir``) prevents a manual run and the scheduled run from
    driving rclone against the same remote at once. A second concurrent run
    raises ``SyncRuntimeError``; a lock left by a crashed run is reclaimed after
    ``_LOCK_STALE_SEC``.
    """
    check_rclone_version(rclone_bin)

    os.makedirs(cfg.log_dir or ".", exist_ok=True)
    lock_dir = os.path.join(cfg.log_dir or ".", "sync.lock.d")
    _acquire_sync_lock(lock_dir)
    try:
        return _run_sync_locked(cfg, dry_run, resync, rclone_bin)
    finally:
        _release_sync_lock(lock_dir)


def _run_sync_locked(
    cfg: RcloneConfig,
    dry_run: bool,
    resync: bool,
    rclone_bin: str,
) -> SyncResult:
    """Body of ``run_sync`` executed while holding the single-instance lock."""
    write_filter_file(cfg)

    log_path = cfg.log_path
    # Clear the previous run's log so parsing is unambiguous.
    try:
        if os.path.exists(log_path):
            os.remove(log_path)
    except OSError:
        # Non-fatal: rclone appends to the existing file. NOTE(review): in that
        # case parse_log() reads the whole file, including the earlier run.
        pass

    # Execute the absolute path, as the module contract promises. run_sync has
    # already verified the binary resolves; the fallback only covers it
    # vanishing in between, which subprocess then reports as FileNotFoundError.
    binary = shutil.which(rclone_bin) or rclone_bin

    if cfg.direction == "bisync":
        argv = build_bisync_argv(
            cfg, dry_run=dry_run, log_path=log_path, resync=resync, rclone_bin=binary
        )
    else:
        argv = build_pull_argv(cfg, dry_run=dry_run, log_path=log_path, rclone_bin=binary)

    started_at = time.time()
    audit_log({
        "event": "sync_started",
        "direction": cfg.direction,
        "dry_run": dry_run,
        "remote": cfg.remote,
        "local_path": cfg.local_path,
        "include_soul": cfg.include_soul,
    })

    try:
        # argv is a list of a fixed flag set + validated config; never shell=True.
        completed = subprocess.run(  # noqa: S603 -- argv list, validated inputs, no shell
            argv,
            capture_output=True,
            text=True,
            check=False,
        )
    except FileNotFoundError as exc:
        # rclone disappeared between the version check and now (race). Do not
        # include argv (would leak the remote spec into the error string).
        raise RcloneNotInstalledError(
            "rclone binary disappeared during execution",
            details={"direction": cfg.direction},
        ) from exc
    except (OSError, subprocess.SubprocessError) as exc:
        # Any other launch failure (e.g. permission denied) is reported through
        # the documented SyncRuntimeError rather than a raw OSError.
        raise SyncRuntimeError(
            f"rclone subprocess failed: {type(exc).__name__}",
            details={"direction": cfg.direction},
        ) from exc

    finished_at = time.time()
    exit_code = completed.returncode

    # Parse log -> events -> hash -> audit.
    events, errors = parse_log(log_path)
    events = hash_changed_files(events, cfg.local_path)

    aborted_for_safety = _detect_safety_abort(errors, completed.stderr or "")

    # rclone logs file paths RELATIVE TO THE TRANSFER ROOT (the destination
    # directory), not relative to the repo root -- e.g. "notes.md", never
    # "data/corpus/notes.md". Since cfg.local_path is validated to resolve under
    # data/corpus (RcloneConfig.__post_init__), every parsed file event is by
    # construction a corpus change. We still defensively skip rclone's own
    # scratch/state artifacts in case one ever slips past the filter file.
    corpus_changed = any(
        not ev.path.startswith(_RCLONE_INTERNAL_PREFIXES) for ev in events
    )

    result = SyncResult(
        success=(exit_code == 0),
        direction=("dry-run" if dry_run else cfg.direction),
        started_at=started_at,
        finished_at=finished_at,
        rclone_exit_code=exit_code,
        events=events,
        errors=errors,
        log_path=log_path,
        aborted_for_safety=aborted_for_safety,
        dry_run=dry_run,
        corpus_changed=corpus_changed,
    )

    # Per-file audit events -- one row per file, with sha256 when available.
    for ev in events:
        audit_log(ev.to_audit_dict(base=cfg.local_path))

    # Summary audit event -- sync_completed (success) or sync_failed (otherwise).
    audit_log(result.to_audit_dict())

    return result


def reindex_exit_code_for(result: SyncResult, cfg: RcloneConfig) -> int:
    """Return the CLI exit code to use after a sync run.

    0  -- sync OK, no corpus change (or reindex_on_change disabled).
    10 -- sync OK, corpus changed; caller should run ``python -m retrieval.indexer``.
    1  -- sync failed for safety reasons (--max-delete / --max-transfer tripped).
    2  -- sync failed for any other reason.
    """
    if not result.success:
        return 1 if result.aborted_for_safety else 2
    if cfg.reindex_on_change and result.corpus_changed:
        return cfg.REINDEX_EXIT_CODE
    return 0
TASK_TAG = "CYCLAW_DROPBOX_SYNC"
WINDOWS_TASK_NAME = "CyClaw Dropbox Sync"


@dataclass
class ScheduleEntry:
    """Description of a scheduled job, in platform-neutral form."""

    platform_name: str  # "linux", "darwin", "windows"
    command: str  # the actual command line that will be run
    cron_or_time: str  # cron expression OR HH:MM
    raw: str  # the raw line / schtasks output for debugging


def _python_executable() -> str:
    """Best guess at the python interpreter to invoke from the scheduler.

    Prefers ``sys.executable`` when it points at a real file, then
    ``python3`` / ``python`` from PATH, and finally the bare name ``python``.
    """
    candidate = sys.executable or "python"
    if candidate and os.path.isfile(candidate):
        return candidate
    found = shutil.which("python3") or shutil.which("python")
    return found or "python"


def _repo_root(cfg: RcloneConfig) -> str:
    """Directory the scheduled command should cd into before running sync.

    ``cfg.local_path`` resolves to ``.../data/corpus``; ``config.yaml`` lives at
    the repo root (the parent of ``data/``). So the repo root is two levels up
    from ``data/corpus``. This is a pure path computation: the filesystem is
    not probed and the presence of ``config.yaml`` is not checked. Keeping this
    simple and explicit avoids any taint or surprise about *where* config.yaml
    is resolved at run time.
    """
    corpus = os.path.abspath(cfg.local_path)
    repo_root = os.path.dirname(os.path.dirname(corpus))  # .../data/corpus -> repo
    return repo_root


def _sync_command(cfg: RcloneConfig) -> str:
    """The actual command the scheduler will invoke.

    cd into the repo root (so ``config.yaml`` resolves correctly), then run
    ``python -m sync.cli sync`` as a separate process.

    On Windows the scheduler does NOT register this string directly -- see
    ``_write_windows_launcher`` and ``WindowsTaskScheduler.install``. Passing a
    full ``cmd /c "cd /d "..." && "..." ..."`` string through ``schtasks /TR`` is
    quote-fragile (Task Scheduler re-parses it, and a repo path containing spaces
    can break the nesting). A ``.bat`` launcher sidesteps that entirely. This
    string is kept only for human-readable status output.
    """
    py = _python_executable()
    root = _repo_root(cfg)
    if platform.system() == "Windows":
        return f'cmd /c "cd /d "{root}" && "{py}" -m sync.cli sync"'
    return f'cd "{root}" && "{py}" -m sync.cli sync'


def _write_windows_launcher(cfg: RcloneConfig) -> str:
    """Write a ``.bat`` launcher for the scheduled sync and return its path.

    Registering a path to a one-line batch file via ``schtasks /TR`` avoids the
    fragile quoting of embedding a full ``cmd /c`` command string with a repo
    path that may contain spaces. The batch file itself uses ordinary quoting,
    which cmd.exe handles reliably. The file is written to ``cfg.log_dir``
    (or the repo root when ``log_dir`` is empty).
    """
    root = _repo_root(cfg)
    py = _python_executable()
    bat_dir = cfg.log_dir or root
    os.makedirs(bat_dir, exist_ok=True)
    bat_path = os.path.join(bat_dir, "cyclaw_sync.bat")
    # CRLF line endings + explicit quoting so paths with spaces are safe.
    content = (
        "@echo off\r\n"
        f'cd /d "{root}"\r\n'
        f'"{py}" -m sync.cli sync\r\n'
    )
    # newline="" stops Python from translating the explicit \r\n again.
    with open(bat_path, "w", encoding="utf-8", newline="") as f:
        f.write(content)
    return bat_path


def _quote_task_run_path(path: str) -> str:
    """Return ``path`` wrapped in literal double quotes if it contains whitespace.

    ``schtasks /TR`` stores the task-run string verbatim and Task Scheduler
    splits it on whitespace when the task fires, so a launcher path with spaces
    must carry its own quotes. Already-quoted and space-free paths are returned
    unchanged.
    """
    if path.startswith('"') or not any(ch.isspace() for ch in path):
        return path
    return f'"{path}"'


# ---------------------------------------------------------------------------
# Linux / macOS -- cron
# ---------------------------------------------------------------------------


class CronScheduler:
    """Manage a single CyClaw cron entry via ``crontab -l`` / ``crontab -``."""

    def __init__(self, cfg: RcloneConfig) -> None:
        self.cfg = cfg

    @staticmethod
    def _crontab_bin() -> str:
        """Return the absolute path of ``crontab`` or raise ``SchedulerError``."""
        path = shutil.which("crontab")
        if not path:
            raise SchedulerError(
                "crontab not available on this system",
                details={"hint": "Install cron, or schedule via a systemd --user timer manually."},
            )
        return path

    # crontab interactions: avoid -e (editor); use stdin piping.
    def _read_crontab(self) -> str:
        """Return the current user's crontab text ("" when there is none)."""
        crontab = self._crontab_bin()
        try:
            result = subprocess.run(  # noqa: S603  # argv list, crontab resolved via shutil.which
                [crontab, "-l"],
                capture_output=True,
                text=True,
                timeout=10,
                check=False,
            )
        except FileNotFoundError as exc:
            raise SchedulerError(
                "crontab not available on this system",
                details={"hint": "Install cron or schedule via a systemd --user timer manually."},
            ) from exc
        except subprocess.SubprocessError as exc:
            # Includes TimeoutExpired from the 10 s limit above.
            raise SchedulerError(f"crontab -l failed: {type(exc).__name__}") from exc
        # `crontab -l` returns 1 when the user has no crontab -- not an error.
        if result.returncode not in (0, 1):
            raise SchedulerError(
                f"crontab -l failed (rc={result.returncode}): {result.stderr.strip()}",
                details={"stderr": result.stderr[:500]},
            )
        return result.stdout or ""

    def _write_crontab(self, content: str) -> None:
        """Replace the user's crontab with ``content`` (piped to ``crontab -``)."""
        crontab = self._crontab_bin()
        try:
            proc = subprocess.run(  # noqa: S603  # argv list, crontab resolved via shutil.which
                [crontab, "-"],
                input=content,
                text=True,
                capture_output=True,
                timeout=10,
                check=False,
            )
        except FileNotFoundError as exc:
            raise SchedulerError("crontab binary not available") from exc
        except subprocess.SubprocessError as exc:
            # Includes TimeoutExpired from the 10 s limit above.
            raise SchedulerError(f"crontab write failed: {type(exc).__name__}") from exc
        if proc.returncode != 0:
            raise SchedulerError(
                f"crontab write failed (rc={proc.returncode}): {proc.stderr.strip()}",
                details={"stderr": proc.stderr[:500]},
            )

    def _our_line(self) -> str:
        """The single tagged cron line we want active."""
        cmd = _sync_command(self.cfg)
        return f"{self.cfg.schedule_min} {self.cfg.schedule_hour} * * * {cmd} # {TASK_TAG}"

    def install(self) -> ScheduleEntry:
        """Add or replace the CyClaw cron entry (idempotent)."""
        current = self._read_crontab().splitlines()
        # Strip any existing CyClaw entries (tagged with TASK_TAG), then append
        # exactly one fresh tagged line.
        filtered = [ln for ln in current if TASK_TAG not in ln]
        line = self._our_line()
        filtered.append(line)
        new_content = "\n".join(filtered) + "\n"
        self._write_crontab(new_content)
        return ScheduleEntry(
            platform_name=platform.system().lower(),
            command=_sync_command(self.cfg),
            cron_or_time=f"{self.cfg.schedule_min} {self.cfg.schedule_hour} * * *",
            raw=line,
        )

    def remove(self) -> bool:
        """Remove any CyClaw cron entries. Returns True if anything was removed."""
        current = self._read_crontab().splitlines()
        filtered = [ln for ln in current if TASK_TAG not in ln]
        if len(filtered) == len(current):
            return False
        new_content = "\n".join(filtered) + ("\n" if filtered else "")
        self._write_crontab(new_content)
        return True

    def status(self) -> ScheduleEntry | None:
        """Return the active entry if installed, else None."""
        for ln in self._read_crontab().splitlines():
            if TASK_TAG in ln:
                # Expected shape: "MIN HOUR * * * cmd # TAG"
                parts = ln.split(maxsplit=5)
                if len(parts) >= 6:
                    cron_expr = " ".join(parts[:5])
                    return ScheduleEntry(
                        platform_name=platform.system().lower(),
                        command=parts[5].rsplit("#", 1)[0].strip(),
                        cron_or_time=cron_expr,
                        raw=ln,
                    )
        return None


# ---------------------------------------------------------------------------
# Windows -- schtasks
# ---------------------------------------------------------------------------


class WindowsTaskScheduler:
    """Manage a single CyClaw task via schtasks.exe."""

    def __init__(self, cfg: RcloneConfig) -> None:
        self.cfg = cfg

    @staticmethod
    def _schtasks() -> str:
        """Return the absolute path of ``schtasks`` or raise ``SchedulerError``."""
        path = shutil.which("schtasks")
        if not path:
            raise SchedulerError(
                "schtasks.exe not available on PATH",
                details={"hint": "schtasks is a built-in Windows tool. Run on Windows, not WSL."},
            )
        return path

    def install(self) -> ScheduleEntry:
        """Create (or overwrite) the daily task that runs the ``.bat`` launcher."""
        # Register a .bat launcher path (robust) rather than an inline cmd /c
        # string (quote-fragile through schtasks /TR for paths with spaces).
        launcher = _write_windows_launcher(self.cfg)
        time_str = f"{self.cfg.schedule_hour:02d}:{self.cfg.schedule_min:02d}"
        argv = [
            self._schtasks(),
            "/Create",
            "/TN",
            WINDOWS_TASK_NAME,
            "/TR",
            # Embedded quotes keep a launcher path with spaces in one piece.
            _quote_task_run_path(launcher),
            "/SC",
            "DAILY",
            "/ST",
            time_str,
            "/F",  # force overwrite of an existing task with the same name
            "/RL",
            "LIMITED",
        ]
        try:
            proc = subprocess.run(  # noqa: S603  # argv list, schtasks resolved via shutil.which
                argv, capture_output=True, text=True, timeout=15, check=False
            )
        except subprocess.SubprocessError as exc:
            raise SchedulerError(f"schtasks /Create failed: {exc}") from exc
        if proc.returncode != 0:
            raise SchedulerError(
                f"schtasks /Create failed (rc={proc.returncode}): {proc.stderr.strip()}",
                details={"stderr": proc.stderr[:500]},
            )
        return ScheduleEntry(
            platform_name="windows",
            command=launcher,
            cron_or_time=time_str,
            raw=proc.stdout.strip(),
        )

    def remove(self) -> bool:
        """Delete the task. Returns True if it existed and was removed."""
        argv = [self._schtasks(), "/Delete", "/TN", WINDOWS_TASK_NAME, "/F"]
        try:
            proc = subprocess.run(  # noqa: S603  # argv list, schtasks resolved via shutil.which
                argv, capture_output=True, text=True, timeout=15, check=False
            )
        except subprocess.SubprocessError as exc:
            raise SchedulerError(f"schtasks /Delete failed: {exc}") from exc
        if proc.returncode == 0:
            return True
        # schtasks /Delete returns nonzero when the task didn't exist -- treat
        # "not found" as a no-op (False), never an error.
        combined = (proc.stdout + proc.stderr).lower()
        if "cannot find the file specified" in combined or "does not exist" in combined:
            return False
        # NOTE(review): rc == 1 is also treated as "nothing to remove" whatever
        # the message says (presumably to cope with localized output). That
        # would hide other rc == 1 failures too -- confirm this is intended.
        if proc.returncode == 1:
            return False
        raise SchedulerError(
            f"schtasks /Delete failed (rc={proc.returncode}): {proc.stderr.strip()}",
            details={"stderr": proc.stderr[:500]},
        )

    def status(self) -> ScheduleEntry | None:
        """Return the registered task if ``schtasks /Query`` finds it, else None."""
        argv = [self._schtasks(), "/Query", "/TN", WINDOWS_TASK_NAME, "/FO", "LIST"]
        try:
            proc = subprocess.run(  # noqa: S603  # argv list, schtasks resolved via shutil.which
                argv, capture_output=True, text=True, timeout=15, check=False
            )
        except subprocess.SubprocessError:
            return None
        if proc.returncode != 0:
            return None
        return ScheduleEntry(
            platform_name="windows",
            command=_sync_command(self.cfg),
            cron_or_time=f"{self.cfg.schedule_hour:02d}:{self.cfg.schedule_min:02d}",
            raw=proc.stdout.strip(),
        )


# ---------------------------------------------------------------------------
# Factory
# ---------------------------------------------------------------------------


def get_scheduler(cfg: RcloneConfig) -> CronScheduler | WindowsTaskScheduler:
    """Return the right scheduler for the current OS.

    linux/darwin -> CronScheduler; windows -> WindowsTaskScheduler; anything
    else raises SchedulerError.
    """
    sys_name = platform.system().lower()
    if sys_name == "windows":
        return WindowsTaskScheduler(cfg)
    if sys_name in ("linux", "darwin"):
        return CronScheduler(cfg)
    raise SchedulerError(
        f"Unsupported platform for scheduling: {sys_name}",
        details={"platform": sys_name, "supported": ["linux", "darwin", "windows"]},
    )
It is a fast, no-mocking smoke test runnable on +any machine to confirm the sync module will work in this environment. It +exercises the config loader, filter writer, version check, and the argv +builders, but it does NOT actually contact Dropbox. The dry-run reachability +check is best-effort and tolerates a missing rclone binary by reporting a +failed point rather than crashing. + +Use the pytest suite (``tests/test_sync_runner.py`` / ``tests/test_sync_cli.py``) +for mocked unit coverage. +""" + +from __future__ import annotations + +import os + +from sync.config import RcloneConfig, load_sync_config +from sync.filters import filter_summary, generate_filters, write_filter_file +from sync.runner import build_bisync_argv, build_pull_argv, check_rclone_version +from utils.errors import RcloneNotInstalledError, RcloneVersionError, SyncConfigError + + +def _ok(name: str) -> tuple[bool, str]: + return True, f" [OK ] {name}" + + +def _fail(name: str, reason: str) -> tuple[bool, str]: + return False, f" [FAIL] {name}: {reason}" + + +def _skip(name: str, reason: str) -> tuple[bool, str]: + # Skips count as PASS for the overall result -- they are environment-conditional. + return True, f" [SKIP] {name}: {reason}" + + +def run_self_test( + config_path: str = "config.yaml", + dry_run: bool = True, +) -> tuple[int, int, list[str]]: + """Run all pre-flight checks. Returns ``(passed, total, output_lines)``.""" + results: list[tuple[bool, str]] = [] + cfg: RcloneConfig + + # 1. config.yaml exists, parses, and validates. + try: + cfg = load_sync_config(config_path) + results.append(_ok("01. Config loads and validates")) + except SyncConfigError as exc: + results.append(_fail("01. Config loads and validates", exc.message)) + # Cannot continue without a config; the rest are skipped. + for n in range(2, 9): + results.append(_skip(f"{n:02d}. (skipped -- no config)", "config invalid")) + return _finalize(results) + + # 2. local_path is absolute (and ideally an existing directory). 
+ if os.path.isabs(cfg.local_path): + if os.path.isdir(cfg.local_path): + results.append(_ok("02. local_path is an existing directory")) + else: + results.append(_skip( + "02. local_path is an existing directory", + f"path does not yet exist: {cfg.local_path}", + )) + else: + results.append(_fail("02. local_path is absolute", f"got relative: {cfg.local_path}")) + + # 3. rclone installed and recent enough (floor 1.68.2). Tolerate absence. + try: + v = check_rclone_version() + results.append(_ok(f"03. rclone {v[0]}.{v[1]}.{v[2]} installed (>= 1.68.2)")) + except (RcloneNotInstalledError, RcloneVersionError) as exc: + results.append(_fail("03. rclone >= 1.68.2 installed", exc.message)) + + # 4. Filter content asserts the hardened soul exclusion (or its loud absence). + text = generate_filters(cfg) + soul_rule_present = "- data/personality/**" in text + if cfg.include_soul: + if not soul_rule_present: + results.append(_ok("04. include_soul=true and soul rule absent")) + else: + results.append(_fail( + "04. include_soul=true and soul rule absent", + "soul exclusion still present in filter", + )) + else: + if soul_rule_present: + results.append(_ok("04. data/personality/** excluded by default")) + else: + results.append(_fail( + "04. data/personality/** excluded by default", + "soul exclusion missing from filter", + )) + + # 5. Filter file can be written to disk. + try: + path = write_filter_file(cfg) + results.append(_ok(f"05. Wrote filter file: {path}")) + except OSError as exc: + results.append(_fail("05. Write filter file", str(exc))) + + # 6. Filter summary is consistent with the config. + summary = filter_summary(cfg) + if summary["soul_excluded"] == (not cfg.include_soul): + results.append(_ok("06. Filter summary consistent with config")) + else: + results.append(_fail("06. Filter summary consistent with config", f"got: {summary}")) + + # 7. Pull argv is well-formed (subprocess is NOT invoked here). 
+ try: + argv = build_pull_argv(cfg, dry_run=dry_run, log_path=cfg.log_path) + if isinstance(argv, list) and argv[1] == "copy" and cfg.remote in argv and cfg.local_path in argv: + results.append(_ok("07. Pull argv well-formed (list, no shell)")) + else: + results.append(_fail("07. Pull argv well-formed", f"unexpected argv: {argv[:6]}")) + except (OSError, ValueError) as exc: + results.append(_fail("07. Pull argv well-formed", str(exc))) + + # 8. Bisync argv is well-formed. + try: + argv = build_bisync_argv(cfg, dry_run=dry_run, log_path=cfg.log_path) + if isinstance(argv, list) and argv[1] == "bisync" and cfg.remote in argv and cfg.local_path in argv: + results.append(_ok("08. Bisync argv well-formed (list, no shell)")) + else: + results.append(_fail("08. Bisync argv well-formed", f"unexpected argv: {argv[:6]}")) + except (OSError, ValueError) as exc: + results.append(_fail("08. Bisync argv well-formed", str(exc))) + + return _finalize(results) + + +def _finalize(results: list[tuple[bool, str]]) -> tuple[int, int, list[str]]: + lines = [text for _, text in results] + passed = sum(1 for ok, _ in results if ok) + return passed, len(results), lines + + +if __name__ == "__main__": + p, t, out = run_self_test() + for ln in out: + print(ln) + print(f"\n{p}/{t} passed") + raise SystemExit(0 if p == t else 1) diff --git a/tests/test_sync_cli.py b/tests/test_sync_cli.py new file mode 100644 index 0000000..c1e78ad --- /dev/null +++ b/tests/test_sync_cli.py @@ -0,0 +1,213 @@ +"""Self-contained unit tests for sync.cli (no network, no real rclone/scheduler). + +Runnable with ``pytest --noconftest tests/test_sync_cli.py``. The runner and the +lazily-imported scheduler are patched on the ``sync.cli`` module so no subprocess +or real scheduler is touched. Asserts the documented exit-code contract (§7). 
+""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest + +from sync.cli import ( + EXIT_ENV, + EXIT_FAIL, + EXIT_OK, + EXIT_REINDEX, + EXIT_SAFETY, + main, +) +from sync.config import RcloneConfig +from utils.errors import RcloneNotInstalledError, SchedulerError, SyncConfigError +from utils.logger import reset_config_cache + + +@pytest.fixture(autouse=True) +def _reset_cache(): + reset_config_cache() + yield + reset_config_cache() + + +def _cfg() -> RcloneConfig: + corpus = __import__("pathlib").Path(__file__).resolve().parent.parent / "data" / "corpus" + return RcloneConfig(local_path=str(corpus)) + + +def _result(success=True, corpus_changed=False, aborted=False, exit_code=0): + r = MagicMock() + r.success = success + r.corpus_changed = corpus_changed + r.aborted_for_safety = aborted + r.rclone_exit_code = exit_code + r.direction = "pull" + r.duration_sec = 0.1 + r.errors = [] + r.event_counts.return_value = {"added": 0, "modified": 0, "deleted": 0} + return r + + +# --------------------------------------------------------------------------- +# sync subcommand -- exit codes 0 / 10 / 1 / 2 / 3 +# --------------------------------------------------------------------------- + +def test_sync_ok_exit_0(): + with patch("sync.cli.load_sync_config", return_value=_cfg()), \ + patch("sync.cli.run_sync", return_value=_result()), \ + patch("sync.cli.reindex_exit_code_for", return_value=EXIT_OK): + assert main(["sync"]) == EXIT_OK + + +def test_sync_corpus_changed_exit_10(): + with patch("sync.cli.load_sync_config", return_value=_cfg()), \ + patch("sync.cli.run_sync", return_value=_result(corpus_changed=True)), \ + patch("sync.cli.reindex_exit_code_for", return_value=EXIT_REINDEX): + assert main(["sync"]) == EXIT_REINDEX + + +def test_sync_safety_abort_exit_1(): + with patch("sync.cli.load_sync_config", return_value=_cfg()), \ + patch("sync.cli.run_sync", return_value=_result(success=False, aborted=True)), \ + 
patch("sync.cli.reindex_exit_code_for", return_value=EXIT_SAFETY): + assert main(["sync"]) == EXIT_SAFETY + + +def test_sync_other_failure_exit_2(): + with patch("sync.cli.load_sync_config", return_value=_cfg()), \ + patch("sync.cli.run_sync", return_value=_result(success=False)), \ + patch("sync.cli.reindex_exit_code_for", return_value=EXIT_FAIL): + assert main(["sync"]) == EXIT_FAIL + + +def test_sync_rclone_missing_exit_3(): + with patch("sync.cli.load_sync_config", return_value=_cfg()), \ + patch("sync.cli.run_sync", side_effect=RcloneNotInstalledError("nope")): + assert main(["sync"]) == EXIT_ENV + + +def test_sync_bad_config_exit_3(): + with patch("sync.cli.load_sync_config", side_effect=SyncConfigError("bad")): + assert main(["sync"]) == EXIT_ENV + + +def test_sync_disabled_noops_exit_0_without_running(): + # sync.enabled: false is an intentional off, not an error: cmd_sync must + # return EXIT_OK and never invoke run_sync. + cfg = _cfg() + cfg.enabled = False # set by load_sync_config in production + with patch("sync.cli.load_sync_config", return_value=cfg), \ + patch("sync.cli.run_sync") as mrun: + assert main(["sync"]) == EXIT_OK + mrun.assert_not_called() + + +def test_sync_enabled_true_runs(): + cfg = _cfg() + cfg.enabled = True + with patch("sync.cli.load_sync_config", return_value=cfg), \ + patch("sync.cli.run_sync", return_value=_result()) as mrun, \ + patch("sync.cli.reindex_exit_code_for", return_value=EXIT_OK): + assert main(["sync"]) == EXIT_OK + mrun.assert_called_once() + + +def test_sync_dry_run_passes_flag(): + captured = {} + + def fake_run(cfg, dry_run=False, resync=False): + captured["dry_run"] = dry_run + return _result() + + with patch("sync.cli.load_sync_config", return_value=_cfg()), \ + patch("sync.cli.run_sync", side_effect=fake_run), \ + patch("sync.cli.reindex_exit_code_for", return_value=EXIT_OK): + assert main(["sync", "--dry-run"]) == EXIT_OK + assert captured["dry_run"] is True + + +# 
--------------------------------------------------------------------------- +# status / test subcommands +# --------------------------------------------------------------------------- + +def test_status_ok_with_rclone_present(): + with patch("sync.cli.load_sync_config", return_value=_cfg()), \ + patch("sync.cli.check_rclone_version", return_value=(1, 70, 0)), \ + patch("sync.cli.get_scheduler") as mgs: + mgs.return_value.status.return_value = None + assert main(["status"]) == EXIT_OK + + +def test_status_rclone_missing_still_exit_0(): + # status reports rclone-missing but does not itself fail (env reporting view). + with patch("sync.cli.load_sync_config", return_value=_cfg()), \ + patch("sync.cli.check_rclone_version", side_effect=RcloneNotInstalledError("nope")), \ + patch("sync.cli.get_scheduler") as mgs: + mgs.return_value.status.return_value = None + assert main(["status"]) == EXIT_OK + + +def test_status_bad_config_exit_3(): + with patch("sync.cli.load_sync_config", side_effect=SyncConfigError("bad")): + assert main(["status"]) == EXIT_ENV + + +def test_test_subcommand_all_pass_exit_0(): + with patch("sync.selftest.run_self_test", return_value=(3, 3, ["ok"])): + assert main(["test"]) == EXIT_OK + + +def test_test_subcommand_some_fail_exit_2(): + with patch("sync.selftest.run_self_test", return_value=(2, 3, ["x"])): + assert main(["test"]) == EXIT_FAIL + + +# --------------------------------------------------------------------------- +# schedule / unschedule -- lazily-imported get_scheduler is patched on sync.cli +# --------------------------------------------------------------------------- + +def test_schedule_ok(): + entry = MagicMock(cron_or_time="0 2 * * *", platform_name="cron") + with patch("sync.cli.load_sync_config", return_value=_cfg()), \ + patch("sync.cli.get_scheduler") as mgs: + mgs.return_value.install.return_value = entry + assert main(["schedule"]) == EXIT_OK + + +def test_schedule_failure_exit_3(): + with patch("sync.cli.load_sync_config", 
return_value=_cfg()), \ + patch("sync.cli.get_scheduler") as mgs: + mgs.return_value.install.side_effect = SchedulerError("cron unavailable") + assert main(["schedule"]) == EXIT_ENV + + +def test_unschedule_ok(): + with patch("sync.cli.load_sync_config", return_value=_cfg()), \ + patch("sync.cli.get_scheduler") as mgs: + mgs.return_value.remove.return_value = True + assert main(["unschedule"]) == EXIT_OK + + +def test_setup_schedule_uses_lazy_scheduler(): + entry = MagicMock(cron_or_time="0 2 * * *", platform_name="cron") + with patch("sync.cli.load_sync_config", return_value=_cfg()), \ + patch("sync.cli.check_rclone_version", return_value=(1, 70, 0)), \ + patch("sync.cli.write_filter_file", return_value="/tmp/filters.txt"), \ + patch("sync.cli.get_scheduler") as mgs: + mgs.return_value.install.return_value = entry + assert main(["setup", "--schedule"]) == EXIT_OK + mgs.return_value.install.assert_called_once() + + +# --------------------------------------------------------------------------- +# Module imports without the scheduler present (B<->C decoupling) +# --------------------------------------------------------------------------- + +def test_cli_imports_without_scheduler(): + import importlib + + import sync.cli as cli_mod + + importlib.reload(cli_mod) + assert callable(cli_mod.main) diff --git a/tests/test_sync_config.py b/tests/test_sync_config.py new file mode 100644 index 0000000..f60123b --- /dev/null +++ b/tests/test_sync_config.py @@ -0,0 +1,179 @@ +"""Self-contained tests for sync.config (runnable under `pytest --noconftest`). + +No dependence on tests/conftest.py fixtures: uses the builtin tmp_path fixture +and resets the utils.logger config cache between tests so the temp config.yaml +is re-read each time. 
+""" + +from __future__ import annotations + +import os +from pathlib import Path + +import pytest +import yaml + +from sync.config import RcloneConfig, load_sync_config +from utils.errors import SyncConfigError +from utils.logger import reset_config_cache + +# Repo's data/corpus tree -- the only place local_path is allowed to resolve to. +REPO_ROOT = Path(__file__).resolve().parent.parent +CORPUS_ROOT = (REPO_ROOT / "data" / "corpus").resolve() + + +@pytest.fixture(autouse=True) +def _reset_cache(): + reset_config_cache() + yield + reset_config_cache() + + +def _write_config(tmp_path: Path, sync_block: dict) -> str: + cfg = {"logging": {"audit_file": str(tmp_path / "audit.jsonl")}, "sync": sync_block} + path = tmp_path / "config.yaml" + path.write_text(yaml.safe_dump(cfg), encoding="utf-8") + return str(path) + + +def _base_block(**overrides: object) -> dict: + block = { + "enabled": True, + "local_path": "data/corpus", + "remote_name": "dropbox_cyclaw", + "remote_path": "CyClaw/corpus", + "direction": "pull", + } + block.update(overrides) + return block + + +def test_valid_load(tmp_path: Path) -> None: + path = _write_config(tmp_path, _base_block()) + cfg = load_sync_config(path) + assert isinstance(cfg, RcloneConfig) + # Relative default resolves against repo root into the corpus tree. 
+ assert cfg.local_path == str(CORPUS_ROOT) + assert os.path.isabs(cfg.local_path) + assert cfg.remote == "dropbox_cyclaw:CyClaw/corpus" + assert cfg.REINDEX_EXIT_CODE == 10 + assert cfg.filter_file is not None and cfg.filter_file.endswith("cyclaw_filters.txt") + assert cfg.log_path.endswith("rclone_cyclaw.log") + + +def test_absolute_corpus_subdir_is_accepted(tmp_path: Path) -> None: + abs_path = str(CORPUS_ROOT / "sub") + cfg = load_sync_config(_write_config(tmp_path, _base_block(local_path=abs_path))) + assert cfg.local_path == abs_path + + +def test_rejects_relative_escape(tmp_path: Path) -> None: + path = _write_config(tmp_path, _base_block(local_path="data/corpus/../../etc")) + with pytest.raises(SyncConfigError) as exc: + load_sync_config(path) + assert exc.value.code == "SYNC_CONFIG_INVALID" + + +def test_rejects_path_outside_corpus(tmp_path: Path) -> None: + path = _write_config(tmp_path, _base_block(local_path="/tmp/not-corpus")) + with pytest.raises(SyncConfigError): + load_sync_config(path) + + +def test_rejects_repo_data_dir_not_under_corpus(tmp_path: Path) -> None: + # data/ (parent of corpus) must be rejected -- it is not inside corpus. 
+ path = _write_config(tmp_path, _base_block(local_path="data")) + with pytest.raises(SyncConfigError): + load_sync_config(path) + + +def test_rejects_bad_direction(tmp_path: Path) -> None: + path = _write_config(tmp_path, _base_block(direction="push")) + with pytest.raises(SyncConfigError): + load_sync_config(path) + + +def test_rejects_remote_name_with_metacharacters(tmp_path: Path) -> None: + path = _write_config(tmp_path, _base_block(remote_name="bad name; rm -rf")) + with pytest.raises(SyncConfigError): + load_sync_config(path) + + +def test_rejects_leading_dash_remote_path(tmp_path: Path) -> None: + path = _write_config(tmp_path, _base_block(remote_path="--config=evil")) + with pytest.raises(SyncConfigError): + load_sync_config(path) + + +def test_rejects_remote_path_with_shell_metachars(tmp_path: Path) -> None: + path = _write_config(tmp_path, _base_block(remote_path="corpus; touch pwned")) + with pytest.raises(SyncConfigError): + load_sync_config(path) + + +def test_rejects_out_of_range_schedule_hour(tmp_path: Path) -> None: + path = _write_config(tmp_path, _base_block(schedule_hour=24)) + with pytest.raises(SyncConfigError): + load_sync_config(path) + + +def test_rejects_out_of_range_schedule_min(tmp_path: Path) -> None: + path = _write_config(tmp_path, _base_block(schedule_min=60)) + with pytest.raises(SyncConfigError): + load_sync_config(path) + + +def test_rejects_negative_max_delete(tmp_path: Path) -> None: + path = _write_config(tmp_path, _base_block(max_delete=-1)) + with pytest.raises(SyncConfigError): + load_sync_config(path) + + +def test_rejects_bad_conflict_resolve(tmp_path: Path) -> None: + path = _write_config(tmp_path, _base_block(conflict_resolve="random")) + with pytest.raises(SyncConfigError): + load_sync_config(path) + + +def test_missing_block_raises(tmp_path: Path) -> None: + cfg = {"logging": {"audit_file": str(tmp_path / "audit.jsonl")}} + path = tmp_path / "config.yaml" + path.write_text(yaml.safe_dump(cfg), encoding="utf-8") + with 
pytest.raises(SyncConfigError): + load_sync_config(str(path)) + + +def test_unknown_keys_collected_not_fatal(tmp_path: Path) -> None: + path = _write_config(tmp_path, _base_block(typo_field="oops", another="x")) + cfg = load_sync_config(path) + # "enabled" is not flagged; the genuine typos are. + assert set(cfg._unknown_keys) == {"another", "typo_field"} # type: ignore[attr-defined] + + +def test_enabled_flag_read_from_block(tmp_path: Path) -> None: + # enabled is read out as a plain attribute (not an rclone field). The config + # cache is process-global, so reset between loads of different temp files. + d_on = tmp_path / "on" + d_off = tmp_path / "off" + d_def = tmp_path / "default" + for d in (d_on, d_off, d_def): + d.mkdir() + + on = load_sync_config(_write_config(d_on, _base_block(enabled=True))) + assert on.enabled is True # type: ignore[attr-defined] + + reset_config_cache() + off = load_sync_config(_write_config(d_off, _base_block(enabled=False))) + assert off.enabled is False # type: ignore[attr-defined] + + # Absent key defaults to enabled. + reset_config_cache() + block = _base_block() + del block["enabled"] + default = load_sync_config(_write_config(d_def, block)) + assert default.enabled is True # type: ignore[attr-defined] + + +def test_is_windows_property(tmp_path: Path) -> None: + cfg = load_sync_config(_write_config(tmp_path, _base_block())) + assert isinstance(cfg.is_windows, bool) diff --git a/tests/test_sync_filters.py b/tests/test_sync_filters.py new file mode 100644 index 0000000..d26164c --- /dev/null +++ b/tests/test_sync_filters.py @@ -0,0 +1,114 @@ +"""Self-contained tests for sync.filters (runnable under `pytest --noconftest`). + +No dependence on tests/conftest.py fixtures: uses tmp_path and resets the +utils.logger config cache between tests. 
+""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +import yaml + +from sync.config import load_sync_config +from sync.filters import filter_summary, generate_filters, write_filter_file +from utils.logger import reset_config_cache + +REPO_ROOT = Path(__file__).resolve().parent.parent +SOUL_RULE = "- data/personality/**" + + +@pytest.fixture(autouse=True) +def _reset_cache(): + reset_config_cache() + yield + reset_config_cache() + + +def _load(tmp_path: Path, **overrides: object): + block = { + "enabled": True, + "local_path": "data/corpus", + "remote_name": "dropbox_cyclaw", + "remote_path": "CyClaw/corpus", + "direction": "pull", + } + block.update(overrides) + cfg = {"logging": {"audit_file": str(tmp_path / "audit.jsonl")}, "sync": block} + path = tmp_path / "config.yaml" + path.write_text(yaml.safe_dump(cfg), encoding="utf-8") + return load_sync_config(str(path)) + + +def test_soul_excluded_by_default(tmp_path: Path) -> None: + text = generate_filters(_load(tmp_path)) + assert SOUL_RULE in text + assert "WARNING" not in text + + +def test_hardened_categories_present(tmp_path: Path) -> None: + text = generate_filters(_load(tmp_path)) + for rule in ( + "- *.gguf", + "- index/**", + "- .emb_cache/**", + "- .chroma/**", + "- logs/**", + "- *.jsonl", + "- *.db", + "- *.db-wal", + "- *.db-shm", + "- .env", + "- credentials*", + "- .git/**", + "- .gitignore", + "- desktop.ini", + "- .rclone-state/**", + ): + assert rule in text, f"missing hardened rule: {rule}" + + +def test_soul_dropped_with_warning_when_include_soul(tmp_path: Path) -> None: + text = generate_filters(_load(tmp_path, include_soul=True)) + assert SOUL_RULE not in text + assert "WARNING" in text + assert "include_soul=true" in text + # Other hardened rules survive. 
+ assert "- *.db" in text + + +def test_extra_excludes_appended_after_hardened_block(tmp_path: Path) -> None: + text = generate_filters(_load(tmp_path, extra_excludes=["scratch/**", "- already/**"])) + lines = text.splitlines() + # Bare entry gets a leading "- "; entry with leading "- " is preserved. + assert "- scratch/**" in lines + assert "- already/**" in lines + # Extras come after the hardened soul rule. + assert lines.index("- scratch/**") > lines.index(SOUL_RULE) + assert lines.index("- scratch/**") > lines.index("- *.gguf") + + +def test_write_filter_file_returns_abs_path(tmp_path: Path) -> None: + target = tmp_path / "state" / "cyclaw_filters.txt" + cfg = _load(tmp_path, filter_file=str(target)) + written = write_filter_file(cfg) + assert written == str(target.resolve()) + assert Path(written).is_file() + assert SOUL_RULE in Path(written).read_text(encoding="utf-8") + + +def test_filter_summary_shape(tmp_path: Path) -> None: + cfg = _load(tmp_path, extra_excludes=["scratch/**"]) + summary = filter_summary(cfg) + assert set(summary) == {"soul_excluded", "include_soul", "total_rules", "filter_file", "extra_excludes"} + assert summary["soul_excluded"] is True + assert summary["include_soul"] is False + assert isinstance(summary["total_rules"], int) and summary["total_rules"] > 0 + assert summary["extra_excludes"] == ["scratch/**"] + + +def test_filter_summary_soul_included(tmp_path: Path) -> None: + summary = filter_summary(_load(tmp_path, include_soul=True)) + assert summary["soul_excluded"] is False + assert summary["include_soul"] is True diff --git a/tests/test_sync_runner.py b/tests/test_sync_runner.py new file mode 100644 index 0000000..9bf71b5 --- /dev/null +++ b/tests/test_sync_runner.py @@ -0,0 +1,430 @@ +"""Self-contained unit tests for sync.runner (no network, no real rclone). + +Runnable with ``pytest --noconftest tests/test_sync_runner.py`` -- these tests do +NOT depend on tests/conftest.py fixtures (which import chromadb). 
The rclone +subprocess boundary is mocked via ``sync.runner.subprocess.run`` and the binary +resolution via ``sync.runner.shutil.which``. +""" + +from __future__ import annotations + +import sys +import textwrap +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from sync.config import RcloneConfig +from sync.runner import ( + MIN_RCLONE_MAJOR, + MIN_RCLONE_MINOR, + MIN_RCLONE_PATCH, + FileEvent, + SyncResult, + build_bisync_argv, + build_pull_argv, + check_rclone_version, + hash_changed_files, + parse_log, + reindex_exit_code_for, + run_sync, +) +from utils.errors import RcloneNotInstalledError, RcloneVersionError, SyncRuntimeError +from utils.logger import reset_config_cache + +# shutil.which returns a drive-letter absolute path on Windows; POSIX path on Linux. +FAKE_RCLONE = r"C:\Windows\rclone.exe" if sys.platform == "win32" else "/usr/bin/rclone" + + +@pytest.fixture(autouse=True) +def _reset_cache(): + reset_config_cache() + yield + reset_config_cache() + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_cfg(tmp_path: Path, **overrides) -> RcloneConfig: + """Build an RcloneConfig pointed entirely at tmp_path (no repo writes).""" + corpus = Path(__file__).resolve().parent.parent / "data" / "corpus" + kwargs: dict = { + "local_path": str(corpus), + "filter_file": str(tmp_path / "filters.txt"), + "log_dir": str(tmp_path / "logs"), + "workdir": str(tmp_path / "workdir"), + } + kwargs.update(overrides) + return RcloneConfig(**kwargs) + + +def _version_mock(version: str) -> MagicMock: + return MagicMock(returncode=0, stdout=f"rclone v{version}\n", stderr="") + + +# --------------------------------------------------------------------------- +# Version gate +# --------------------------------------------------------------------------- + +def test_min_version_floor_is_1_68_2(): + assert 
(MIN_RCLONE_MAJOR, MIN_RCLONE_MINOR, MIN_RCLONE_PATCH) == (1, 68, 2) + + +def test_version_ok(): + with patch("sync.runner.shutil.which", return_value=FAKE_RCLONE), \ + patch("sync.runner.subprocess.run", return_value=_version_mock("1.68.2")): + assert check_rclone_version() == (1, 68, 2) + + +def test_version_newer_ok(): + with patch("sync.runner.shutil.which", return_value=FAKE_RCLONE), \ + patch("sync.runner.subprocess.run", return_value=_version_mock("1.72.1")): + assert check_rclone_version() == (1, 72, 1) + + +def test_version_too_old_raises(): + with patch("sync.runner.shutil.which", return_value=FAKE_RCLONE), \ + patch("sync.runner.subprocess.run", return_value=_version_mock("1.68.1")): + with pytest.raises(RcloneVersionError): + check_rclone_version() + + +def test_version_old_minor_raises(): + with patch("sync.runner.shutil.which", return_value=FAKE_RCLONE), \ + patch("sync.runner.subprocess.run", return_value=_version_mock("1.65.0")): + with pytest.raises(RcloneVersionError): + check_rclone_version() + + +def test_missing_binary_raises(): + with patch("sync.runner.shutil.which", return_value=None): + with pytest.raises(RcloneNotInstalledError): + check_rclone_version() + + +def test_unparseable_version_raises(): + with patch("sync.runner.shutil.which", return_value=FAKE_RCLONE), \ + patch("sync.runner.subprocess.run", return_value=MagicMock(returncode=0, stdout="garbage", stderr="")): + with pytest.raises(RcloneVersionError): + check_rclone_version() + + +# --------------------------------------------------------------------------- +# argv builders -- always a list, no shell, absolute binary +# --------------------------------------------------------------------------- + +def test_pull_argv_is_list_no_shell(tmp_path): + cfg = _make_cfg(tmp_path) + argv = build_pull_argv(cfg, dry_run=False, log_path="/tmp/x.log", rclone_bin=FAKE_RCLONE) + assert isinstance(argv, list) + assert "shell" not in argv + assert all(isinstance(a, str) for a in argv) + assert 
argv[0] == FAKE_RCLONE + assert Path(argv[0]).is_absolute() + assert argv[1] == "copy" + assert cfg.remote in argv + assert cfg.local_path in argv + assert "--filter-from" in argv + assert cfg.filter_file in argv + # rclone copy never deletes, so --max-delete is bisync-only (not in pull argv) + assert f"--max-delete={cfg.max_delete}" not in argv + assert f"--max-transfer={cfg.max_transfer}" in argv + assert "--check-first" in argv + assert "--log-file" in argv + assert "--log-level" in argv + + +def test_pull_argv_dry_run_flag(tmp_path): + cfg = _make_cfg(tmp_path) + argv = build_pull_argv(cfg, dry_run=True, log_path="/tmp/x.log", rclone_bin=FAKE_RCLONE) + assert "--dry-run" in argv + + +def test_checksum_toggles_with_cfg(tmp_path): + cfg_on = _make_cfg(tmp_path, checksum=True) + cfg_off = _make_cfg(tmp_path, checksum=False) + assert "--checksum" in build_pull_argv(cfg_on, False, "/tmp/x.log", FAKE_RCLONE) + assert "--checksum" not in build_pull_argv(cfg_off, False, "/tmp/x.log", FAKE_RCLONE) + + +def test_bisync_argv_is_list_with_conflict_flags(tmp_path): + cfg = _make_cfg(tmp_path, direction="bisync") + argv = build_bisync_argv(cfg, dry_run=False, log_path="/tmp/x.log", resync=True, rclone_bin=FAKE_RCLONE) + assert isinstance(argv, list) + assert argv[1] == "bisync" + assert f"--conflict-resolve={cfg.conflict_resolve}" in argv + assert f"--conflict-loser={cfg.conflict_loser}" in argv + assert "--workdir" in argv + assert cfg.workdir in argv + assert "--resync" in argv + # --max-delete is bisync-only (rclone copy never deletes) + assert f"--max-delete={cfg.max_delete}" in argv + + +# --------------------------------------------------------------------------- +# Log parsing + hashing +# --------------------------------------------------------------------------- + +def _write_log(tmp_path: Path) -> str: + log = tmp_path / "rclone.log" + log.write_text(textwrap.dedent("""\ + 2026/06/20 02:10:01 INFO : data/corpus/new.md: Copied (new) + 2026/06/20 02:10:02 INFO : 
data/corpus/changed.md: Copied (replaced existing) + 2026/06/20 02:10:03 INFO : data/corpus/gone.md: Deleted + 2026/06/20 02:10:04 ERROR : something went wrong on the wire + 2026/06/20 02:10:05 INFO : unrelated noise line that should be ignored + """), encoding="utf-8") + return str(log) + + +def test_parse_log_events_and_errors(tmp_path): + events, errors = parse_log(_write_log(tmp_path)) + kinds = {e.kind for e in events} + assert kinds == {"added", "modified", "deleted"} + assert any("something went wrong" in e for e in errors) + + +def test_parse_log_missing_file_tolerant(tmp_path): + events, errors = parse_log(str(tmp_path / "nope.log")) + assert events == [] + assert errors == [] + + +def test_hash_changed_files_streams_and_skips_deleted(tmp_path): + f = tmp_path / "a.md" + f.write_text("hello", encoding="utf-8") + events = [ + FileEvent(kind="added", path="a.md"), + FileEvent(kind="deleted", path="b.md"), + FileEvent(kind="modified", path="missing.md"), + ] + out = hash_changed_files(events, str(tmp_path)) + by_path = {e.path: e for e in out} + assert by_path["a.md"].sha256 is not None and len(by_path["a.md"].sha256) == 64 + assert by_path["b.md"].sha256 is None # deleted -> never hashed + assert by_path["missing.md"].sha256 is None # not on disk -> None + + +# --------------------------------------------------------------------------- +# Audit dicts -- "file" key, never "query", no secret fields +# --------------------------------------------------------------------------- + +def test_file_event_audit_uses_file_key_not_query(): + ev = FileEvent(kind="added", path="data/corpus/x.md", sha256="abc") + d = ev.to_audit_dict(base="/repo/data/corpus") + assert d["event"] == "sync_file_added" + assert "file" in d + assert "query" not in d + assert d["file"] == "data/corpus/x.md" + + +def test_sync_result_audit_dict_no_secret_fields(): + res = SyncResult( + success=True, direction="pull", started_at=1.0, finished_at=2.0, + rclone_exit_code=0, corpus_changed=True, + 
) + d = res.to_audit_dict() + assert "query" not in d + keys = set(d.keys()) + assert not (keys & {"token", "refresh_token", "secret", "password", "stderr"}) + assert d["event"] == "sync_completed" + + +# --------------------------------------------------------------------------- +# run_sync end-to-end (mocked subprocess) + corpus_changed / exit-code wiring +# --------------------------------------------------------------------------- + +def _patch_audit(): + return patch("sync.runner.audit_log") + + +def test_run_sync_corpus_changed_and_exit_10(tmp_path): + cfg = _make_cfg(tmp_path) + log_path = cfg.log_path + + def dispatch(argv, **kwargs): + # argv must always be a list, never shell=True, binary absolute. + assert isinstance(argv, list) + assert kwargs.get("shell") is not True + assert Path(argv[0]).is_absolute() + if argv[1] == "version": + return _version_mock("1.70.0") + Path(log_path).parent.mkdir(parents=True, exist_ok=True) + # rclone logs paths relative to the transfer root (data/corpus), e.g. + # "notes.md" -- NOT "data/corpus/notes.md". corpus_changed must still + # fire on this realistic shape. + Path(log_path).write_text( + "2026/06/20 02:10:01 INFO : notes.md: Copied (new)\n", + encoding="utf-8", + ) + return MagicMock(returncode=0, stdout="", stderr="") + + with patch("sync.runner.shutil.which", return_value=FAKE_RCLONE), \ + patch("sync.runner.subprocess.run", side_effect=dispatch), \ + _patch_audit(): + result = run_sync(cfg, rclone_bin=FAKE_RCLONE) + + assert result.success is True + assert result.corpus_changed is True + assert reindex_exit_code_for(result, cfg) == cfg.REINDEX_EXIT_CODE == 10 + + +def test_run_sync_corpus_changed_ignores_rclone_internal_artifacts(tmp_path): + # An rclone scratch/state file leaking into the log must NOT trip + # corpus_changed (it is not corpus content). 
+ cfg = _make_cfg(tmp_path) + log_path = cfg.log_path + + def dispatch(argv, **kwargs): + if argv[1] == "version": + return _version_mock("1.70.0") + Path(log_path).parent.mkdir(parents=True, exist_ok=True) + Path(log_path).write_text( + "2026/06/20 02:10:01 INFO : RCLONE_TEST: Copied (new)\n", + encoding="utf-8", + ) + return MagicMock(returncode=0, stdout="", stderr="") + + with patch("sync.runner.shutil.which", return_value=FAKE_RCLONE), \ + patch("sync.runner.subprocess.run", side_effect=dispatch), \ + _patch_audit(): + result = run_sync(cfg, rclone_bin=FAKE_RCLONE) + + assert result.success is True + assert result.corpus_changed is False + + +def test_run_sync_single_instance_lock_blocks_concurrent_run(tmp_path): + # A pre-existing (fresh) lock directory means another run holds the lock; + # run_sync must refuse rather than race a second rclone invocation. + cfg = _make_cfg(tmp_path) + lock_dir = Path(cfg.log_dir) / "sync.lock.d" + lock_dir.mkdir(parents=True) + + def dispatch(argv, **kwargs): + if argv[1] == "version": + return _version_mock("1.70.0") + raise AssertionError("rclone must not run while the lock is held") + + with patch("sync.runner.shutil.which", return_value=FAKE_RCLONE), \ + patch("sync.runner.subprocess.run", side_effect=dispatch), \ + _patch_audit(): + with pytest.raises(SyncRuntimeError): + run_sync(cfg, rclone_bin=FAKE_RCLONE) + + +def test_run_sync_releases_lock_after_run(tmp_path): + # After a normal run the lock directory must be gone so the next run can + # acquire it. 
+ cfg = _make_cfg(tmp_path) + log_path = cfg.log_path + + def dispatch(argv, **kwargs): + if argv[1] == "version": + return _version_mock("1.70.0") + Path(log_path).parent.mkdir(parents=True, exist_ok=True) + Path(log_path).write_text("INFO : nothing to do\n", encoding="utf-8") + return MagicMock(returncode=0, stdout="", stderr="") + + with patch("sync.runner.shutil.which", return_value=FAKE_RCLONE), \ + patch("sync.runner.subprocess.run", side_effect=dispatch), \ + _patch_audit(): + run_sync(cfg, rclone_bin=FAKE_RCLONE) + + assert not (Path(cfg.log_dir) / "sync.lock.d").exists() + + +def test_run_sync_no_change_exit_0(tmp_path): + cfg = _make_cfg(tmp_path) + log_path = cfg.log_path + + def dispatch(argv, **kwargs): + if argv[1] == "version": + return _version_mock("1.70.0") + Path(log_path).parent.mkdir(parents=True, exist_ok=True) + Path(log_path).write_text("2026/06/20 02:10:01 INFO : nothing to do\n", encoding="utf-8") + return MagicMock(returncode=0, stdout="", stderr="") + + with patch("sync.runner.shutil.which", return_value=FAKE_RCLONE), \ + patch("sync.runner.subprocess.run", side_effect=dispatch), \ + _patch_audit(): + result = run_sync(cfg, rclone_bin=FAKE_RCLONE) + + assert result.success is True + assert result.corpus_changed is False + assert reindex_exit_code_for(result, cfg) == 0 + + +def test_run_sync_safety_abort_exit_1(tmp_path): + cfg = _make_cfg(tmp_path) + log_path = cfg.log_path + + def dispatch(argv, **kwargs): + if argv[1] == "version": + return _version_mock("1.70.0") + Path(log_path).parent.mkdir(parents=True, exist_ok=True) + Path(log_path).write_text("", encoding="utf-8") + return MagicMock(returncode=9, stdout="", stderr="Fatal error: max-delete threshold exceeded") + + with patch("sync.runner.shutil.which", return_value=FAKE_RCLONE), \ + patch("sync.runner.subprocess.run", side_effect=dispatch), \ + _patch_audit(): + result = run_sync(cfg, rclone_bin=FAKE_RCLONE) + + assert result.success is False + assert result.aborted_for_safety is 
True + assert reindex_exit_code_for(result, cfg) == 1 + + +def test_run_sync_other_failure_exit_2(tmp_path): + cfg = _make_cfg(tmp_path) + log_path = cfg.log_path + + def dispatch(argv, **kwargs): + if argv[1] == "version": + return _version_mock("1.70.0") + Path(log_path).parent.mkdir(parents=True, exist_ok=True) + Path(log_path).write_text("", encoding="utf-8") + return MagicMock(returncode=1, stdout="", stderr="generic failure") + + with patch("sync.runner.shutil.which", return_value=FAKE_RCLONE), \ + patch("sync.runner.subprocess.run", side_effect=dispatch), \ + _patch_audit(): + result = run_sync(cfg, rclone_bin=FAKE_RCLONE) + + assert result.success is False + assert result.aborted_for_safety is False + assert reindex_exit_code_for(result, cfg) == 2 + + +def test_run_sync_audit_events_have_file_key_no_query(tmp_path): + cfg = _make_cfg(tmp_path) + log_path = cfg.log_path + captured: list[dict] = [] + + def dispatch(argv, **kwargs): + if argv[1] == "version": + return _version_mock("1.70.0") + Path(log_path).parent.mkdir(parents=True, exist_ok=True) + Path(log_path).write_text( + "2026/06/20 02:10:01 INFO : notes.md: Copied (new)\n", + encoding="utf-8", + ) + return MagicMock(returncode=0, stdout="", stderr="") + + with patch("sync.runner.shutil.which", return_value=FAKE_RCLONE), \ + patch("sync.runner.subprocess.run", side_effect=dispatch), \ + patch("sync.runner.audit_log", side_effect=lambda e: captured.append(e)): + run_sync(cfg, rclone_bin=FAKE_RCLONE) + + file_events = [e for e in captured if e.get("event", "").startswith("sync_file_")] + assert file_events, "expected at least one per-file audit event" + for e in file_events: + assert "file" in e + assert "query" not in e + # No secret-looking fields anywhere. 
+ for e in captured: + assert not (set(e.keys()) & {"token", "refresh_token", "secret", "password", "stderr"}) diff --git a/tests/test_sync_scheduler.py b/tests/test_sync_scheduler.py new file mode 100644 index 0000000..0a88205 --- /dev/null +++ b/tests/test_sync_scheduler.py @@ -0,0 +1,365 @@ +"""Self-contained tests for sync.scheduler. + +Runnable with ``pytest --noconftest`` (no conftest fixtures): builds an +``RcloneConfig`` directly via a tmp config.yaml + ``reset_config_cache`` and +patches the subprocess / which / platform boundary. No real crontab or schtasks +is ever invoked, and no network is touched. +""" + +from __future__ import annotations + +import os +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from sync.config import RcloneConfig, load_sync_config +from sync.scheduler import ( + TASK_TAG, + WINDOWS_TASK_NAME, + CronScheduler, + ScheduleEntry, + WindowsTaskScheduler, + get_scheduler, +) +from utils.errors import SchedulerError +from utils.logger import reset_config_cache + +# A repo-valid corpus path: config validation requires local_path to resolve +# under the repo's data/corpus tree, so derive it from this file's location. 
+_REPO_ROOT = Path(__file__).resolve().parent.parent +_CORPUS = str(_REPO_ROOT / "data" / "corpus") + + +def _make_cfg(schedule_hour: int = 2, schedule_min: int = 0, **overrides) -> RcloneConfig: + kwargs: dict = dict( + local_path=_CORPUS, + remote_name="dropbox_cyclaw", + remote_path="CyClaw/corpus", + schedule_hour=schedule_hour, + schedule_min=schedule_min, + ) + kwargs.update(overrides) + return RcloneConfig(**kwargs) + + +def _completed(returncode: int = 0, stdout: str = "", stderr: str = "") -> MagicMock: + m = MagicMock() + m.returncode = returncode + m.stdout = stdout + m.stderr = stderr + return m + + +# --------------------------------------------------------------------------- +# get_scheduler factory +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("system", "expected"), + [ + ("Linux", CronScheduler), + ("Darwin", CronScheduler), + ("Windows", WindowsTaskScheduler), + ], +) +def test_get_scheduler_returns_right_class(system: str, expected: type) -> None: + cfg = _make_cfg() + with patch("sync.scheduler.platform.system", return_value=system): + sched = get_scheduler(cfg) + assert isinstance(sched, expected) + + +def test_get_scheduler_unsupported_platform_raises() -> None: + cfg = _make_cfg() + with patch("sync.scheduler.platform.system", return_value="Plan9"): + with pytest.raises(SchedulerError): + get_scheduler(cfg) + + +# --------------------------------------------------------------------------- +# CronScheduler.install +# --------------------------------------------------------------------------- + + +def test_cron_install_appends_exactly_one_tagged_line() -> None: + cfg = _make_cfg(schedule_hour=3, schedule_min=15) + existing = "0 1 * * * /usr/bin/backup.sh\n" + written: dict[str, str] = {} + + def fake_run(argv, **kwargs): # type: ignore[no-untyped-def] + if argv[1] == "-l": + return _completed(stdout=existing) + # write path: crontab - + written["content"] = kwargs["input"] + 
return _completed() + + with ( + patch("sync.scheduler.shutil.which", return_value="/usr/bin/crontab"), + patch("sync.scheduler.subprocess.run", side_effect=fake_run), + patch("sync.scheduler.platform.system", return_value="Linux"), + ): + entry = CronScheduler(cfg).install() + + content = written["content"] + tagged = [ln for ln in content.splitlines() if TASK_TAG in ln] + assert len(tagged) == 1 + # User's unrelated line is preserved. + assert "/usr/bin/backup.sh" in content + assert isinstance(entry, ScheduleEntry) + assert entry.cron_or_time == "15 3 * * *" + assert tagged[0].startswith("15 3 * * *") + assert tagged[0].endswith(f"# {TASK_TAG}") + # Command cd's into the repo root, not data/corpus. + assert "-m sync.cli sync" in tagged[0] + assert os.path.basename(_REPO_ROOT) in tagged[0] + + +def test_cron_install_replaces_prior_tagged_line() -> None: + cfg = _make_cfg() + # Two existing tagged lines plus an unrelated one. + existing = ( + "0 1 * * * /usr/bin/keep.sh\n" + f"9 9 * * * old-cmd # {TASK_TAG}\n" + f"8 8 * * * older-cmd # {TASK_TAG}\n" + ) + written: dict[str, str] = {} + + def fake_run(argv, **kwargs): # type: ignore[no-untyped-def] + if argv[1] == "-l": + return _completed(stdout=existing) + written["content"] = kwargs["input"] + return _completed() + + with ( + patch("sync.scheduler.shutil.which", return_value="/usr/bin/crontab"), + patch("sync.scheduler.subprocess.run", side_effect=fake_run), + patch("sync.scheduler.platform.system", return_value="Linux"), + ): + CronScheduler(cfg).install() + + content = written["content"] + tagged = [ln for ln in content.splitlines() if TASK_TAG in ln] + assert len(tagged) == 1 # both old tagged lines stripped, one fresh appended + assert "old-cmd" not in content + assert "older-cmd" not in content + assert "/usr/bin/keep.sh" in content + + +# --------------------------------------------------------------------------- +# CronScheduler.remove +# 
--------------------------------------------------------------------------- + + +def test_cron_remove_returns_false_when_no_tagged_line() -> None: + cfg = _make_cfg() + existing = "0 1 * * * /usr/bin/keep.sh\n" + write_called = {"n": 0} + + def fake_run(argv, **kwargs): # type: ignore[no-untyped-def] + if argv[1] == "-l": + return _completed(stdout=existing) + write_called["n"] += 1 + return _completed() + + with ( + patch("sync.scheduler.shutil.which", return_value="/usr/bin/crontab"), + patch("sync.scheduler.subprocess.run", side_effect=fake_run), + ): + result = CronScheduler(cfg).remove() + + assert result is False + assert write_called["n"] == 0 # nothing rewritten when nothing matched + + +def test_cron_remove_returns_true_when_tagged_line_present() -> None: + cfg = _make_cfg() + existing = "0 1 * * * /usr/bin/keep.sh\n" f"2 2 * * * some-cmd # {TASK_TAG}\n" + written: dict[str, str] = {} + + def fake_run(argv, **kwargs): # type: ignore[no-untyped-def] + if argv[1] == "-l": + return _completed(stdout=existing) + written["content"] = kwargs["input"] + return _completed() + + with ( + patch("sync.scheduler.shutil.which", return_value="/usr/bin/crontab"), + patch("sync.scheduler.subprocess.run", side_effect=fake_run), + ): + result = CronScheduler(cfg).remove() + + assert result is True + assert TASK_TAG not in written["content"] + assert "/usr/bin/keep.sh" in written["content"] + + +# --------------------------------------------------------------------------- +# CronScheduler.status +# --------------------------------------------------------------------------- + + +def test_cron_status_parses_tagged_line() -> None: + cfg = _make_cfg() + existing = "0 1 * * * /usr/bin/keep.sh\n" f'30 4 * * * cd "/repo" && python -m sync.cli sync # {TASK_TAG}\n' + + with ( + patch("sync.scheduler.shutil.which", return_value="/usr/bin/crontab"), + patch("sync.scheduler.subprocess.run", return_value=_completed(stdout=existing)), + patch("sync.scheduler.platform.system", 
return_value="Linux"), + ): + entry = CronScheduler(cfg).status() + + assert entry is not None + assert entry.cron_or_time == "30 4 * * *" + assert "sync.cli sync" in entry.command + assert TASK_TAG not in entry.command # comment stripped from the command + + +def test_cron_status_returns_none_when_no_tagged_line() -> None: + cfg = _make_cfg() + existing = "0 1 * * * /usr/bin/keep.sh\n" + with ( + patch("sync.scheduler.shutil.which", return_value="/usr/bin/crontab"), + patch("sync.scheduler.subprocess.run", return_value=_completed(stdout=existing)), + ): + assert CronScheduler(cfg).status() is None + + +def test_cron_missing_crontab_binary_raises() -> None: + cfg = _make_cfg() + with patch("sync.scheduler.shutil.which", return_value=None): + with pytest.raises(SchedulerError): + CronScheduler(cfg).status() + + +def test_cron_uses_argv_list_never_shell() -> None: + cfg = _make_cfg() + seen: list[object] = [] + + def fake_run(argv, **kwargs): # type: ignore[no-untyped-def] + seen.append(argv) + assert isinstance(argv, list) + assert "shell" not in kwargs or kwargs["shell"] is False + if argv[1] == "-l": + return _completed(stdout="") + return _completed() + + with ( + patch("sync.scheduler.shutil.which", return_value="/usr/bin/crontab"), + patch("sync.scheduler.subprocess.run", side_effect=fake_run), + patch("sync.scheduler.platform.system", return_value="Linux"), + ): + CronScheduler(cfg).install() + + assert seen # at least one call made + for argv in seen: + assert isinstance(argv, list) + assert argv[0] == "/usr/bin/crontab" + + +# --------------------------------------------------------------------------- +# WindowsTaskScheduler +# --------------------------------------------------------------------------- + + +def test_windows_install_builds_schtasks_argv(tmp_path: Path) -> None: + # log_dir -> tmp so the generated .bat launcher does not touch the real home. 
+ cfg = _make_cfg(schedule_hour=5, schedule_min=7, log_dir=str(tmp_path / "logs")) + captured: dict[str, object] = {} + + def fake_run(argv, **kwargs): # type: ignore[no-untyped-def] + captured["argv"] = argv + captured["kwargs"] = kwargs + return _completed(stdout="SUCCESS") + + with ( + patch("sync.scheduler.shutil.which", return_value=r"C:\Windows\System32\schtasks.exe"), + patch("sync.scheduler.subprocess.run", side_effect=fake_run), + patch("sync.scheduler.platform.system", return_value="Windows"), + ): + entry = WindowsTaskScheduler(cfg).install() + + argv = captured["argv"] + assert isinstance(argv, list) + assert "shell" not in captured["kwargs"] or captured["kwargs"]["shell"] is False + assert argv[0] == r"C:\Windows\System32\schtasks.exe" + assert "/Create" in argv + assert argv[argv.index("/TN") + 1] == WINDOWS_TASK_NAME + assert argv[argv.index("/SC") + 1] == "DAILY" + assert argv[argv.index("/ST") + 1] == "05:07" + assert "/F" in argv + assert argv[argv.index("/RL") + 1] == "LIMITED" + assert entry.cron_or_time == "05:07" + assert entry.platform_name == "windows" + + # /TR points at the generated .bat launcher (robust quoting), and the file + # was actually written with the cd + sync invocation. 
+ launcher = argv[argv.index("/TR") + 1] + assert launcher.endswith("cyclaw_sync.bat") + assert entry.command == launcher + bat_text = Path(launcher).read_text(encoding="utf-8") + assert "-m sync.cli sync" in bat_text + assert "cd /d" in bat_text + + +def test_windows_remove_not_found_returns_false() -> None: + cfg = _make_cfg() + with ( + patch("sync.scheduler.shutil.which", return_value=r"C:\Windows\System32\schtasks.exe"), + patch( + "sync.scheduler.subprocess.run", + return_value=_completed(returncode=1, stderr="ERROR: The system cannot find the file specified."), + ), + ): + assert WindowsTaskScheduler(cfg).remove() is False + + +def test_windows_remove_success_returns_true() -> None: + cfg = _make_cfg() + with ( + patch("sync.scheduler.shutil.which", return_value=r"C:\Windows\System32\schtasks.exe"), + patch("sync.scheduler.subprocess.run", return_value=_completed(returncode=0, stdout="SUCCESS")), + ): + assert WindowsTaskScheduler(cfg).remove() is True + + +def test_windows_missing_schtasks_raises() -> None: + cfg = _make_cfg() + with patch("sync.scheduler.shutil.which", return_value=None): + with pytest.raises(SchedulerError): + WindowsTaskScheduler(cfg).install() + + +# --------------------------------------------------------------------------- +# Config-from-yaml path (Appendix D style, no conftest fixtures) +# --------------------------------------------------------------------------- + + +def test_scheduler_from_loaded_config(tmp_path: Path) -> None: + cfg_yaml = tmp_path / "config.yaml" + # Use forward slashes: YAML double-quoted strings interpret backslashes as + # escape sequences, which breaks Windows paths like C:\Users\... in CI. 
+ corpus_fwd = _CORPUS.replace("\\", "/") + cfg_yaml.write_text( + "sync:\n" + " enabled: true\n" + f' local_path: "{corpus_fwd}"\n' + " remote_name: dropbox_cyclaw\n" + " remote_path: CyClaw/corpus\n" + " schedule_hour: 6\n" + " schedule_min: 30\n" + ) + reset_config_cache() + try: + cfg = load_sync_config(str(cfg_yaml)) + finally: + reset_config_cache() + + with patch("sync.scheduler.platform.system", return_value="Linux"): + sched = get_scheduler(cfg) + assert isinstance(sched, CronScheduler) + assert sched.cfg.schedule_hour == 6 + assert sched.cfg.schedule_min == 30 diff --git a/utils/errors.py b/utils/errors.py index 75e53fb..3d55a6e 100644 --- a/utils/errors.py +++ b/utils/errors.py @@ -42,6 +42,53 @@ class ConfigError(RAGError): def __init__(self, message: str, details: Optional[dict] = None): super().__init__(message, code="CONFIG_ERROR", details=details) +class SyncError(RAGError): + """Base error for out-of-band Dropbox corpus sync operations.""" + + def __init__(self, message: str, details: Optional[dict] = None): + super().__init__(message, code="SYNC_ERROR", details=details) + + +class RcloneNotInstalledError(SyncError): + """rclone binary not found on PATH.""" + + def __init__(self, message: str, details: Optional[dict] = None): + super().__init__(message, details=details) + self.code = "RCLONE_NOT_INSTALLED" + + +class RcloneVersionError(SyncError): + """rclone is installed but the version is below the required floor.""" + + def __init__(self, message: str, details: Optional[dict] = None): + super().__init__(message, details=details) + self.code = "RCLONE_VERSION_TOO_OLD" + + +class SyncConfigError(SyncError): + """The sync: block in config.yaml is missing or invalid.""" + + def __init__(self, message: str, details: Optional[dict] = None): + super().__init__(message, details=details) + self.code = "SYNC_CONFIG_INVALID" + + +class SchedulerError(SyncError): + """Cron / systemd / launchd / Task Scheduler registration or removal failed.""" + + def 
__init__(self, message: str, details: Optional[dict] = None): + super().__init__(message, details=details) + self.code = "SYNC_SCHEDULER_ERROR" + + +class SyncRuntimeError(SyncError): + """rclone subprocess failed at runtime (non-zero exit, safety-fuse abort, etc.).""" + + def __init__(self, message: str, details: Optional[dict] = None): + super().__init__(message, details=details) + self.code = "SYNC_RUNTIME_ERROR" + + @dataclass class HealthStatus: name: str