diff --git a/.github/actions/audit/action.yml b/.github/actions/audit/action.yml
new file mode 100644
index 0000000..fa61702
--- /dev/null
+++ b/.github/actions/audit/action.yml
@@ -0,0 +1,81 @@
+name: Data Hygiene Audit
+description: Run data quality checks on Excel/CSV files and fail if score is too low
+inputs:
+  file:  # the only required input
+    description: Path to input file or directory
+    required: true
+  output:
+    description: Output directory for reports
+    required: false
+    default: ./audit-reports
+  fail-under:
+    description: Minimum health score (0-100). Fails if score is below this.
+    required: false
+    default: '0'  # quoted so the default stays a string
+  threshold:
+    description: Fuzzy duplicate similarity threshold (0.0-1.0)
+    required: false
+    default: '0.85'  # quoted so the default stays a string
+  rules:
+    description: Path to custom rules JSON file
+    required: false
+    default: ''  # empty string: the run step omits --rules
+  schema:
+    description: Path to schema JSON file
+    required: false
+    default: ''  # empty string: the run step omits --schema
+outputs:
+  score:
+    description: Overall health score (0-100)
+    value: '${{ steps.audit.outputs.score }}'
+  issues:
+    description: Total number of issues found
+    value: '${{ steps.audit.outputs.issues }}'
+runs:
+  using: 'composite'
+  steps:
+    - name: Install Data Hygiene Auditor
+      shell: bash
+      run: pip install .
+
+    - name: Run audit
+      id: audit
+      shell: bash
+      # Inputs are passed as env vars, not inlined, to prevent shell injection.
+      env:
+        AUDIT_FILE: ${{ inputs.file }}
+        AUDIT_OUTPUT: ${{ inputs.output }}
+        AUDIT_FAIL_UNDER: ${{ inputs.fail-under }}
+        AUDIT_THRESHOLD: ${{ inputs.threshold }}
+        AUDIT_RULES: ${{ inputs.rules }}
+        AUDIT_SCHEMA: ${{ inputs.schema }}
+      run: |
+        # An array keeps paths containing spaces intact as single arguments.
+        args=(--input "$AUDIT_FILE" --output "$AUDIT_OUTPUT" --json)
+        args+=(--threshold "$AUDIT_THRESHOLD")
+        if [ -n "$AUDIT_RULES" ]; then
+          args+=(--rules "$AUDIT_RULES")
+        fi
+        if [ -n "$AUDIT_SCHEMA" ]; then
+          args+=(--schema "$AUDIT_SCHEMA")
+        fi
+        if [ "$AUDIT_FAIL_UNDER" != "0" ]; then
+          args+=(--fail-under "$AUDIT_FAIL_UNDER")
+        fi
+        # Defer a non-zero exit (e.g. --fail-under) until outputs are written.
+        status=0
+        data-hygiene-audit "${args[@]}" || status=$?
+        # Score comes from the first results file; issues are summed over all.
+        python - >> "$GITHUB_OUTPUT" <<'PY'
+        import glob, json, os
+        from data_hygiene_auditor.core import count_issues
+        pattern = os.path.join(os.environ['AUDIT_OUTPUT'], '*_audit_results.json')
+        reports = []
+        for path in sorted(glob.glob(pattern)):
+            with open(path) as fh:
+                reports.append(json.load(fh))
+        score = reports[0]['overall_score'] if reports else 0
+        issues = sum(count_issues(r).get('total', 0) for r in reports)
+        print(f'score={score}', f'issues={issues}', sep='\n')
+        PY
+        exit "$status"
diff --git a/AUDIT.md b/AUDIT.md
index 20c63a4..ff18979 100644
--- a/AUDIT.md
+++ b/AUDIT.md
@@ -290,3 +290,200 @@ Move #13. AI-powered fix suggestions. Only attempt after the foundation and pres
 - **Don't build a GUI/web app yet.** The interactive HTML report gives you most of the "explorable" benefit without the deployment/hosting/auth complexity. A web app is a different product.
 - **Don't chase pipeline integration** (dbt, Airflow, CI). Your audience is consultants with spreadsheets, not data engineers with warehouses. Pipeline integration dilutes your focus without serving your users.
 - **Don't refactor before testing.** The temptation is to restructure first (it's messy!), but write tests against the current behavior first. Then refactor with confidence.
+
+---
+
+# Audit Round 2 (2026-05-16)
+
+All items from the 2025 audit were shipped (PRs #1-#9). This round assesses the project's current state after that work, with fresh landscape data.
+
+## Phase 1: Baseline Assessment (2026)
+**Date:** 2026-05-16
+**Project:** Data Hygiene Auditor v1.0.0
+
+### What Exists Today
+
+A well-structured Python CLI + library (10 modules, ~3,750 LOC) that scans Excel/CSV/TSV files for data quality issues and produces interactive HTML, Excel, and PDF reports. Features shipped since last audit: schema validation, trend comparison, vectorized detection (3.4x speedup), fuzzy duplicate matching, typed Python API, health scores, interactive HTML, fix suggestions.
+
+### Current Architecture
+
+| Module | LOC | Purpose |
+|--------|-----|---------|
+| `detection.py` | 654 | 7 detection engines |
+| `reporting/html.py` | 841 | Interactive HTML report |
+| `reporting/pdf.py` | 418 | PDF deliverable |
+| `reporting/excel.py` | 335 | Excel findings file |
+| `api.py` | 412 | Typed Python API (dataclasses) |
+| `core.py` | 292 | Orchestrator + data loading |
+| `suggestions.py` | 285 | Fix suggestion engine |
+| `cli.py` | 202 | CLI with colored output |
+| `schema.py` | 144 | Schema validation |
+| `trend.py` | 103 | Trend comparison |
+| **Tests** | 1,576 | 167 tests across 8 files |
+
+### Health Indicators
+
+| Dimension | Status |
+|-----------|--------|
+| Tests | 167 passing, all detection engines covered |
+| CI | GitHub Actions: ruff + pytest on 3.9/3.12/3.13 |
+| Packaging | pyproject.toml, pip-installable, `data-hygiene-audit` CLI |
+| API | `audit_file()` with typed dataclasses, py.typed marker |
+| Docs | Comprehensive README with screenshots and library examples |
+| Performance | Vectorized detection, 3.4x improvement on large files |
+
+### Gap Analysis
+
+**Resolved from 2025 audit:** CSV support, tests, CI, packaging, interactive HTML, health score, vectorized perf, fuzzy matching, typed API, fix suggestions, schema validation, trend comparison — all shipped.
+
+**Remaining or new issues:**
+1. CLI under-counts issues (missing fuzzy duplicates in total)
+2. `_raw` attribute set outside dataclass `__init__` — type-unsafe
+3. Tests import via backward-compat shim, not package directly
+4. No type checker in CI despite py.typed marker
+5. Python 3.8 claimed but untested
+6. Fuzzy matching silently skipped above 500 rows
+7. No CHANGELOG or release tags
+
+## Phase 2: Internal Review (2026)
+**Date:** 2026-05-16
+**Dimensions:** Code Quality, Architecture, Tests, Documentation, Performance, Security, UX, DevEx
+
+### Top Opportunities
+
+| # | Finding | Dimension | Impact | Effort | Leverage | Severity |
+|---|---------|-----------|--------|--------|----------|----------|
+| 1 | CLI missing fuzzy_duplicates in issue count — under-reports total | Code Quality | 3 | 1 | 3.0 | bug |
+| 2 | `AuditResult._raw` monkey-patched outside `__init__` | Code Quality | 4 | 1 | 4.0 | important |
+| 3 | Issue-counting logic still duplicated 3x (cli, html, excel) | Code Quality | 3 | 1 | 3.0 | important |
+| 4 | `requires-python >= 3.8` but CI tests 3.9+ only | DevEx | 3 | 1 | 3.0 | important |
+| 5 | Tests import from `audit` shim, not `data_hygiene_auditor` | Tests | 3 | 2 | 1.5 | important |
+| 6 | No type checker in CI despite py.typed marker | DevEx | 3 | 2 | 1.5 | important |
+| 7 | Levenshtein O(n²) matching hard-capped at 500 rows — silently skipped on larger sheets | Performance | 3 | 3 | 1.0 | important |
+| 8 | No file size guard — OOM on large crafted input | Security | 3 | 2 | 1.5 | important |
+| 9 | `_load_sheets` exported in public `__all__` | Architecture | 2 | 1 | 2.0 | minor |
+| 10 | `--schema`/`--baseline` undocumented in README options table | Documentation | 2 | 1 | 2.0 | minor |
+| 11 | No `--quiet`/`--version` flags | UX | 2 | 1 | 2.0 | minor |
+| 12 | No CHANGELOG | Documentation | 2 | 1 | 2.0 | minor |
+
+### Summary
+
+The project is in strong shape. The 2025 audit's critical issues (monolith, no tests, XSS, no CSV, no packaging) are all resolved. What remains is polish-tier work: a counting bug, a type safety issue, test import paths, and CI completeness. The architecture is clean and the detection logic is solid.
+
+## Phase 3: Landscape Scan (2026)
+**Date:** 2026-05-16
+**Method:** Web research (verified through May 2026)
+
+### Key Landscape Changes (2025 → 2026)
+
+1. **ydata-profiling rebranded to fg-data-profiling** (v4.19.1, Apr 2026). Package/import renamed. Signals stewardship instability.
+2. **GX added ExpectAI** — AI-generated expectations from data patterns. Possible acquisition May 2026 (unconfirmed).
+3. **Data contracts became dominant framing** — Soda Core repositioned as "Data Contracts engine." Irrelevant to file-audit use case.
+4. **Enterprise consolidation** — Metaplane → Datadog, SYNQ → Coalesce, Select Star → Snowflake. Affects $50K+ tier only.
+5. **AI/LLM integration is commercial-tier only** — ExpectAI, SodaGPT. No OSS tool has AI fix suggestions. Window still open.
+6. **DQX (Databricks Labs)** — new PySpark-native DQ framework. Not relevant to file-based auditing.
+7. **DQOps** — OSS + commercial ($499/mo). 150+ built-in checks. Warehouse-only, no file support.
+
+### Competitive Position (2026)
+
+**Unique to this project (confirmed still unmatched):**
+- Placeholder/test data detection
+- Misused field detection (cross-column semantic validation)
+- Triple output format (HTML + Excel + PDF)
+- Severity ratings + plain-English explanations for non-technical stakeholders
+- Health score (0-100)
+- Deterministic fix suggestions with copy-paste code
+- Schema validation + trend comparison (closes previous gaps)
+
+**The consultant gap remains completely unoccupied.** Every competitor is a warehouse connector for engineers, a profiler for data scientists, or an interactive GUI for researchers. No tool takes a file and produces a credentialed audit report with severity ratings and fix language for a client meeting.
+
+### Feature Parity Check
+
+| Table Stakes | Status |
+|-------------|--------|
+| CSV/TSV support | ✅ Shipped |
+| Null/completeness analysis | ✅ |
+| CLI + Python API | ✅ Both |
+| Large file handling (100K+) | 🟡 Vectorized but fuzzy capped at 500 |
+| Interactive report | ✅ Filters, search, TOC, collapsible |
+
+## Phase 4: Differentiation & Next Moves (2026)
+**Date:** 2026-05-16
+
+### Cross-Reference Summary
+
+The situation has inverted since the 2025 audit. A year ago, the project had strong detection but weak everything else. Now:
+- **Foundation:** solid (tests, CI, packaging, clean architecture)
+- **Presentation:** strong (interactive HTML, health score ring, fix suggestions)
+- **Detection:** comprehensive (7 engines + schema + trend)
+- **Competitive position:** unique and uncontested
+
+The remaining work is no longer transformative — it's **incremental quality improvements and strategic positioning**. The highest-impact moves are now about reach (getting the tool in front of users) and polish (fixing the few rough edges that undermine professional credibility).
+
+### Ranked Next Moves
+
+| # | Move | Category | Strategic | Internal | Effort | Score | Description |
+|---|------|----------|-----------|----------|--------|-------|-------------|
+| 1 | Fix CLI fuzzy dup counting bug | Correctness | 1 | 4 | 1 | 5.0 | CLI under-reports total issues by omitting fuzzy duplicates from count. One missing loop. |
+| 2 | Fix `_raw` type safety | Code Quality | 1 | 3 | 1 | 4.0 | Move `_raw` into `AuditResult.__init__` as a proper field. Fixes mypy, IDE autocomplete. |
+| 3 | Extract shared issue-counting helper | Code Quality | 1 | 3 | 1 | 4.0 | Single function used by CLI, HTML, and Excel. Prevents future counting bugs. |
+| 4 | Document `--schema`/`--baseline` in README | Documentation | 2 | 2 | 1 | 4.0 | Features exist but aren't discoverable in README options table. |
+| 5 | Add `--version` and `--quiet` flags | UX | 2 | 2 | 1 | 4.0 | Professional CLI conventions. `--quiet` enables scripted/CI usage. |
+| 6 | Align Python version (drop 3.8 claim or add CI) | DevEx | 2 | 3 | 1 | 5.0 | Either add 3.8 to CI matrix or bump requires-python to >=3.9. |
+| 7 | Add mypy/pyright to CI | DevEx | 2 | 3 | 2 | 2.5 | py.typed marker promises type safety — CI should enforce it. |
+| 8 | Migrate test imports to `data_hygiene_auditor` | Tests | 1 | 3 | 2 | 2.0 | Tests should exercise the package, not the backward-compat shim. |
+| 9 | Warn when fuzzy matching is skipped (>500 rows) | UX | 3 | 2 | 1 | 5.0 | User should know a detection pass was omitted on large sheets. |
+| 10 | Scale fuzzy matching beyond 500 rows | Performance | 4 | 3 | 3 | 2.3 | Locality-sensitive hashing or blocking strategy to handle 10K+ rows. |
+| 11 | Add CHANGELOG and release tagging | DevEx | 3 | 2 | 1 | 5.0 | Version tracking for users. Signal active maintenance. |
+| 12 | PyPI publication | Reach | 5 | 1 | 2 | 3.0 | `pip install data-hygiene-auditor` from anywhere. Major discoverability boost. |
+| 13 | "Data linter" positioning + README refresh | Reach | 4 | 1 | 2 | 2.5 | Adopt the "linter for data" framing that resonates with the developer audience. Keywords for discoverability. |
+| 14 | File size guard / row limit warning | Security | 2 | 2 | 1 | 4.0 | Warn at 500K rows, refuse at 2M unless `--force`. Prevents OOM. |
+| 15 | Remove `_load_sheets` from public `__all__` | Architecture | 1 | 2 | 1 | 3.0 | Private helper shouldn't be in the public API surface. |
+
+### Recommended Sequence
+
+**Sprint 5: Bug Fixes & Polish (half day)**
+Moves #1-6, #9, #11, #14, #15. All effort-1 items. Brings the project to "no rough edges" state.
+- Fix CLI counting bug
+- Fix `_raw` type safety
+- Extract issue-counting helper
+- Document `--schema`/`--baseline` in README
+- Add `--version` and `--quiet`
+- Align Python version requirement
+- Warn on skipped fuzzy matching
+- Add CHANGELOG
+- File size guard
+- Remove `_load_sheets` from `__all__`
+
+**Sprint 6: Engineering Rigor (1 day)**
+Moves #7, #8. Type checking + test migration.
+- Add mypy/pyright to CI
+- Migrate test imports to package
+
+**Sprint 7: Reach (1-2 days)**
+Moves #12, #13. Get the tool in front of users.
+- Publish to PyPI
+- README refresh with "data linter" positioning
+
+**Sprint 8: Scale (2-3 days)**
+Move #10. Requires algorithmic work.
+- Scale fuzzy matching with LSH or blocking
+
+### What NOT to Do (2026 Update)
+
+Previous "don't do" items that were done anyway and **worked out:**
+- ~~Don't add schema validation~~ → Added (PR #9). Lightweight, optional, complements rather than competes with GX/pandera. **Correct call to add it.**
+
+Updated guidance:
+- **Don't add statistical profiling.** fg-data-profiling still owns this despite the rebrand. Your strength is consulting-specific findings.
+- **Don't build a web app.** The interactive HTML file is self-contained, shareable, and zero-deployment. A server-side app is a different product for a different audience.
+- **Don't chase pipeline integration.** The market moved further toward warehouse-native observability (DQOps, Soda, GX Cloud). That's their game. Yours is file-native audit reports.
+- **Don't add LLM-powered features yet.** The deterministic fix suggestions already work well. LLM adds latency, API key requirements, and cost for marginal improvement. Revisit when local models are fast enough to run offline.
+- **Don't over-engineer the fuzzy cap.** The 500-row Levenshtein cap is a reasonable default for spreadsheet-sized data. Add a warning, not a complex distributed algorithm. Only invest in scaling if real users hit the limit.
+- **Don't compete on star count or downloads.** The niche is small but uncontested. One glowing testimonial from a consultant who used it on a real engagement is worth more than 1K GitHub stars from drive-by visitors.
+
+### Strategic Summary
+
+The project has successfully executed its transformation from "Claude Chat artifact" to "genuinely differentiated tool." The 2025 audit's thesis — that the detection was the moat but needed a stage — has been validated. The stage is now built. The next phase is about **credibility and reach**: fixing the remaining rough edges, publishing to PyPI, and positioning the tool where its target audience (consultants, analysts, data teams inheriting messy spreadsheets) can find it.
+
+The competitive landscape has moved *away* from this project's niche (toward warehouse observability and data contracts), which is strategically favorable — it means less competition, not more. The window for "file-native, consultant-focused, severity-rated audit reports" remains wide open with no credible competitor in 2026.
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..da7a389
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,58 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
+
+## [Unreleased]
+
+### Fixed
+- CLI issue count now includes fuzzy duplicates and schema violations
+- `AuditResult._raw` is a proper dataclass field (type-checker visible)
+
+### Added
+- Custom rule engine: define detection rules in JSON (`--rules` flag)
+  - Conditions: `regex_match`, `not_regex_match`, `min_length`, `max_length`, `allowed_values`, `disallowed_values`, `max_missing_pct`
+  - Target columns by regex pattern or explicit list
+  - Findings integrated into all 3 report formats
+- Column-level profiling: cardinality, uniqueness %, avg length, numeric range
+  - Stats shown in HTML, Excel, PDF, and JSON output
+  - `ColumnProfile` dataclass in typed API
+- Multi-file / directory mode: `--input ./data/` audits all supported files
+  - `run_multi_audit()` API for programmatic multi-file audits
+- CI / pipeline integration
+  - `--fail-under` flag: exit code 1 if score < threshold
+  - `--sarif` flag: SARIF 2.1.0 output for GitHub Code Scanning
+  - GitHub Action (`.github/actions/audit/action.yml`)
+- `--version` / `-V` flag
+- `--quiet` / `-q` flag to suppress terminal output
+- `--force` flag to override the 2M row safety limit
+- `count_issues()` shared helper for consistent issue counting
+- Warning when fuzzy (Levenshtein) matching is skipped due to row count
+- File size guard: warns at 500K rows, refuses at 2M without `--force`
+
+### Changed
+- Minimum Python version raised from 3.8 to 3.9
+
+## [1.0.0] - 2026-05-09
+
+### Added
+- Schema validation via `--schema` flag with JSON schema files
+- `--generate-schema` to infer and export a schema from audit results
+- `--baseline` / `-b` for trend comparison against previous audits
+- Trend deltas shown in CLI output and reports
+- `--threshold` / `-t` flag for fuzzy duplicate similarity tuning
+- Typed Python API (`audit_file()`, dataclass results, `py.typed`)
+- Fuzzy duplicate detection (fingerprint clustering + Levenshtein)
+- Health score algorithm (0–100, penalty-based)
+- Interactive HTML report with collapsible sections
+- Fix suggestion engine with copyable code snippets
+- Vectorized detection for 3.4x speedup on large files
+- CSV/TSV support alongside Excel
+- PDF report output (reportlab)
+- Excel findings export (sortable/filterable)
+- Test suite (171 tests) and CI pipeline
+- MIT license
+
+[Unreleased]: https://github.com/MsShawnP/Data-Hygiene-Auditor/compare/v1.0.0...HEAD
+[1.0.0]: https://github.com/MsShawnP/Data-Hygiene-Auditor/releases/tag/v1.0.0
diff --git a/PLAN.md b/PLAN.md
index 96e04cd..2033d0c 100644
--- a/PLAN.md
+++ b/PLAN.md
@@ -1,8 +1,9 @@
 # Data Hygiene Auditor — Improvement Plan
 
-**Source:** Full project audit (2025-05-15)
+**Source:** Full project audit (2025-05-15), re-audited 2026-05-16
 **Tier:** Medium
-**Status:** Complete — all sprints + stretch goal shipped (PRs #1-#6, 2025-05-15)
+**Status:** Sprints 1-4 + stretch complete. Sprint 5 (polish) in progress.
+**Current focus:** Sprint 5 — Bug fixes, polish, and DevEx improvements
 
 ---
 
@@ -362,3 +363,234 @@ Generate actionable fix scripts or transformation suggestions for each finding.
 
 #### Context
 Phase 3 Category Trends: AI-powered fix suggestions are emerging but nobody does them well. This is the leapfrog opportunity — but only after foundation and presentation are solid.
+
+---
+
+## Sprint 5: Polish & DevEx
+
+**Source:** Audit Round 2 (2026-05-16)
+**Priority:** Next
+**Estimated effort:** Half day
+
+### Decomposition: Sprint 5
+
+Goal: Fix remaining rough edges so the project has zero known bugs and professional-grade CLI/packaging.
+
+All items are independent unless noted — can be done in any order.
+
+---
+
+#### A: Fix issue-counting (bug + dedup)
+
+- [ ] A1: Extract shared issue-counting helper into `core.py`
+    - Depends on: none
+    - Done when: a function `count_issues(results) -> dict` exists in `core.py` that returns `{'total': N, 'High': N, 'Medium': N, 'Low': N}` counting all issue sources (field issues, phantom dupes, fuzzy dupes, schema violations); unit test passes
+- [ ] A2: Fix CLI counting bug — add fuzzy duplicates to total
+    - Depends on: A1
+    - Done when: `cli.py` uses the shared helper; running `data-hygiene-audit` on the sample file reports the same total as the HTML report
+- [ ] A3: Migrate html.py and excel.py to use the shared helper
+    - Depends on: A1
+    - Done when: `html.py` and `excel.py` import and use `count_issues()`; all tests pass; HTML report totals unchanged
+
+#### B: Fix `_raw` type safety
+
+- [ ] B1: Make `_raw` a proper field on `AuditResult`
+    - Depends on: none
+    - Done when: `AuditResult` has `_raw: Dict[str, Any] = field(repr=False, default_factory=dict)` (or `init=False`); `audit_file()` sets it normally; `mypy --strict data_hygiene_auditor/api.py` produces no `_raw` errors; all tests pass
+
+#### C: Public API cleanup
+
+- [ ] C1: Remove `_load_sheets` from `__all__` in `__init__.py`
+    - Depends on: none
+    - Done when: `_load_sheets` is not in `__all__`; `from data_hygiene_auditor import _load_sheets` still works (it's not deleted, just not advertised); tests pass
+
+#### D: CLI improvements
+
+- [ ] D1: Add `--version` flag
+    - Depends on: none
+    - Done when: `data-hygiene-audit --version` prints `data-hygiene-auditor 1.0.0`; test or manual verification passes
+- [ ] D2: Add `--quiet` flag
+    - Depends on: none
+    - Done when: `data-hygiene-audit --input ... --output ... --quiet` produces no stdout (only writes files); exit code 0 on success; test confirms no output
+
+#### E: Detection warnings and guards
+
+- [ ] E1: Warn when fuzzy matching is skipped (>500 rows)
+    - Depends on: none
+    - Done when: running on a file with >500 rows prints a warning like "Note: Fuzzy matching skipped for sheet X (501 rows > 500 limit)"; warning included in JSON output as metadata; test confirms warning appears
+- [ ] E2: Add file size / row count guard
+    - Depends on: none
+    - Done when: files >500K rows print a warning "Large file: N rows. Processing may be slow."; files >2M rows exit with error unless `--force` is passed; test confirms both behaviors
+
+#### F: DevEx alignment
+
+- [ ] F1: Align Python version requirement
+    - Depends on: none
+    - Done when: `requires-python` in pyproject.toml set to `>=3.9`; CI matrix remains 3.9/3.12/3.13; README updated if it mentions 3.8
+- [ ] F2: Add CHANGELOG.md
+    - Depends on: none
+    - Done when: `CHANGELOG.md` exists with entries for v1.0.0 (initial feature set) and unreleased section for current work; follows Keep a Changelog format
+
+#### G: Documentation
+
+- [ ] G1: Document `--schema`, `--baseline`, `--generate-schema` in README options table
+    - Depends on: none
+    - Done when: README options table includes all 7 flags (--input, --output, --json, --threshold, --schema, --baseline, --generate-schema) with descriptions
+
+---
+
+### Sprint 5 complete when:
+
+- [x] All sub-tasks checked off
+- [x] `pytest` passes (171 tests)
+- [x] `ruff check .` passes
+- [x] `data-hygiene-audit --version` works
+- [x] `data-hygiene-audit --input samples/input/sample_messy_data.xlsx --output samples/output/ --quiet` produces files with no stdout
+- [x] CLI issue count matches HTML report issue count on sample data
+
+---
+
+## Sprint 6: Custom Rule Engine
+
+**Source:** Audit Round 2 — ranked #1 next move
+**Priority:** Next
+**Estimated effort:** 1–2 days
+
+### Decomposition: Sprint 6
+
+Goal: Let users define detection rules in JSON that run alongside built-in checks, with findings integrated into all report outputs.
+
+---
+
+#### A: Rule file format and loader
+
+- [ ] A1: Define rule JSON schema and implement loader
+    - Depends on: none
+    - Done when: `data_hygiene_auditor/rules.py` exists with `load_rules(path) -> list[Rule]` that parses a JSON file into typed Rule objects (dataclass with fields: name, description, severity, column_pattern, condition, threshold); loader rejects invalid rules with clear error messages; unit tests for valid/invalid inputs pass
+
+- [ ] A2: Implement rule condition evaluator
+    - Depends on: A1
+    - Done when: a function `evaluate_rule(rule, series) -> list[dict]` applies a single rule to a pandas Series and returns findings; supports conditions: `regex_match`, `not_regex_match`, `min_length`, `max_length`, `allowed_values`, `disallowed_values`, `max_missing_pct`; unit tests cover each condition type
+
+#### B: Integration with audit pipeline
+
+- [ ] B1: Wire rules into `run_audit()` and results structure
+    - Depends on: A2
+    - Done when: `run_audit(..., rules_path=...)` loads rules and evaluates them per-column; findings appear in `sheet_results['fields'][col]['issues']` with `type: 'custom_rule'`; `count_issues()` counts them; health score penalizes them; existing tests still pass
+
+- [ ] B2: Add `--rules` CLI flag
+    - Depends on: B1
+    - Done when: `data-hygiene-audit --input data.xlsx --output ./reports --rules rules.json` applies custom rules; findings show in all 3 reports; `--rules` documented in `--help` output
+
+#### C: Reporting integration
+
+- [ ] C1: Display custom rule findings in HTML/Excel/PDF reports
+    - Depends on: B1
+    - Done when: custom rule findings render with rule name as heading, description as "why it matters", and severity badge; visually indistinguishable from built-in findings; verified on sample data with 2+ custom rules
+
+#### D: Documentation and sample
+
+- [ ] D1: Create sample rules file and document in README
+    - Depends on: B2, C1
+    - Done when: `samples/rules_example.json` demonstrates 3–4 rules (regex, allowed values, length, missing pct); README has "Custom Rules" section explaining format, conditions, and usage; CHANGELOG updated
+
+---
+
+### Sprint 6 complete when:
+
+- [x] All sub-tasks checked off
+- [x] `pytest` passes with new rule engine tests
+- [x] `ruff check .` passes
+- [x] Sample rules file works: `data-hygiene-audit --input samples/input/sample_messy_data.xlsx --output ./reports --rules samples/rules_example.json`
+- [x] Custom rule findings appear in HTML, Excel, and PDF reports
+- [x] Invalid rules file produces clear error message
+
+---
+
+## Sprint 7: Profiling, Multi-file, and CI Integration
+
+**Source:** Audit Round 2 — ranked #2, #3, #4 next moves
+**Priority:** Next
+**Estimated effort:** 2–3 days
+
+Three independent tracks that can be done in any order.
+
+### Decomposition: Sprint 7
+
+---
+
+#### Track A: Column-level profiling
+
+Goal: Add statistical profiling (cardinality, uniqueness, min/max/mean) to audit results and reports.
+
+- [ ] A1: Compute column statistics in core audit
+    - Depends on: none
+    - Done when: `sheet_results['fields'][col]` gains a `'profile'` dict with keys: `cardinality` (distinct count), `uniqueness_pct`, `min_length`, `max_length`, `avg_length`; for numeric columns also: `min_value`, `max_value`, `mean_value`, `median_value`; unit tests verify stats on known data
+
+- [ ] A2: Render profile stats in HTML report
+    - Depends on: A1
+    - Done when: each field section in HTML shows a compact stats row (e.g. "123 distinct | 82% unique | avg length 14"); numeric fields show min/max/mean; visually compact, doesn't overwhelm the issue findings
+
+- [ ] A3: Include profile stats in Excel and PDF reports
+    - Depends on: A1
+    - Done when: Excel findings sheet has profile columns (cardinality, uniqueness); PDF shows stats per field; JSON output includes profile data
+
+- [ ] A4: Expose profiling in typed API
+    - Depends on: A1
+    - Done when: `FieldResult` dataclass gains a `profile: ColumnProfile` field; `ColumnProfile` dataclass has all stat fields; accessible via `result.sheets[0].fields[0].profile.cardinality`
+
+---
+
+#### Track B: Multi-file / directory mode
+
+Goal: Accept a directory path or glob and produce a combined report across all matched files.
+
+- [ ] B1: Add directory/glob input resolution
+    - Depends on: none
+    - Done when: `--input ./data/` scans for supported files recursively; `--input "data/*.csv"` expands globs; error if no files found; file list printed before audit starts
+
+- [ ] B2: Run audit across multiple files and merge results
+    - Depends on: B1
+    - Done when: `run_audit()` accepts a list of paths (or new `run_multi_audit()`); results dict gains a `'files'` key mapping filename to per-file results; `overall_score` is the weighted average across all files
+
+- [ ] B3: Multi-file reporting
+    - Depends on: B2
+    - Done when: HTML report has a file-level summary table (filename, row count, health score, issue count) with links to per-file detail sections; Excel has one sheet per file; PDF has file-level table of contents
+
+- [ ] B4: Add `--recursive` flag and document
+    - Depends on: B3
+    - Done when: `--recursive` / `-R` controls directory traversal depth (default: recursive); README documents multi-file usage with examples; CHANGELOG updated
+
+---
+
+#### Track C: CI / pipeline integration
+
+Goal: Provide a GitHub Action and exit codes so audits can gate CI pipelines.
+
+- [ ] C1: Add structured exit codes
+    - Depends on: none
+    - Done when: CLI exits 0 if score >= threshold, exits 1 if score < threshold; new `--fail-under` flag sets the threshold (default: 0, never fails); `--fail-under 70` exits 1 if health score < 70; unit test verifies exit codes
+
+- [ ] C2: Create GitHub Action definition
+    - Depends on: C1
+    - Done when: `.github/actions/audit/action.yml` defines a composite action with inputs (file, rules, fail-under, threshold); uses `pip install .` + runs the CLI; outputs health score and issue count as step outputs; README documents usage in a workflow
+
+- [ ] C3: Add SARIF output for GitHub Code Scanning
+    - Depends on: C1
+    - Done when: `--sarif` flag outputs findings in SARIF format compatible with `github/codeql-action/upload-sarif`; findings appear as code scanning alerts tied to the input file; test validates SARIF schema compliance
+
+- [ ] C4: Document CI usage in README
+    - Depends on: C2, C3
+    - Done when: README has "CI / Pipeline Integration" section with GitHub Actions example workflow YAML showing: audit on push, fail-under threshold, SARIF upload; CHANGELOG updated
+
+---
+
+### Sprint 7 complete when:
+
+- [ ] All sub-tasks checked off
+- [ ] `pytest` passes with new profiling and multi-file tests
+- [ ] `ruff check .` passes
+- [ ] `data-hygiene-audit --input samples/input/ --output ./reports` audits all files in directory
+- [ ] HTML report shows column stats (cardinality, uniqueness)
+- [ ] `--fail-under 70` exits non-zero on low-scoring data
+- [ ] GitHub Action YAML is valid and documented
diff --git a/README.md b/README.md
index 10ba2a7..3810d4d 100644
--- a/README.md
+++ b/README.md
@@ -100,10 +100,19 @@ Supports `.xlsx`, `.xls`, `.csv`, and `.tsv` files.
 
 | Flag | Description |
 |------|-------------|
-| `--input`, `-i` | Path to the file to audit — `.xlsx`, `.csv`, or `.tsv` (required) |
+| `--input`, `-i` | Path to file, directory, or glob pattern (required) |
 | `--output`, `-o` | Directory for generated reports (required) |
 | `--json` | Also output the raw findings as structured JSON |
-| `--threshold`, `-t` | Fuzzy duplicate similarity threshold, 0.0-1.0 (default: 0.85) |
+| `--threshold`, `-t` | Fuzzy duplicate similarity threshold, 0.0–1.0 (default: 0.85) |
+| `--schema`, `-s` | Path to a schema JSON for type/completeness validation |
+| `--generate-schema` | Infer types from the data and save a schema JSON to the given path |
+| `--baseline`, `-b` | Path to a previous audit JSON for trend comparison (shows deltas) |
+| `--rules`, `-r` | Path to custom rules JSON for additional checks |
+| `--sarif` | Output findings in SARIF format (for GitHub Code Scanning) |
+| `--fail-under` | Exit with code 1 if health score is below this threshold (0–100) |
+| `--quiet`, `-q` | Suppress all terminal output (just write report files) |
+| `--force` | Process files exceeding the 2M row safety limit |
+| `--version`, `-V` | Print version and exit |
 
 ### Example
 
@@ -115,16 +124,16 @@ python audit.py --input samples/input/sample_messy_data.xlsx --output ./reports
   Data Hygiene Auditor
   Auditing: samples/input/sample_messy_data.xlsx
 
-  [1/2] Analyzed sheet: Customers
-  [2/2] Analyzed sheet: Orders
+  [1/2] Analyzed sheet: Customers  (score: 42)
+  [2/2] Analyzed sheet: Orders  (score: 68)
 
   Generating reports...
     HTML  -> ./reports/sample_messy_data_audit_report.html
     Excel -> ./reports/sample_messy_data_audit_findings.xlsx
     PDF   -> ./reports/sample_messy_data_audit_report.pdf
 
-  Audit complete: 59 issues found
-    High: 23 | Medium: 20 | Low: 16
+  Health Score: 55/100
+  59 issues found  —  High: 23 | Medium: 20 | Low: 16
 ```
 
 ## Use as a Library
@@ -169,6 +178,103 @@ loose = audit_file("data.xlsx", fuzzy_threshold=0.70)
 
 Works in Jupyter notebooks — call `audit_file()` in a cell and explore the typed results interactively.
 
+## Custom Rules
+
+Define your own detection rules in a JSON file to enforce project-specific data standards alongside the built-in checks.
+
+```
+data-hygiene-audit --input data.xlsx --output ./reports --rules my_rules.json
+```
+
+### Rule file format
+
+```json
+{
+  "rules": [
+    {
+      "name": "Phone format (US)",
+      "description": "Phone numbers should match (XXX) XXX-XXXX format",
+      "severity": "High",
+      "condition": "regex_match",
+      "threshold": "^\\(\\d{3}\\) \\d{3}-\\d{4}$",
+      "column_pattern": "phone|tel"
+    }
+  ]
+}
+```
+
+Each rule requires: `name`, `description`, `severity` (High/Medium/Low), `condition`, and `threshold`.
+
+### Targeting columns
+
+- `"column_pattern": "phone|tel"` — regex matched against column names (case-insensitive)
+- `"columns": ["Status", "Type"]` — explicit list of column names
+- Omit both to apply the rule to all columns
+
+### Available conditions
+
+| Condition | Threshold | Fires when |
+|-----------|-----------|------------|
+| `regex_match` | Regex string | Values don't match the pattern |
+| `not_regex_match` | Regex string | Values match the disallowed pattern |
+| `min_length` | Number | Values are shorter than threshold |
+| `max_length` | Number | Values are longer than threshold |
+| `allowed_values` | Array of strings | Values not in the allowed set (case-insensitive) |
+| `disallowed_values` | Array of strings | Values found in the disallowed set (case-insensitive) |
+| `max_missing_pct` | Number (0-100) | Missing percentage exceeds threshold |
+
+See [`samples/rules_example.json`](samples/rules_example.json) for a working example with 4 rules.
+
+## Multi-file Mode
+
+Pass a directory or glob pattern to audit multiple files at once:
+
+```
+data-hygiene-audit --input ./data/ --output ./reports
+data-hygiene-audit --input "exports/*.csv" --output ./reports
+```
+
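+Each file gets its own set of reports, named after the file's stem — files that share a stem (for example `a/data.csv` and `b/data.xlsx`) write to the same report names, so keep stems unique within one run. The CLI shows a combined health score across all files, computed as the unweighted average of the per-file scores.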
+
+## CI / Pipeline Integration
+
+Use `--fail-under` to gate CI pipelines on data quality:
+
+```
+data-hygiene-audit --input data.xlsx --output ./reports --fail-under 70
+```
+
+Exits with code 1 if the health score drops below the threshold.
+
+### GitHub Actions
+
+```yaml
+- uses: actions/checkout@v4
+- uses: actions/setup-python@v5
+  with:
+    python-version: '3.12'
+- uses: ./.github/actions/audit
+  with:
+    file: data/customers.xlsx
+    fail-under: '70'
+    rules: rules.json
+```
+
+### SARIF for Code Scanning
+
+```yaml
+- name: Run audit with SARIF
+  run: |
+    pip install .
+    data-hygiene-audit --input data/ --output ./reports --sarif audit.sarif
+
+- uses: github/codeql-action/upload-sarif@v3
+  with:
+    sarif_file: audit.sarif
+```
+
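+Findings appear as code scanning alerts in the GitHub Security tab. The workflow job must be granted the `security-events: write` permission for the SARIF upload to succeed.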
+
 ## Regenerating the Sample Data
 
 `generate_sample.py` recreates the deliberately-messy demo workbook at `samples/input/sample_messy_data.xlsx`. Run it if you want to modify the demo data or verify that generation is reproducible. The committed outputs in [samples/output/](samples/output/) can then be regenerated with the command shown in [See It In Action](#see-it-in-action).
@@ -179,7 +285,7 @@ python generate_sample.py
 
 ## Requirements
 
-- Python 3.8+
+- Python 3.9+
 - pandas
 - openpyxl
 - reportlab
diff --git a/data_hygiene_auditor/__init__.py b/data_hygiene_auditor/__init__.py
index e400a3a..4ad79ad 100644
--- a/data_hygiene_auditor/__init__.py
+++ b/data_hygiene_auditor/__init__.py
@@ -2,6 +2,7 @@
 
 from .api import (
     AuditResult,
+    ColumnProfile,
     Duplicate,
     FieldResult,
     Finding,
@@ -12,7 +13,14 @@
     TrendData,
     audit_file,
 )
-from .core import SUPPORTED_EXTENSIONS, WHY_IT_MATTERS, _load_sheets, run_audit
+from .core import (  # noqa: F401
+    SUPPORTED_EXTENSIONS,
+    WHY_IT_MATTERS,
+    _load_sheets,
+    count_issues,
+    run_audit,
+    run_multi_audit,
+)
 from .detection import (
     analyze_fuzzy_duplicates,
     analyze_mixed_formats,
@@ -24,6 +32,7 @@
     rate_severity,
 )
 from .reporting import generate_excel, generate_html, generate_pdf
+from .rules import Rule, evaluate_rule, load_rules
 from .schema import generate_schema, load_schema, validate_schema
 from .trend import compute_trend, load_baseline
 
@@ -34,12 +43,14 @@
     'FixSuggestion',
     'Duplicate',
     'FuzzyDuplicate',
+    'ColumnProfile',
     'FieldResult',
     'SchemaViolation',
     'SheetResult',
     'TrendData',
     'run_audit',
-    '_load_sheets',
+    'run_multi_audit',
+    'count_issues',
     'SUPPORTED_EXTENSIONS',
     'WHY_IT_MATTERS',
     'infer_field_type',
@@ -58,4 +69,7 @@
     'validate_schema',
     'load_baseline',
     'compute_trend',
+    'load_rules',
+    'evaluate_rule',
+    'Rule',
 ]
diff --git a/data_hygiene_auditor/api.py b/data_hygiene_auditor/api.py
index 84d71f8..c4d1ca0 100644
--- a/data_hygiene_auditor/api.py
+++ b/data_hygiene_auditor/api.py
@@ -86,6 +86,23 @@ class FuzzyDuplicate:
     fix: Optional[FixSuggestion] = None
 
 
+@dataclass
+class ColumnProfile:
+    """Statistical profile for a column (figures as produced by core._compute_profile)."""
+
+    cardinality: int  # number of distinct non-empty values
+    uniqueness_pct: float  # cardinality as a percentage of non-empty values
+    total_values: int  # length of the column, including missing/blank entries
+    non_empty_values: int  # values remaining after dropping nulls and blank strings
+    min_length: int  # shortest stripped string length; 0 when nothing is populated
+    max_length: int  # longest stripped string length; 0 when nothing is populated
+    avg_length: float  # mean stripped string length, rounded to 1 decimal
+    min_value: Optional[float] = None  # numeric minimum; set only for currency/id columns
+    max_value: Optional[float] = None  # numeric maximum; set only for currency/id columns
+    mean_value: Optional[float] = None  # numeric mean; set only for currency/id columns
+    median_value: Optional[float] = None  # numeric median; set only for currency/id columns
+
+
 @dataclass
 class FieldResult:
     """Audit results for a single field/column."""
@@ -99,6 +116,7 @@ class FieldResult:
     missing_pct: float
     total_rows: int
     findings: List[Finding] = field(default_factory=list)
+    profile: Optional[ColumnProfile] = None
 
 
 @dataclass
@@ -141,6 +159,7 @@ class AuditResult:
     overall_score: int
     sheets: List[SheetResult] = field(default_factory=list)
     trend: Optional[TrendData] = None
+    _raw: Dict[str, Any] = field(default_factory=dict, repr=False)
 
     @property
     def total_issues(self) -> int:
@@ -312,6 +331,21 @@ def audit_file(
                     detail=issue['detail'],
                     fix=fix_obj,
                 ))
+            profile_raw = field_data.get('profile', {})
+            profile_obj = ColumnProfile(
+                cardinality=profile_raw.get('cardinality', 0),
+                uniqueness_pct=profile_raw.get('uniqueness_pct', 0.0),
+                total_values=profile_raw.get('total_values', 0),
+                non_empty_values=profile_raw.get('non_empty_values', 0),
+                min_length=profile_raw.get('min_length', 0),
+                max_length=profile_raw.get('max_length', 0),
+                avg_length=profile_raw.get('avg_length', 0.0),
+                min_value=profile_raw.get('min_value'),
+                max_value=profile_raw.get('max_value'),
+                mean_value=profile_raw.get('mean_value'),
+                median_value=profile_raw.get('median_value'),
+            ) if profile_raw else None
+
             fields.append(FieldResult(
                 name=col_name,
                 inferred_type=field_data['inferred_type'],
@@ -322,6 +356,7 @@ def audit_file(
                 missing_pct=null['missing_pct'],
                 total_rows=null['total_rows'],
                 findings=findings,
+                profile=profile_obj,
             ))
 
         duplicates = []
@@ -401,12 +436,11 @@ def audit_file(
             sheets=raw_trend.get('sheets', {}),
         )
 
-    result = AuditResult(
+    return AuditResult(
         input_file=raw['input_file'],
         audit_timestamp=raw['audit_timestamp'],
         overall_score=raw['overall_score'],
         sheets=sheets,
         trend=trend_obj,
+        _raw=raw,
     )
-    result._raw = raw
-    return result
diff --git a/data_hygiene_auditor/cli.py b/data_hygiene_auditor/cli.py
index f518e2a..00bdfe1 100644
--- a/data_hygiene_auditor/cli.py
+++ b/data_hygiene_auditor/cli.py
@@ -4,10 +4,9 @@
 import json
 import os
 import sys
-from collections import Counter
 from pathlib import Path
 
-from .core import SUPPORTED_EXTENSIONS, run_audit
+from .core import SUPPORTED_EXTENSIONS, count_issues, run_audit
 from .reporting import generate_excel, generate_html, generate_pdf
 
 
@@ -30,6 +29,116 @@ def _c(text, code):
     return f"\033[{code}m{text}\033[0m"
 
 
+def _get_version():
+    """Return the installed 'data-hygiene-auditor' version, or '1.0.0' if not installed."""
+    from importlib import metadata
+    try:
+        return metadata.version('data-hygiene-auditor')
+    except metadata.PackageNotFoundError:
+        return '1.0.0'
+
+
+def _resolve_inputs(input_arg):
+    """Resolve the --input argument to a sorted list of supported file paths.
+
+    Accepts a single file, a directory (searched recursively), or a glob
+    pattern. Extensions are compared case-insensitively in all three modes,
+    and only regular files are returned. Returns [] when nothing matches.
+    """
+    import glob as glob_mod
+
+    def _supported(p):
+        # Same test for every mode so 'DATA.CSV' is accepted consistently.
+        return p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS
+
+    path = Path(input_arg)
+    if path.is_file():
+        return [str(path)] if _supported(path) else []
+
+    if path.is_dir():
+        # rglob('*.csv') is case-sensitive on POSIX and would also return
+        # directories named like data files; walk everything and filter.
+        return sorted(str(p) for p in path.rglob('*') if _supported(p))
+
+    return sorted(
+        f for f in glob_mod.glob(input_arg, recursive=True)
+        if _supported(Path(f))
+    )
+
+
+_SEVERITY_TO_SARIF = {
+    'High': 'error',
+    'Medium': 'warning',
+    'Low': 'note',
+}
+
+
+def _generate_sarif(all_results, input_files):
+    """Build a SARIF 2.1.0 log (as a dict) for GitHub Code Scanning.
+
+    ``all_results`` and ``input_files`` are parallel sequences; each result
+    is reported against the path at the same index. Only per-field issues
+    are exported. Each distinct issue type (or ``custom/<rule_name>``)
+    becomes one SARIF rule, registered the first time it is encountered.
+    """
+    sarif_results = []
+    sarif_rules = []
+    seen_rule_ids = set()
+
+    for audit, file_path in zip(all_results, input_files):
+        for sheet_name, sheet_data in audit['sheets'].items():
+            for col_name, field_data in sheet_data['fields'].items():
+                for issue in field_data['issues']:
+                    rule_id = issue['type']
+                    if issue.get('rule_name'):
+                        # Namespace custom rules away from built-in types.
+                        rule_id = f"custom/{issue['rule_name']}"
+                    level = _SEVERITY_TO_SARIF.get(issue['severity'], 'note')
+
+                    if rule_id not in seen_rule_ids:
+                        seen_rule_ids.add(rule_id)
+                        short_text = issue.get('rule_name', issue['type'])
+                        sarif_rules.append({
+                            'id': rule_id,
+                            'shortDescription': {'text': short_text},
+                            'fullDescription': {'text': issue.get('why', '')},
+                            'defaultConfiguration': {'level': level},
+                        })
+
+                    detail = issue.get('detail', {})
+                    if isinstance(detail, dict):
+                        msg = detail.get('message', '')
+                    else:
+                        msg = str(detail)
+                    # Fall back to the issue type when there is no message.
+                    text = f"[{sheet_name}] {col_name}: {msg or issue['type']}"
+                    location = {
+                        'physicalLocation': {
+                            'artifactLocation': {
+                                # SARIF URIs use forward slashes.
+                                'uri': file_path.replace('\\', '/'),
+                            },
+                        },
+                    }
+                    sarif_results.append({
+                        'ruleId': rule_id,
+                        'level': level,
+                        'message': {'text': text},
+                        'locations': [location],
+                    })
+
+    driver = {
+        'name': 'data-hygiene-auditor',
+        'version': _get_version(),
+        'rules': sarif_rules,
+    }
+    return {
+        '$schema': 'https://raw.githubusercontent.com/oasis-tcs/sarif-spec/main/sarif-2.1/schema/sarif-schema-2.1.0.json',
+        'version': '2.1.0',
+        'runs': [{'tool': {'driver': driver}, 'results': sarif_results}],
+    }
+
+
 def main():
     parser = argparse.ArgumentParser(
         description=(
@@ -49,6 +158,10 @@ def main():
   - audit_report.pdf    (email-ready deliverable)
         """,
     )
+    parser.add_argument(
+        '--version', '-V', action='version',
+        version=f'%(prog)s {_get_version()}',
+    )
     parser.add_argument(
         '--input', '-i', required=True,
         help='Path to input file (.xlsx, .csv, .tsv)',
@@ -77,112 +190,164 @@ def main():
         '--baseline', '-b',
         help='Path to previous audit JSON for trend comparison',
     )
+    parser.add_argument(
+        '--rules', '-r',
+        help='Path to custom rules JSON for additional checks',
+    )
+    parser.add_argument(
+        '--sarif',
+        help='Output findings in SARIF format to the given path',
+    )
+    parser.add_argument(
+        '--quiet', '-q', action='store_true',
+        help='Suppress all terminal output (just write report files)',
+    )
+    parser.add_argument(
+        '--fail-under', type=int, default=0,
+        help='Exit with code 1 if health score is below this threshold (0-100)',
+    )
+    parser.add_argument(
+        '--force', action='store_true',
+        help='Process files exceeding the 2M row safety limit',
+    )
     args = parser.parse_args()
 
-    if not os.path.exists(args.input):
+    input_files = _resolve_inputs(args.input)
+    if not input_files:
         print(
-            f"Error: Input file not found: {args.input}",
-            file=sys.stderr,
-        )
-        sys.exit(1)
-
-    ext = Path(args.input).suffix.lower()
-    if ext not in SUPPORTED_EXTENSIONS:
-        supported = ', '.join(sorted(SUPPORTED_EXTENSIONS))
-        print(
-            f"Error: Unsupported file type '{ext}'."
-            f" Supported: {supported}",
+            f"Error: No supported files found for: {args.input}",
             file=sys.stderr,
         )
         sys.exit(1)
 
     os.makedirs(args.output, exist_ok=True)
 
-    basename = Path(args.input).stem
-    print(f"\n  {_c('Data Hygiene Auditor', '1')}")
-    print(f"  Auditing: {_c(args.input, '36')}\n")
+    def _log(msg=''):
+        if not args.quiet:
+            print(msg)
 
-    results = run_audit(
-        args.input,
-        fuzzy_threshold=args.threshold,
-        schema_path=args.schema,
-        baseline_path=args.baseline,
-    )
-    sheet_count = len(results['sheets'])
-    for i, (name, sdata) in enumerate(results['sheets'].items(), 1):
-        score = sdata['health_score']
-        score_color = '32' if score >= 90 else ('33' if score >= 70 else '31')
-        print(
-            f"  [{i}/{sheet_count}] Analyzed sheet: {_c(name, '36')}"
-            f"  (score: {_c(str(score), score_color)})"
+    from .core import _load_sheets
+    ROW_WARN = 500_000
+    ROW_LIMIT = 2_000_000
+
+    _log(f"\n  {_c('Data Hygiene Auditor', '1')}")
+    if len(input_files) > 1:
+        _log(f"  Auditing {_c(str(len(input_files)) + ' files', '36')}\n")
+    else:
+        _log(f"  Auditing: {_c(input_files[0], '36')}\n")
+
+    all_results = []
+    for input_path in input_files:
+        sheets_preview = _load_sheets(input_path)
+        total_rows = sum(len(df) for df in sheets_preview.values())
+        if total_rows > ROW_LIMIT and not args.force:
+            print(
+                f"Error: {input_path} has {total_rows:,} rows"
+                f" (limit: {ROW_LIMIT:,})."
+                f" Use --force to process anyway.",
+                file=sys.stderr,
+            )
+            sys.exit(1)
+        if total_rows > ROW_WARN:
+            _log(
+                f"  {_c('Warning:', '33')} Large file ({total_rows:,} rows)."
+                f" Processing may be slow."
+            )
+
+        results = run_audit(
+            input_path,
+            fuzzy_threshold=args.threshold,
+            schema_path=args.schema,
+            baseline_path=args.baseline,
+            rules_path=args.rules,
         )
+        all_results.append(results)
 
-    html_path = os.path.join(
-        args.output, f"{basename}_audit_report.html",
-    )
-    xlsx_path = os.path.join(
-        args.output, f"{basename}_audit_findings.xlsx",
-    )
-    pdf_path = os.path.join(
-        args.output, f"{basename}_audit_report.pdf",
-    )
+        sheet_count = len(results['sheets'])
+        file_label = (
+            f"  {_c(Path(input_path).name, '1')} " if len(input_files) > 1 else ""
+        )
+        for i, (name, sdata) in enumerate(results['sheets'].items(), 1):
+            score = sdata['health_score']
+            score_color = '32' if score >= 90 else ('33' if score >= 70 else '31')
+            _log(
+                f"  {file_label}[{i}/{sheet_count}]"
+                f" Analyzed sheet: {_c(name, '36')}"
+                f"  (score: {_c(str(score), score_color)})"
+            )
 
-    print("\n  Generating reports...")
+    for results in all_results:
+        basename = Path(results['input_file']).stem
+        html_path = os.path.join(
+            args.output, f"{basename}_audit_report.html",
+        )
+        xlsx_path = os.path.join(
+            args.output, f"{basename}_audit_findings.xlsx",
+        )
+        pdf_path = os.path.join(
+            args.output, f"{basename}_audit_report.pdf",
+        )
 
-    generate_html(results, html_path)
-    print(f"    {_c('HTML', '32')}  -> {html_path}")
+        if len(all_results) > 1:
+            _log(f"\n  Reports for {_c(basename, '36')}:")
+        else:
+            _log("\n  Generating reports...")
 
-    generate_excel(results, xlsx_path)
-    print(f"    {_c('Excel', '32')} -> {xlsx_path}")
+        generate_html(results, html_path)
+        _log(f"    {_c('HTML', '32')}  -> {html_path}")
 
-    generate_pdf(results, pdf_path)
-    print(f"    {_c('PDF', '32')}   -> {pdf_path}")
+        generate_excel(results, xlsx_path)
+        _log(f"    {_c('Excel', '32')} -> {xlsx_path}")
 
-    if args.json:
-        json_path = os.path.join(
-            args.output, f"{basename}_audit_results.json",
-        )
-        with open(json_path, 'w') as f:
-            json.dump(results, f, indent=2, default=str)
-        print(f"    {_c('JSON', '32')}  -> {json_path}")
+        generate_pdf(results, pdf_path)
+        _log(f"    {_c('PDF', '32')}   -> {pdf_path}")
+
+        if args.json:
+            json_path = os.path.join(
+                args.output, f"{basename}_audit_results.json",
+            )
+            with open(json_path, 'w') as f:
+                json.dump(results, f, indent=2, default=str)
+            _log(f"    {_c('JSON', '32')}  -> {json_path}")
 
-    if args.generate_schema:
+    if args.generate_schema and all_results:
         from .schema import generate_schema
-        schema_data = generate_schema(results)
+        schema_data = generate_schema(all_results[0])
         with open(args.generate_schema, 'w') as f:
             json.dump(schema_data, f, indent=2)
-        print(f"    {_c('Schema', '32')} -> {args.generate_schema}")
-
-    total_issues = 0
-    severity_totals = Counter()
-    schema_count = 0
-    for sheet in results['sheets'].values():
-        for field in sheet['fields'].values():
-            for issue in field['issues']:
-                total_issues += 1
-                severity_totals[issue['severity']] += 1
-        for d in sheet['phantom_duplicates']:
-            total_issues += 1
-            severity_totals[d['severity']] += 1
-        for sv in sheet.get('schema_violations', []):
-            total_issues += 1
-            severity_totals[sv['severity']] += 1
-            schema_count += 1
-
-    high = severity_totals.get('High', 0)
-    med = severity_totals.get('Medium', 0)
-    low = severity_totals.get('Low', 0)
-
-    overall = results['overall_score']
+        _log(f"    {_c('Schema', '32')} -> {args.generate_schema}")
+
+    if args.sarif:
+        sarif_data = _generate_sarif(all_results, input_files)
+        with open(args.sarif, 'w') as f:
+            json.dump(sarif_data, f, indent=2)
+        _log(f"    {_c('SARIF', '32')}  -> {args.sarif}")
+
+    total_counts = {'total': 0, 'High': 0, 'Medium': 0, 'Low': 0, 'schema': 0}
+    scores = []
+    for results in all_results:
+        counts = count_issues(results)
+        for k in ('total', 'High', 'Medium', 'Low', 'schema'):
+            total_counts[k] += counts.get(k, 0)
+        scores.append(results['overall_score'])
+
+    total_issues = total_counts['total']
+    high = total_counts['High']
+    med = total_counts['Medium']
+    low = total_counts['Low']
+    schema_count = total_counts['schema']
+
+    overall = round(sum(scores) / len(scores)) if scores else 100
     score_color = '32' if overall >= 90 else ('33' if overall >= 70 else '31')
 
     score_str = f"{overall}/100"
-    trend = results.get('trend')
-    if trend:
-        delta = trend['overall_score_delta']
-        arrow = _c(f'+{delta}', '32') if delta > 0 else _c(f'{delta}', '31') if delta < 0 else '='
-        score_str += f" ({arrow} from baseline)"
-    print(
+    if len(all_results) == 1:
+        trend = all_results[0].get('trend')
+        if trend:
+            delta = trend['overall_score_delta']
+            arrow = _c(f'+{delta}', '32') if delta > 0 else _c(f'{delta}', '31') if delta < 0 else '='
+            score_str += f" ({arrow} from baseline)"
+    _log(
         f"\n  Health Score: {_c(score_str, score_color)}"
     )
     issue_line = (
@@ -191,12 +356,26 @@ def main():
         f" | {_c(f'Medium: {med}', '33')}"
         f" | {_c(f'Low: {low}', '32')}"
     )
-    if trend:
-        td = trend['total_issues_delta']
-        if td != 0:
-            sign = '+' if td > 0 else ''
-            issue_line += f"  ({sign}{td} from baseline)"
-    print(issue_line)
+    if len(all_results) == 1:
+        trend = all_results[0].get('trend')
+        if trend:
+            td = trend['total_issues_delta']
+            if td != 0:
+                sign = '+' if td > 0 else ''
+                issue_line += f"  ({sign}{td} from baseline)"
+    _log(issue_line)
     if schema_count:
-        print(f"  Schema violations: {_c(str(schema_count), '31')}")
-    print()
+        _log(f"  Schema violations: {_c(str(schema_count), '31')}")
+    for results in all_results:
+        for w in results.get('warnings', []):
+            _log(f"  {_c('Note:', '33')} {w['message']}")
+    if len(all_results) > 1:
+        _log(f"  Files audited: {len(all_results)}")
+    _log()
+
+    if args.fail_under and overall < args.fail_under:
+        _log(
+            f"  {_c('FAILED:', '31')} score {overall}"
+            f" is below threshold {args.fail_under}"
+        )
+        sys.exit(1)
diff --git a/data_hygiene_auditor/core.py b/data_hygiene_auditor/core.py
index 3f5019a..cdd60b1 100644
--- a/data_hygiene_auditor/core.py
+++ b/data_hygiene_auditor/core.py
@@ -77,6 +77,36 @@
 SUPPORTED_EXTENSIONS = {'.xlsx', '.xls', '.csv', '.tsv'}
 
 
+def count_issues(results):
+    """Count total and per-severity issues across all sheets.
+
+    Counts all issue sources: field issues, phantom duplicates,
+    fuzzy duplicates, and schema violations.
+
+    Returns a dict that always has the keys 'total', 'High', 'Medium',
+    'Low' and 'schema', each 0 when nothing of that kind was found.
+    """
+    from collections import Counter
+    # Pre-seed so the documented keys exist even for a completely clean file.
+    totals = Counter(total=0, High=0, Medium=0, Low=0, schema=0)
+    for sheet in results['sheets'].values():
+        field_issues = [
+            issue
+            for field_data in sheet['fields'].values()
+            for issue in field_data['issues']
+        ]
+        schema_violations = sheet.get('schema_violations', [])
+        totals['schema'] += len(schema_violations)
+        for source in (
+            field_issues, sheet['phantom_duplicates'],
+            sheet.get('fuzzy_duplicates', []), schema_violations,
+        ):
+            for item in source:
+                totals['total'] += 1
+                totals[item['severity']] += 1
+    return dict(totals)
+
+
 def _load_sheets(input_path):
     """Load tabular data as a dict of {sheet_name: DataFrame}."""
     ext = Path(input_path).suffix.lower()
@@ -92,13 +122,18 @@ def _load_sheets(input_path):
         }
 
 
-def run_audit(input_path, fuzzy_threshold=0.85, schema_path=None, baseline_path=None):
+def run_audit(input_path, fuzzy_threshold=0.85, schema_path=None, baseline_path=None, rules_path=None):
     """Run all checks against an Excel or CSV file. Returns structured audit results."""
     schema = None
     if schema_path:
         from .schema import load_schema
         schema = load_schema(schema_path)
 
+    rules = None
+    if rules_path:
+        from .rules import evaluate_rule, load_rules
+        rules = load_rules(rules_path)
+
     sheets = _load_sheets(input_path)
     results = {
         'input_file': os.path.basename(input_path),
@@ -192,6 +227,13 @@ def run_audit(input_path, fuzzy_threshold=0.85, schema_path=None, baseline_path=
                     issue['fix'] = fix
                 field_findings['issues'].append(issue)
 
+            if rules:
+                for rule in rules:
+                    finding = evaluate_rule(rule, df[col], col)
+                    if finding:
+                        field_findings['issues'].append(finding)
+
+            field_findings['profile'] = _compute_profile(df[col], field_type)
             sheet_results['fields'][col] = field_findings
 
         field_types = {
@@ -213,17 +255,32 @@ def run_audit(input_path, fuzzy_threshold=0.85, schema_path=None, baseline_path=
             frozenset(i - 2 for i in d['rows'])
             for d in dupes
         ]
-        fuzzy = analyze_fuzzy_duplicates(
+        fuzzy_raw = analyze_fuzzy_duplicates(
             df, sheet_name, field_types,
             threshold=fuzzy_threshold,
             phantom_row_sets=phantom_row_sets,
         )
-        for f in fuzzy:
+        fuzzy = []
+        for f in fuzzy_raw:
+            if f.get('type') == '_levenshtein_skipped':
+                results.setdefault('warnings', []).append({
+                    'type': 'levenshtein_skipped',
+                    'sheet': sheet_name,
+                    'unmatched_rows': f['unmatched_count'],
+                    'limit': f['limit'],
+                    'message': (
+                        f"Fuzzy (Levenshtein) matching skipped for sheet"
+                        f" '{sheet_name}': {f['unmatched_count']} unmatched"
+                        f" rows exceeds the {f['limit']}-row limit."
+                    ),
+                })
+                continue
             f['severity'] = rate_severity('fuzzy_duplicate', f)
             f['why'] = WHY_IT_MATTERS['fuzzy_duplicate']
             fix = generate_dup_fix('fuzzy_duplicate', f, sheet_name)
             if fix:
                 f['fix'] = fix
+            fuzzy.append(f)
         sheet_results['fuzzy_duplicates'] = fuzzy
 
         if schema:
@@ -247,6 +304,13 @@ def run_audit(input_path, fuzzy_threshold=0.85, schema_path=None, baseline_path=
     if schema:
         results['schema'] = {'source': schema_path, 'validated': True}
 
+    if rules:
+        results['rules'] = {
+            'source': rules_path,
+            'count': len(rules),
+            'names': [r.name for r in rules],
+        }
+
     if baseline_path:
         baseline = load_baseline(baseline_path)
         results['trend'] = compute_trend(results, baseline)
@@ -254,6 +318,45 @@ def run_audit(input_path, fuzzy_threshold=0.85, schema_path=None, baseline_path=
     return results
 
 
+def run_multi_audit(input_paths, fuzzy_threshold=0.85, schema_path=None, rules_path=None):
+    """Run audits across multiple files. Returns a combined results dict.
+
+    The returned dict has:
+    - 'files': mapping of filename -> per-file audit results (a basename
+      already taken by an earlier path is replaced by the full path)
+    - 'overall_score': weighted average by row count
+    - 'total_files': number of files audited
+    - 'total_rows': sum of rows across all files
+    """
+    file_results, row_counts = {}, {}
+    for path in input_paths:
+        results = run_audit(
+            path, fuzzy_threshold=fuzzy_threshold,
+            schema_path=schema_path, rules_path=rules_path,
+        )
+        key = os.path.basename(path)
+        if key in file_results:
+            # e.g. 2023/sales.csv and 2024/sales.csv: keep both, don't overwrite.
+            key = str(path)
+        file_results[key] = results
+        row_counts[key] = sum(s['row_count'] for s in results['sheets'].values())
+
+    total_rows = sum(row_counts.values())
+    if total_rows > 0:
+        weighted_score = sum(
+            r['overall_score'] * row_counts[k] for k, r in file_results.items()
+        ) / total_rows
+    else:
+        weighted_score = 100
+
+    return {
+        'files': file_results,
+        'overall_score': round(weighted_score),
+        'total_files': len(file_results),
+        'total_rows': total_rows,
+    }
+
+
 def _compute_health_score(sheet_data):
     """Compute a 0-100 health score for a sheet.
 
@@ -290,3 +393,46 @@ def _compute_health_score(sheet_data):
         score -= severity_penalty.get(sv['severity'], 1.0)
 
     return max(0, round(score))
+
+
+def _compute_profile(series, field_type):
+    """Summarise one column: distinct-value, length and numeric statistics.
+
+    Values are stringified and stripped; nulls and blank strings are
+    excluded from every figure except 'total_values'. Numeric min/max/
+    mean/median are added only for 'currency' and 'id' columns that
+    contain at least one parseable number.
+    """
+    as_text = series.dropna().astype(str).str.strip()
+    populated = as_text[as_text != '']
+    populated_count = len(populated)
+    has_values = populated_count > 0
+
+    distinct = int(populated.nunique()) if has_values else 0
+    char_counts = populated.str.len()
+    profile = {
+        'cardinality': distinct,
+        'uniqueness_pct': round(distinct / populated_count * 100, 1) if has_values else 0.0,
+        'total_values': len(series),
+        'non_empty_values': int(populated_count),
+        'min_length': int(char_counts.min()) if has_values else 0,
+        'max_length': int(char_counts.max()) if has_values else 0,
+        'avg_length': round(float(char_counts.mean()), 1) if has_values else 0.0,
+    }
+
+    if field_type == 'currency':
+        # Drop currency symbols and thousands separators before parsing.
+        candidates = populated.str.replace(r'[$,£€]', '', regex=True)
+    elif field_type == 'id':
+        candidates = populated
+    else:
+        return profile
+
+    numbers = pd.to_numeric(candidates, errors='coerce').dropna()
+    if len(numbers) > 0:
+        for key, stat in (
+            ('min_value', numbers.min()), ('max_value', numbers.max()),
+            ('mean_value', numbers.mean()), ('median_value', numbers.median()),
+        ):
+            profile[key] = round(float(stat), 2)
+    return profile
diff --git a/data_hygiene_auditor/detection.py b/data_hygiene_auditor/detection.py
index f74ad60..8dfd7f3 100644
--- a/data_hygiene_auditor/detection.py
+++ b/data_hygiene_auditor/detection.py
@@ -549,6 +549,13 @@ def analyze_fuzzy_duplicates(
     skip = already_matched | fp_matched
     unmatched = [i for i in range(len(df)) if i not in skip]
 
+    if len(unmatched) > 500:
+        findings.append({
+            'type': '_levenshtein_skipped',
+            'unmatched_count': len(unmatched),
+            'limit': 500,
+        })
+
     if len(unmatched) >= 2 and len(unmatched) <= 500:
         norm_strings = {}
         for idx in unmatched:
diff --git a/data_hygiene_auditor/reporting/excel.py b/data_hygiene_auditor/reporting/excel.py
index c2c1eb1..612e2fd 100644
--- a/data_hygiene_auditor/reporting/excel.py
+++ b/data_hygiene_auditor/reporting/excel.py
@@ -15,7 +15,7 @@ def generate_excel(results, output_path):
     headers = [
         "Sheet", "Field", "Inferred Type", "Issue Type", "Severity",
         "Description", "Example / Detail", "Why It Matters",
-        "Suggested Fix",
+        "Suggested Fix", "Cardinality", "Uniqueness %",
     ]
     header_font = Font(bold=True, color="FFFFFF", size=11, name="Arial")
     header_fill = PatternFill("solid", fgColor="0f3460")
@@ -85,18 +85,28 @@ def generate_excel(results, output_path):
                         f" Blank: {detail['blank_count']},"
                         f" Whitespace: {detail['whitespace_only']}"
                     )
+                elif itype == 'custom_rule':
+                    desc = (
+                        f"{issue.get('rule_name', 'Custom Rule')}:"
+                        f" {detail.get('message', '')}"
+                    )
+                    examples = detail.get('examples', [])
+                    example = '; '.join(str(e) for e in examples[:5])
                 else:
                     desc = str(itype)
                     example = json.dumps(detail, default=str)
 
                 fix = issue.get('fix', {})
                 fix_text = fix.get('code', '') if fix else ''
+                profile = field_data.get('profile', {})
                 values = [
                     sheet_name, col_name,
                     field_data['inferred_type'],
                     itype, issue['severity'],
                     desc, example, issue.get('why', ''),
                     fix_text,
+                    profile.get('cardinality', ''),
+                    profile.get('uniqueness_pct', ''),
                 ]
                 for col_idx, val in enumerate(values, 1):
                     cell = ws.cell(
diff --git a/data_hygiene_auditor/reporting/html.py b/data_hygiene_auditor/reporting/html.py
index 3d2dc1f..e9391e5 100644
--- a/data_hygiene_auditor/reporting/html.py
+++ b/data_hygiene_auditor/reporting/html.py
@@ -1,9 +1,10 @@
 """HTML report generator."""
 
 import json
-from collections import Counter
 from html import escape as _html_escape
 
+from ..core import count_issues
+
 
 def _h(val):
     """Escape a value for safe inclusion in HTML text or attributes."""
@@ -30,22 +31,9 @@ def _render_fix(fix):
 
 def generate_html(results, output_path):
     """Generate a client-readable HTML report."""
-    total_issues = 0
-    severity_totals = Counter()
-    for sheet in results['sheets'].values():
-        for field in sheet['fields'].values():
-            for issue in field['issues']:
-                total_issues += 1
-                severity_totals[issue['severity']] += 1
-        for d in sheet['phantom_duplicates']:
-            total_issues += 1
-            severity_totals[d['severity']] += 1
-        for f in sheet.get('fuzzy_duplicates', []):
-            total_issues += 1
-            severity_totals[f['severity']] += 1
-        for sv in sheet.get('schema_violations', []):
-            total_issues += 1
-            severity_totals[sv['severity']] += 1
+    counts = count_issues(results)
+    total_issues = counts.get('total', 0)
+    severity_totals = counts
 
     parts = []
     parts.append(f"""<!DOCTYPE html>
@@ -537,6 +525,24 @@ def generate_html(results, output_path):
     <div class="null-bar"><div class="null-bar-fill"
         style="width:{min(null['missing_pct'], 100)}%;background:{null_color};"></div></div>
 """)
+            profile = field_data.get('profile', {})
+            if profile:
+                stats_parts = [
+                    f"{profile['cardinality']} distinct",
+                    f"{profile['uniqueness_pct']}% unique",
+                    f"avg len {profile['avg_length']}",
+                ]
+                if 'min_value' in profile:
+                    stats_parts.append(
+                        f"range {profile['min_value']}"
+                        f"–{profile['max_value']}"
+                    )
+                parts.append(
+                    '<div style="font-size:0.8rem;color:var(--text-muted);'
+                    'margin:0.2rem 0 0.4rem 0;">'
+                    f'{" &nbsp;|&nbsp; ".join(stats_parts)}</div>'
+                )
+
             for issue in issues:
                 sev = issue['severity']
                 itype = issue['type']
@@ -615,6 +621,24 @@ def generate_html(results, output_path):
                         f' ({detail["missing_pct"]}%)'
                     )
 
+                elif itype == 'custom_rule':
+                    rule_name = _h(issue.get('rule_name', 'Custom Rule'))
+                    msg = _h(detail.get('message', ''))
+                    parts.append(
+                        f'<strong>{rule_name}</strong>'
+                        f' &mdash; {msg}'
+                    )
+                    examples = detail.get('examples', [])
+                    if examples:
+                        sample_str = ', '.join(
+                            f'"{_h(str(e))}"' for e in examples[:3]
+                        )
+                        parts.append(
+                            '<div style="font-size:0.85rem;'
+                            'color:var(--text-muted);">'
+                            f'Examples: {sample_str}</div>'
+                        )
+
                 else:
                     parts.append(
                         f'<strong>{_h(itype)}</strong>:'
diff --git a/data_hygiene_auditor/reporting/pdf.py b/data_hygiene_auditor/reporting/pdf.py
index 98b70a7..f3c9e74 100644
--- a/data_hygiene_auditor/reporting/pdf.py
+++ b/data_hygiene_auditor/reporting/pdf.py
@@ -178,6 +178,20 @@ def generate_pdf(results, output_path):
                 styles['FieldHead'],
             ))
 
+            profile = field_data.get('profile', {})
+            if profile:
+                stats = (
+                    f"{profile['cardinality']} distinct"
+                    f" | {profile['uniqueness_pct']}% unique"
+                    f" | avg len {profile['avg_length']}"
+                )
+                if 'min_value' in profile:
+                    stats += (
+                        f" | range {profile['min_value']}"
+                        f"–{profile['max_value']}"
+                    )
+                story.append(Paragraph(stats, styles['SmallBody']))
+
             for issue in issues:
                 sev = issue['severity']
                 detail = issue['detail']
@@ -258,6 +272,24 @@ def generate_pdf(results, output_path):
                         styles.get(sev_style, styles['SmallBody']),
                     ))
 
+                elif itype == 'custom_rule':
+                    rule_name = _p(issue.get('rule_name', 'Custom Rule'))
+                    msg = _p(detail.get('message', ''))
+                    text = f"[{sev}] {rule_name} — {msg}"
+                    story.append(Paragraph(
+                        text,
+                        styles.get(sev_style, styles['SmallBody']),
+                    ))
+                    examples = detail.get('examples', [])
+                    if examples:
+                        sample_str = ', '.join(
+                            f'"{_p(str(e))}"' for e in examples[:3]
+                        )
+                        story.append(Paragraph(
+                            f"Examples: {sample_str}",
+                            styles['SmallBody'],
+                        ))
+
                 why = issue.get('why', '')
                 if why:
                     story.append(Paragraph(
diff --git a/data_hygiene_auditor/rules.py b/data_hygiene_auditor/rules.py
new file mode 100644
index 0000000..1c80e04
--- /dev/null
+++ b/data_hygiene_auditor/rules.py
@@ -0,0 +1,318 @@
+"""Custom rule engine — load and evaluate user-defined detection rules."""
+
+import json
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+# Condition names accepted in a rule's "condition" field.
+VALID_CONDITIONS = {
+    'regex_match',
+    'not_regex_match',
+    'min_length',
+    'max_length',
+    'allowed_values',
+    'disallowed_values',
+    'max_missing_pct',
+}
+
+# Severity labels accepted in a rule's "severity" field.
+VALID_SEVERITIES = ('High', 'Medium', 'Low')
+
+
+@dataclass
+class Rule:
+    """One user-defined check plus the column selector it applies to."""
+
+    name: str
+    description: str
+    severity: str
+    condition: str
+    # Regex string, number, or list of values, depending on ``condition``.
+    threshold: Any
+    # '*' selects every column; anything else is a case-insensitive regex.
+    column_pattern: str = '*'
+    # Exact column names; when non-empty this overrides ``column_pattern``.
+    columns: List[str] = field(default_factory=list)
+
+    def matches_column(self, col_name: str) -> bool:
+        """Return True if this rule should be evaluated for ``col_name``."""
+        if self.columns:
+            return col_name in self.columns
+        if self.column_pattern == '*':
+            return True
+        # str(): column labels are not guaranteed to be strings, and
+        # re.search raises TypeError on a non-string subject.
+        return bool(re.search(self.column_pattern, str(col_name), re.IGNORECASE))
+
+
+def load_rules(path: str) -> List[Rule]:
+    """Load custom rules from a JSON file.
+
+    Expected format:
+    {
+      "rules": [
+        {
+          "name": "Phone format",
+          "description": "All phone numbers must match E.164 or US format",
+          "severity": "High",
+          "column_pattern": "phone|tel",
+          "condition": "regex_match",
+          "threshold": "^\\+?1?\\d{10,14}$"
+        }
+      ]
+    }
+
+    Raises:
+        FileNotFoundError: if ``path`` does not exist.
+        ValueError: if the file is not valid JSON, lacks a top-level
+            'rules' array, or contains a rule that fails validation.
+    """
+    path_obj = Path(path)
+    if not path_obj.exists():
+        raise FileNotFoundError(f"Rules file not found: {path}")
+
+    # Decode explicitly instead of relying on the platform default encoding;
+    # 'utf-8-sig' also accepts files saved with a leading byte-order mark.
+    with path_obj.open(encoding='utf-8-sig') as f:
+        try:
+            raw = json.load(f)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON in rules file: {e}") from e
+
+    if not isinstance(raw, dict) or 'rules' not in raw:
+        raise ValueError(
+            "Rules file must contain a top-level 'rules' array"
+        )
+
+    rules_list = raw['rules']
+    if not isinstance(rules_list, list):
+        raise ValueError("'rules' must be an array")
+
+    return [_parse_rule(entry, i) for i, entry in enumerate(rules_list)]
+
+
+def _is_number(value: Any) -> bool:
+    """Return True for int/float values, excluding bool (an int subclass)."""
+    return isinstance(value, (int, float)) and not isinstance(value, bool)
+
+
+def _parse_rule(entry: Dict[str, Any], index: int) -> Rule:
+    """Parse and validate a single rule entry.
+
+    ``index`` is the entry's position in the rules array and is used only to
+    label error messages. Raises ValueError for the first problem found.
+    """
+    prefix = f"Rule [{index}]"
+
+    if not isinstance(entry, dict):
+        raise ValueError(f"{prefix}: each rule must be an object")
+
+    required = ('name', 'description', 'severity', 'condition', 'threshold')
+    for field_name in required:
+        if field_name not in entry:
+            raise ValueError(
+                f"{prefix}: missing required field '{field_name}'"
+            )
+
+    name = entry['name']
+    condition = entry['condition']
+    severity = entry['severity']
+    threshold = entry['threshold']
+
+    # Type check first: an unhashable value (e.g. a JSON array) would raise
+    # TypeError from the set membership test instead of a clear ValueError.
+    if not isinstance(condition, str) or condition not in VALID_CONDITIONS:
+        raise ValueError(
+            f"{prefix} ({name}): invalid condition '{condition}'."
+            f" Valid: {', '.join(sorted(VALID_CONDITIONS))}"
+        )
+
+    if severity not in VALID_SEVERITIES:
+        raise ValueError(
+            f"{prefix} ({name}): severity must be 'High', 'Medium', or 'Low'"
+        )
+
+    if condition in ('regex_match', 'not_regex_match'):
+        if not isinstance(threshold, str):
+            raise ValueError(
+                f"{prefix} ({name}): threshold must be a regex string"
+                f" for condition '{condition}'"
+            )
+        try:
+            re.compile(threshold)
+        except re.error as e:
+            raise ValueError(
+                f"{prefix} ({name}): invalid regex in threshold: {e}"
+            ) from e
+
+    if condition in ('min_length', 'max_length'):
+        if not _is_number(threshold) or threshold < 0:
+            raise ValueError(
+                f"{prefix} ({name}): threshold must be a non-negative number"
+                f" for condition '{condition}'"
+            )
+
+    if condition in ('allowed_values', 'disallowed_values'):
+        if not isinstance(threshold, list):
+            raise ValueError(
+                f"{prefix} ({name}): threshold must be an array"
+                f" for condition '{condition}'"
+            )
+
+    if condition == 'max_missing_pct':
+        if not _is_number(threshold) or not (0 <= threshold <= 100):
+            raise ValueError(
+                f"{prefix} ({name}): threshold must be a number 0-100"
+                f" for condition 'max_missing_pct'"
+            )
+
+    # Validate the column selectors at load time so a bad pattern is reported
+    # with the rule's name rather than as re.error when the rule is evaluated.
+    column_pattern = entry.get('column_pattern', '*')
+    if not isinstance(column_pattern, str):
+        raise ValueError(
+            f"{prefix} ({name}): column_pattern must be a string"
+        )
+    if column_pattern != '*':
+        try:
+            re.compile(column_pattern)
+        except re.error as e:
+            raise ValueError(
+                f"{prefix} ({name}): invalid regex in column_pattern: {e}"
+            ) from e
+
+    columns = entry.get('columns')
+    if columns is None:
+        columns = []
+    # A bare string would turn the ``in`` test in Rule.matches_column into a
+    # substring match, so require a real array.
+    if not isinstance(columns, list):
+        raise ValueError(f"{prefix} ({name}): columns must be an array")
+
+    return Rule(
+        name=name,
+        description=entry['description'],
+        severity=severity,
+        condition=condition,
+        threshold=threshold,
+        column_pattern=column_pattern,
+        columns=columns,
+    )
+
+
+def _violation_finding(
+    rule: Rule, violations, total_checked: int, summary: str,
+) -> Optional[Dict[str, Any]]:
+    """Build the finding for a value-level rule, or None if nothing failed.
+
+    ``violations`` is the sub-series of offending values; up to five of them
+    are reported as examples. ``summary`` is the message text that follows
+    the "<violations>/<checked>" counts.
+    """
+    count = len(violations)
+    if count == 0:
+        return None
+    return {
+        'type': 'custom_rule',
+        'rule_name': rule.name,
+        'severity': rule.severity,
+        'detail': {
+            'condition': rule.condition,
+            'threshold': rule.threshold,
+            'violations': count,
+            'total_checked': total_checked,
+            'examples': violations.head(5).tolist(),
+            'message': f"{count}/{total_checked} {summary}",
+        },
+        'why': rule.description,
+    }
+
+
+def evaluate_rule(rule: Rule, series, col_name: str) -> Optional[Dict[str, Any]]:
+    """Evaluate a single rule against a column. Returns a finding dict or None.
+
+    None is returned when the rule does not apply to ``col_name``, when the
+    column has no non-blank values to check, or when nothing violates the
+    rule. Values are compared as stripped strings; nulls and blank strings
+    are only counted by 'max_missing_pct'.
+    """
+    if not rule.matches_column(col_name):
+        return None
+
+    non_null_str = series.dropna().astype(str).str.strip()
+    non_empty = non_null_str[non_null_str != '']
+    total = len(series)
+    checked = len(non_empty)
+    condition = rule.condition
+
+    if condition == 'max_missing_pct':
+        missing = total - checked
+        pct = (missing / total * 100) if total > 0 else 0
+        if pct > rule.threshold:
+            return {
+                'type': 'custom_rule',
+                'rule_name': rule.name,
+                'severity': rule.severity,
+                'detail': {
+                    'condition': rule.condition,
+                    'threshold': rule.threshold,
+                    'actual': round(pct, 1),
+                    'message': (
+                        f"{pct:.1f}% missing (threshold: {rule.threshold}%)"
+                    ),
+                },
+                'why': rule.description,
+            }
+        return None
+
+    if checked == 0:
+        return None
+
+    if condition in ('regex_match', 'not_regex_match'):
+        # Pass the pattern as a string, the argument type documented for
+        # Series.str.fullmatch, rather than a pre-compiled object.
+        matched = non_empty.str.fullmatch(rule.threshold, na=False)
+        if condition == 'regex_match':
+            return _violation_finding(
+                rule, non_empty[~matched], checked,
+                f"values don't match pattern '{rule.threshold}'",
+            )
+        return _violation_finding(
+            rule, non_empty[matched], checked,
+            f"values match disallowed pattern '{rule.threshold}'",
+        )
+
+    if condition in ('min_length', 'max_length'):
+        lengths = non_empty.str.len()
+        if condition == 'min_length':
+            violations, relation = non_empty[lengths < rule.threshold], 'shorter'
+        else:
+            violations, relation = non_empty[lengths > rule.threshold], 'longer'
+        if len(violations) == 0:
+            # Return before formatting so int() below is only applied to the
+            # threshold when there is something to report.
+            return None
+        return _violation_finding(
+            rule, violations, checked,
+            f"values {relation} than {int(rule.threshold)} characters",
+        )
+
+    if condition in ('allowed_values', 'disallowed_values'):
+        # JSON arrays may hold numbers or booleans, so stringify each entry
+        # before case-folding; strip to mirror the stripping applied to the
+        # column values above.
+        listed = {str(v).strip().lower() for v in rule.threshold}
+        is_listed = non_empty.str.lower().isin(listed)
+        if condition == 'allowed_values':
+            return _violation_finding(
+                rule, non_empty[~is_listed], checked,
+                "values not in allowed set",
+            )
+        return _violation_finding(
+            rule, non_empty[is_listed], checked,
+            "values contain disallowed entries",
+        )
+
+    return None
diff --git a/pyproject.toml b/pyproject.toml
index 8368447..c647a90 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ version = "1.0.0"
 description = "Detect data quality issues in Excel and CSV files — mixed formats, misused fields, placeholder floods, and phantom duplicates"
 readme = "README.md"
 license = {text = "MIT"}
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 authors = [
     {name = "Lailara LLC"},
 ]
@@ -42,7 +42,7 @@ data-hygiene-audit = "data_hygiene_auditor.cli:main"
 testpaths = ["tests"]
 
 [tool.ruff]
-target-version = "py38"
+target-version = "py39"
 line-length = 120
 exclude = ["generate_sample.py"]
 
diff --git a/samples/output/sample_messy_data_audit_findings.xlsx b/samples/output/sample_messy_data_audit_findings.xlsx
index aa8e2cc..1988e90 100644
Binary files a/samples/output/sample_messy_data_audit_findings.xlsx and b/samples/output/sample_messy_data_audit_findings.xlsx differ
diff --git a/samples/output/sample_messy_data_audit_report.html b/samples/output/sample_messy_data_audit_report.html
index a7e5f88..21d7ae7 100644
--- a/samples/output/sample_messy_data_audit_report.html
+++ b/samples/output/sample_messy_data_audit_report.html
@@ -336,7 +336,7 @@
 <body>
 
 <h1>Data Hygiene Audit Report</h1>
-<p class="subtitle">sample_messy_data.xlsx &mdash; 2026-05-15 15:03:29</p>
+<p class="subtitle">sample_messy_data.xlsx &mdash; 2026-05-16 12:00:51</p>
 
 <div class="score-hero">
     <div class="score-ring">
@@ -357,15 +357,6 @@ <h1>Data Hygiene Audit Report</h1>
 </div>
 
 
-<div class="trend-banner">
-    <div>
-        <span class="delta positive">↑4</span>
-        <span> vs baseline (2026-05-15 15:03:07)</span>
-    </div>
-    <div>Score: 28 → 32</div>
-    <div>Issues: 63 → 59 (-4)</div>
-</div>
-
 <div class="summary-grid">
     <div class="summary-card info">
         <div class="number">59</div>
@@ -420,7 +411,7 @@ <h2 class="sheet-toggle" onclick="toggleSheet(this)">Sheet: Customers
     lambda x: &quot;coded&quot; if isinstance(x, str)
     and &quot;-&quot; in x else &quot;numeric&quot;
 )</pre></div></div></div>
-<div class="field-card" data-field="firstname" data-severities="Low High Medium">
+<div class="field-card" data-field="firstname" data-severities="Medium Low High">
     <div class="field-header">
         <span class="field-name">FirstName</span>
         <span class="field-type">name</span>
@@ -436,7 +427,7 @@ <h2 class="sheet-toggle" onclick="toggleSheet(this)">Sheet: Customers
 suspect = df.loc[mask, &quot;FirstName&quot;]</pre></div></div><div class="issue severity-Medium"><span class="severity-badge Medium">Medium</span> <strong>Placeholder detected:</strong> "Test" appears 3 times (11.5%)<div class="why-box"><strong>Why this matters:</strong> Placeholder values (&quot;Test&quot;, &quot;N/A&quot;, &quot;TBD&quot;) that persist in production data inflate counts, skew averages, and create phantom records. They often indicate incomplete data entry or inadequate validation at the point of capture.</div><div class="fix-block"><div class="fix-header"><span>Suggested Fix (replace_placeholders)</span><button class="fix-copy" onclick="copyFix(this)">Copy</button></div><div class="fix-desc">Replace 3 placeholder values (&quot;Test&quot;) in &quot;FirstName&quot; with NaN for proper missing-data handling</div><pre class="fix-code">import numpy as np
 df[&quot;FirstName&quot;] = df[&quot;FirstName&quot;].replace(&quot;Test&quot;, np.nan)</pre></div></div><div class="issue severity-Low"><span class="severity-badge Low">Low</span> <strong>Placeholder detected:</strong> "TBD" appears 1 times (3.8%)<div class="why-box"><strong>Why this matters:</strong> Placeholder values (&quot;Test&quot;, &quot;N/A&quot;, &quot;TBD&quot;) that persist in production data inflate counts, skew averages, and create phantom records. They often indicate incomplete data entry or inadequate validation at the point of capture.</div><div class="fix-block"><div class="fix-header"><span>Suggested Fix (replace_placeholders)</span><button class="fix-copy" onclick="copyFix(this)">Copy</button></div><div class="fix-desc">Replace 1 placeholder values (&quot;TBD&quot;) in &quot;FirstName&quot; with NaN for proper missing-data handling</div><pre class="fix-code">import numpy as np
 df[&quot;FirstName&quot;] = df[&quot;FirstName&quot;].replace(&quot;TBD&quot;, np.nan)</pre></div></div></div>
-<div class="field-card" data-field="lastname" data-severities="Low Medium">
+<div class="field-card" data-field="lastname" data-severities="Medium Low">
     <div class="field-header">
         <span class="field-name">LastName</span>
         <span class="field-type">name</span>
@@ -453,7 +444,7 @@ <h2 class="sheet-toggle" onclick="toggleSheet(this)">Sheet: Customers
 )</pre></div></div><div class="issue severity-Medium"><span class="severity-badge Medium">Medium</span> <strong>Suspicious repetition:</strong> "Doe" appears 3 times (11.5%)<div class="why-box"><strong>Why this matters:</strong> When the same value appears far more often than expected, it may indicate a default value that was never updated, a copy-paste error, or a system glitch that stamped the same data across multiple records.</div><div class="fix-block"><div class="fix-header"><span>Suggested Fix (flag_repetitions)</span><button class="fix-copy" onclick="copyFix(this)">Copy</button></div><div class="fix-desc">Flag 3 rows where &quot;LastName&quot; = &quot;Doe&quot; (11.5%) for manual review</div><pre class="fix-code">df[&quot;_LastName_review&quot;] = (
     df[&quot;LastName&quot;] == &quot;Doe&quot;
 )</pre></div></div></div>
-<div class="field-card" data-field="email" data-severities="Low High Medium">
+<div class="field-card" data-field="email" data-severities="Medium Low High">
     <div class="field-header">
         <span class="field-name">Email</span>
         <span class="field-type">email</span>
@@ -473,7 +464,7 @@ <h2 class="sheet-toggle" onclick="toggleSheet(this)">Sheet: Customers
 )</pre></div></div><div class="issue severity-Medium"><span class="severity-badge Medium">Medium</span> <strong>Suspicious repetition:</strong> "test@test.com" appears 3 times (11.5%)<div class="why-box"><strong>Why this matters:</strong> When the same value appears far more often than expected, it may indicate a default value that was never updated, a copy-paste error, or a system glitch that stamped the same data across multiple records.</div><div class="fix-block"><div class="fix-header"><span>Suggested Fix (flag_repetitions)</span><button class="fix-copy" onclick="copyFix(this)">Copy</button></div><div class="fix-desc">Flag 3 rows where &quot;Email&quot; = &quot;test@test.com&quot; (11.5%) for manual review</div><pre class="fix-code">df[&quot;_Email_review&quot;] = (
     df[&quot;Email&quot;] == &quot;test@test.com&quot;
 )</pre></div></div></div>
-<div class="field-card" data-field="phone" data-severities="Low High Medium">
+<div class="field-card" data-field="phone" data-severities="Medium Low High">
     <div class="field-header">
         <span class="field-name">Phone</span>
         <span class="field-type">phone</span>
@@ -497,7 +488,7 @@ <h2 class="sheet-toggle" onclick="toggleSheet(this)">Sheet: Customers
 )</pre></div></div><div class="issue severity-Medium"><span class="severity-badge Medium">Medium</span> <strong>Suspicious repetition:</strong> "555-555-5555" appears 3 times (11.5%)<div class="why-box"><strong>Why this matters:</strong> When the same value appears far more often than expected, it may indicate a default value that was never updated, a copy-paste error, or a system glitch that stamped the same data across multiple records.</div><div class="fix-block"><div class="fix-header"><span>Suggested Fix (flag_repetitions)</span><button class="fix-copy" onclick="copyFix(this)">Copy</button></div><div class="fix-desc">Flag 3 rows where &quot;Phone&quot; = &quot;555-555-5555&quot; (11.5%) for manual review</div><pre class="fix-code">df[&quot;_Phone_review&quot;] = (
     df[&quot;Phone&quot;] == &quot;555-555-5555&quot;
 )</pre></div></div></div>
-<div class="field-card" data-field="joindate" data-severities="Low High Medium">
+<div class="field-card" data-field="joindate" data-severities="Medium Low High">
     <div class="field-header">
         <span class="field-name">JoinDate</span>
         <span class="field-type">date</span>
@@ -516,7 +507,7 @@ <h2 class="sheet-toggle" onclick="toggleSheet(this)">Sheet: Customers
 )</pre></div></div><div class="issue severity-Medium"><span class="severity-badge Medium">Medium</span> <strong>Suspicious repetition:</strong> "2023-01-15" appears 3 times (11.5%)<div class="why-box"><strong>Why this matters:</strong> When the same value appears far more often than expected, it may indicate a default value that was never updated, a copy-paste error, or a system glitch that stamped the same data across multiple records.</div><div class="fix-block"><div class="fix-header"><span>Suggested Fix (flag_repetitions)</span><button class="fix-copy" onclick="copyFix(this)">Copy</button></div><div class="fix-desc">Flag 3 rows where &quot;JoinDate&quot; = &quot;2023-01-15&quot; (11.5%) for manual review</div><pre class="fix-code">df[&quot;_JoinDate_review&quot;] = (
     df[&quot;JoinDate&quot;] == &quot;2023-01-15&quot;
 )</pre></div></div></div>
-<div class="field-card" data-field="accountbalance" data-severities="Low High Medium">
+<div class="field-card" data-field="accountbalance" data-severities="Medium Low High">
     <div class="field-header">
         <span class="field-name">AccountBalance</span>
         <span class="field-type">currency</span>
@@ -557,7 +548,7 @@ <h2 class="sheet-toggle" onclick="toggleSheet(this)">Sheet: Customers
 df[&quot;Status&quot;] = df[&quot;Status&quot;].replace(&quot;TBD&quot;, np.nan)</pre></div></div><div class="issue severity-High"><span class="severity-badge High">High</span> <strong>Suspicious repetition:</strong> "Active" appears 18 times (69.2%)<div class="why-box"><strong>Why this matters:</strong> When the same value appears far more often than expected, it may indicate a default value that was never updated, a copy-paste error, or a system glitch that stamped the same data across multiple records.</div><div class="fix-block"><div class="fix-header"><span>Suggested Fix (flag_repetitions)</span><button class="fix-copy" onclick="copyFix(this)">Copy</button></div><div class="fix-desc">Flag 18 rows where &quot;Status&quot; = &quot;Active&quot; (69.2%) for manual review</div><pre class="fix-code">df[&quot;_Status_review&quot;] = (
     df[&quot;Status&quot;] == &quot;Active&quot;
 )</pre></div></div></div>
-<div class="field-card" data-field="zipcode" data-severities="Low Medium">
+<div class="field-card" data-field="zipcode" data-severities="Medium Low">
     <div class="field-header">
         <span class="field-name">ZipCode</span>
         <span class="field-type">zipcode</span>
@@ -743,7 +734,7 @@ <h2 class="sheet-toggle" onclick="toggleSheet(this)">Sheet: Orders
         <tr><th>OrderID</th><th>CustomerID</th><th>OrderDate</th><th>Amount</th><th>ShipDate</th><th>Status</th></tr>
 <tr><td>ORD-006</td><td>CUST-010</td><td>2023-01-01</td><td>$0.00</td><td>2023-01-01</td><td>Test</td></tr><tr><td>ORD-007</td><td>CUST-010</td><td>2023-01-01</td><td>$0.00</td><td>2023-01-01</td><td>Test</td></tr></table><div class="why-box"><strong>Why this matters:</strong> Exact duplicate rows are the clearest sign of a data quality issue — they can result from double-submissions, ETL failures, or missing unique constraints. Every duplicate inflates counts and distorts any metric built on this data.</div><div class="fix-block"><div class="fix-header"><span>Suggested Fix (drop_exact_duplicates)</span><button class="fix-copy" onclick="copyFix(this)">Copy</button></div><div class="fix-desc">Remove 2 exact duplicate rows (rows 7, 8)</div><pre class="fix-code">df = df.drop_duplicates(keep=&quot;first&quot;).reset_index(drop=True)</pre></div></div></div></div>
 <div class="footer">
-    Data Hygiene Audit &mdash; Generated 2026-05-15 15:03:29 &mdash; Lailara LLC
+    Data Hygiene Audit &mdash; Generated 2026-05-16 12:00:51 &mdash; Lailara LLC
 </div>
 
 <script>
diff --git a/samples/output/sample_messy_data_audit_report.pdf b/samples/output/sample_messy_data_audit_report.pdf
index 963b426..e9b9da1 100644
Binary files a/samples/output/sample_messy_data_audit_report.pdf and b/samples/output/sample_messy_data_audit_report.pdf differ
diff --git a/samples/rules_example.json b/samples/rules_example.json
new file mode 100644
index 0000000..aa956e8
--- /dev/null
+++ b/samples/rules_example.json
@@ -0,0 +1,35 @@
+{
+  "rules": [
+    {
+      "name": "Phone format (US)",
+      "description": "Phone numbers should match standard US format (XXX) XXX-XXXX to ensure consistent dialing and deduplication",
+      "severity": "High",
+      "condition": "regex_match",
+      "threshold": "^\\(\\d{3}\\) \\d{3}-\\d{4}$",
+      "column_pattern": "phone|tel"
+    },
+    {
+      "name": "Valid status values",
+      "description": "Status fields should only contain known values to prevent downstream filtering and reporting errors",
+      "severity": "Medium",
+      "condition": "allowed_values",
+      "threshold": ["Active", "Inactive", "Pending", "Cancelled"],
+      "columns": ["Status"]
+    },
+    {
+      "name": "Email minimum length",
+      "description": "Valid emails are at least 6 characters (a@b.co) — shorter values are likely placeholders or typos",
+      "severity": "Low",
+      "condition": "min_length",
+      "threshold": 6,
+      "column_pattern": "email"
+    },
+    {
+      "name": "No test data in production",
+      "description": "Test and placeholder values in production data indicate incomplete data entry or inadequate validation",
+      "severity": "High",
+      "condition": "disallowed_values",
+      "threshold": ["test", "n/a", "tbd", "xxx", "asdf", "foo", "bar"]
+    }
+  ]
+}
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 353a0a1..26f8c9a 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -1,9 +1,11 @@
 """Integration and edge case tests."""
+import json
 import os
 import tempfile
 from pathlib import Path
 
 from audit import _load_sheets, generate_excel, generate_html, generate_pdf, run_audit
+from data_hygiene_auditor.core import count_issues
 
 SAMPLE_PATH = Path(__file__).parent.parent / "samples" / "input" / "sample_messy_data.xlsx"
 
@@ -168,3 +170,204 @@ def test_tsv_support(self):
             assert len(results["sheets"]) == 1
         finally:
             os.unlink(f.name)
+
+
+class TestCountIssues:  # exercises count_issues() on real and hand-built audit results
+    def test_counts_all_issue_sources(self):
+        results = run_audit(str(SAMPLE_PATH))
+        counts = count_issues(results)
+        assert counts['total'] == counts.get('High', 0) + counts.get('Medium', 0) + counts.get('Low', 0)
+        assert counts['total'] > 0  # the sample workbook is expected to contain issues
+
+    def test_matches_manual_count(self):
+        results = run_audit(str(SAMPLE_PATH))
+        counts = count_issues(results)
+        manual_total = 0  # independent tally built straight from the results dict
+        for sheet in results["sheets"].values():
+            for field in sheet["fields"].values():
+                manual_total += len(field["issues"])
+            manual_total += len(sheet["phantom_duplicates"])
+            manual_total += len(sheet.get("fuzzy_duplicates", []))  # tolerates a missing key
+            manual_total += len(sheet.get("schema_violations", []))  # tolerates a missing key
+        assert counts['total'] == manual_total
+
+    def test_includes_fuzzy_duplicates(self):
+        results = run_audit(str(SAMPLE_PATH))
+        has_fuzzy = any(
+            len(sheet.get("fuzzy_duplicates", [])) > 0
+            for sheet in results["sheets"].values()
+        )
+        if has_fuzzy:  # NOTE(review): passes vacuously when no sheet has fuzzy duplicates
+            counts = count_issues(results)
+            no_fuzzy_total = 0  # tally of every issue source except fuzzy_duplicates
+            for sheet in results["sheets"].values():
+                for field in sheet["fields"].values():
+                    no_fuzzy_total += len(field["issues"])
+                no_fuzzy_total += len(sheet["phantom_duplicates"])
+                no_fuzzy_total += len(sheet.get("schema_violations", []))
+            assert counts['total'] > no_fuzzy_total  # fuzzy duplicates must add to the total
+
+    def test_schema_count_tracked(self):
+        counts = count_issues({'sheets': {  # minimal hand-built results; no file is read
+            'Sheet1': {
+                'fields': {},
+                'phantom_duplicates': [],
+                'fuzzy_duplicates': [],
+                'schema_violations': [
+                    {'severity': 'High', 'type': 'schema_type_mismatch'},
+                ],
+            },
+        }})
+        assert counts['schema'] == 1  # the single violation is tracked under 'schema'
+        assert counts['total'] == 1
+        assert counts['High'] == 1  # and under its own severity
+
+
+class TestCustomRulesIntegration:
+    """Custom rules passed via run_audit(rules_path=...) must show up in the audit results."""
+    def test_rules_produce_findings(self, tmp_path):
+        rules_file = tmp_path / "rules.json"
+        rules_file.write_text(json.dumps({
+            "rules": [{
+                "name": "No short names",
+                "description": "Names must be at least 10 characters",
+                "severity": "Medium",
+                "condition": "min_length",
+                "threshold": 10,
+                "column_pattern": "name",
+            }]
+        }))
+        results = run_audit(str(SAMPLE_PATH), rules_path=str(rules_file))
+        custom_findings = []  # every field-level issue whose type is 'custom_rule'
+        for sheet in results['sheets'].values():
+            for field_data in sheet['fields'].values():
+                for issue in field_data['issues']:
+                    if issue.get('type') == 'custom_rule':
+                        custom_findings.append(issue)
+        assert len(custom_findings) > 0
+        assert custom_findings[0]['rule_name'] == "No short names"  # name copied from the rule
+        assert custom_findings[0]['severity'] == "Medium"  # severity copied from the rule
+
+    def test_rules_counted_in_totals(self, tmp_path):
+        rules_file = tmp_path / "rules.json"
+        rules_file.write_text(json.dumps({
+            "rules": [{
+                "name": "All digits",
+                "description": "IDs must be numeric",
+                "severity": "High",
+                "condition": "regex_match",
+                "threshold": "^\\d+$",
+                "column_pattern": ".*",  # presumably matches every column - confirm in Rule
+            }]
+        }))
+        results_without = run_audit(str(SAMPLE_PATH))  # baseline run, no custom rules
+        results_with = run_audit(str(SAMPLE_PATH), rules_path=str(rules_file))
+        count_without = count_issues(results_without)['total']
+        count_with = count_issues(results_with)['total']
+        assert count_with > count_without  # rule findings must raise the total
+
+    def test_rules_metadata_in_results(self, tmp_path):
+        rules_file = tmp_path / "rules.json"
+        rules_file.write_text(json.dumps({
+            "rules": [{
+                "name": "Test rule",
+                "description": "d",
+                "severity": "Low",
+                "condition": "max_missing_pct",
+                "threshold": 1,
+            }]
+        }))
+        results = run_audit(str(SAMPLE_PATH), rules_path=str(rules_file))
+        assert 'rules' in results  # top-level metadata about the loaded rules
+        assert results['rules']['count'] == 1
+        assert results['rules']['names'] == ["Test rule"]
+
+    def test_rules_affect_health_score(self, tmp_path):
+        rules_file = tmp_path / "rules.json"
+        rules_file.write_text(json.dumps({
+            "rules": [{
+                "name": "Strict rule",
+                "description": "Everything fails",
+                "severity": "High",
+                "condition": "regex_match",
+                "threshold": "^IMPOSSIBLE_VALUE$",  # sample values are not expected to match
+                "column_pattern": ".*",
+            }]
+        }))
+        results_without = run_audit(str(SAMPLE_PATH))  # baseline run, no custom rules
+        results_with = run_audit(str(SAMPLE_PATH), rules_path=str(rules_file))
+        assert results_with['overall_score'] < results_without['overall_score']
+
+
+class TestColumnProfiling:
+    """Per-field 'profile' dicts in audit results, and _compute_profile() on small series."""
+    def test_profile_exists_for_all_fields(self):
+        results = run_audit(str(SAMPLE_PATH))
+        for sheet in results['sheets'].values():
+            for col, field_data in sheet['fields'].items():
+                assert 'profile' in field_data, f"Missing profile for {col}"
+                profile = field_data['profile']
+                assert 'cardinality' in profile  # only key presence is checked here
+                assert 'uniqueness_pct' in profile
+                assert 'min_length' in profile
+                assert 'max_length' in profile
+                assert 'avg_length' in profile
+
+    def test_profile_cardinality(self):
+        import pandas as pd
+
+        from data_hygiene_auditor.core import _compute_profile
+        series = pd.Series(["apple", "banana", "apple", "cherry", None])  # 5 values, 1 None
+        profile = _compute_profile(series, "freetext")
+        assert profile['cardinality'] == 3  # apple, banana, cherry
+        assert profile['non_empty_values'] == 4
+        assert profile['total_values'] == 5
+
+    def test_profile_uniqueness(self):
+        import pandas as pd
+
+        from data_hygiene_auditor.core import _compute_profile
+        series = pd.Series(["a", "b", "c", "d"])  # all values distinct
+        profile = _compute_profile(series, "freetext")
+        assert profile['uniqueness_pct'] == 100.0
+
+    def test_profile_lengths(self):
+        import pandas as pd
+
+        from data_hygiene_auditor.core import _compute_profile
+        series = pd.Series(["hi", "hello", "hey"])  # lengths 2, 5, 3
+        profile = _compute_profile(series, "freetext")
+        assert profile['min_length'] == 2
+        assert profile['max_length'] == 5
+        assert profile['avg_length'] == round((2 + 5 + 3) / 3, 1)  # mean length, 1 decimal
+
+    def test_profile_numeric_stats_currency(self):
+        import pandas as pd
+
+        from data_hygiene_auditor.core import _compute_profile
+        series = pd.Series(["$100.00", "$200.00", "$300.00", "$400.00"])  # currency strings
+        profile = _compute_profile(series, "currency")
+        assert profile['min_value'] == 100.0  # stats are expected as plain floats
+        assert profile['max_value'] == 400.0
+        assert profile['mean_value'] == 250.0
+        assert profile['median_value'] == 250.0
+
+    def test_profile_numeric_stats_id(self):
+        import pandas as pd
+
+        from data_hygiene_auditor.core import _compute_profile
+        series = pd.Series(["1", "2", "3", "4", "5"])  # numeric strings, not ints
+        profile = _compute_profile(series, "id")
+        assert profile['min_value'] == 1.0
+        assert profile['max_value'] == 5.0
+        assert profile['mean_value'] == 3.0
+
+    def test_profile_empty_series(self):
+        import pandas as pd
+
+        from data_hygiene_auditor.core import _compute_profile
+        series = pd.Series([None, None, ""])  # None and "" are both expected to count as empty
+        profile = _compute_profile(series, "freetext")
+        assert profile['cardinality'] == 0
+        assert profile['uniqueness_pct'] == 0.0
+        assert profile['min_length'] == 0
diff --git a/tests/test_rules.py b/tests/test_rules.py
new file mode 100644
index 0000000..e84cb81
--- /dev/null
+++ b/tests/test_rules.py
@@ -0,0 +1,288 @@
+"""Tests for custom rule engine — loader and evaluator."""
+
+import json
+
+import pandas as pd
+import pytest
+
+from data_hygiene_auditor.rules import Rule, evaluate_rule, load_rules
+
+
+@pytest.fixture
+def tmp_rules_file(tmp_path):
+    """Return a writer that dumps ``data`` as JSON to tmp_path/rules.json and returns the path as str."""
+    def _write(data):
+        path = tmp_path / "rules.json"  # fixed name: a second call overwrites the first file
+        path.write_text(json.dumps(data))
+        return str(path)
+    return _write
+
+
+class TestLoadRules:
+    """load_rules(): well-formed files load; malformed ones raise ValueError or FileNotFoundError."""
+    def test_loads_valid_rules(self, tmp_rules_file):
+        path = tmp_rules_file({
+            "rules": [
+                {
+                    "name": "Phone format",
+                    "description": "Must match US format",
+                    "severity": "High",
+                    "condition": "regex_match",
+                    "threshold": r"^\(\d{3}\) \d{3}-\d{4}$",
+                    "column_pattern": "phone",
+                }
+            ]
+        })
+        rules = load_rules(path)
+        assert len(rules) == 1
+        assert rules[0].name == "Phone format"  # fields are exposed as attributes
+        assert rules[0].severity == "High"
+        assert rules[0].condition == "regex_match"
+
+    def test_rejects_missing_rules_key(self, tmp_rules_file):
+        path = tmp_rules_file({"checks": []})  # wrong top-level key
+        with pytest.raises(ValueError, match="top-level 'rules' array"):
+            load_rules(path)
+
+    def test_rejects_invalid_json(self, tmp_path):
+        path = tmp_path / "bad.json"
+        path.write_text("not json {{{")  # deliberately unparseable
+        with pytest.raises(ValueError, match="Invalid JSON"):
+            load_rules(str(path))
+
+    def test_rejects_missing_required_fields(self, tmp_rules_file):
+        path = tmp_rules_file({
+            "rules": [{"name": "incomplete"}]  # only 'name' is supplied
+        })
+        with pytest.raises(ValueError, match="missing required field"):
+            load_rules(path)
+
+    def test_rejects_invalid_condition(self, tmp_rules_file):
+        path = tmp_rules_file({
+            "rules": [{
+                "name": "bad",
+                "description": "x",
+                "severity": "High",
+                "condition": "magic_check",  # not a condition the loader is expected to accept
+                "threshold": 5,
+            }]
+        })
+        with pytest.raises(ValueError, match="invalid condition"):
+            load_rules(path)
+
+    def test_rejects_invalid_severity(self, tmp_rules_file):
+        path = tmp_rules_file({
+            "rules": [{
+                "name": "bad",
+                "description": "x",
+                "severity": "Critical",  # the accepted rules in this file use High/Medium/Low
+                "condition": "min_length",
+                "threshold": 5,
+            }]
+        })
+        with pytest.raises(ValueError, match="severity must be"):
+            load_rules(path)
+
+    def test_rejects_invalid_regex(self, tmp_rules_file):
+        path = tmp_rules_file({
+            "rules": [{
+                "name": "bad regex",
+                "description": "x",
+                "severity": "High",
+                "condition": "regex_match",
+                "threshold": "[invalid(",  # unterminated character class
+            }]
+        })
+        with pytest.raises(ValueError, match="invalid regex"):
+            load_rules(path)
+
+    def test_rejects_nonexistent_file(self):
+        with pytest.raises(FileNotFoundError):  # not wrapped in ValueError like the other failures
+            load_rules("/nonexistent/rules.json")
+
+    def test_loads_multiple_rules(self, tmp_rules_file):
+        path = tmp_rules_file({
+            "rules": [
+                {
+                    "name": "R1",
+                    "description": "d1",
+                    "severity": "Low",
+                    "condition": "min_length",
+                    "threshold": 3,
+                },
+                {
+                    "name": "R2",
+                    "description": "d2",
+                    "severity": "Medium",
+                    "condition": "max_missing_pct",
+                    "threshold": 10,
+                },
+            ]
+        })
+        rules = load_rules(path)
+        assert len(rules) == 2
+
+    def test_column_pattern_default(self, tmp_rules_file):
+        path = tmp_rules_file({
+            "rules": [{
+                "name": "R",
+                "description": "d",
+                "severity": "Low",
+                "condition": "min_length",
+                "threshold": 1,
+            }]
+        })
+        rules = load_rules(path)
+        assert rules[0].column_pattern == '*'  # default when the rule omits column_pattern
+
+    def test_columns_list(self, tmp_rules_file):
+        path = tmp_rules_file({
+            "rules": [{
+                "name": "R",
+                "description": "d",
+                "severity": "Low",
+                "condition": "min_length",
+                "threshold": 1,
+                "columns": ["Name", "Email"],
+            }]
+        })
+        rules = load_rules(path)
+        assert rules[0].columns == ["Name", "Email"]  # list is kept as given
+
+
+class TestRuleMatchesColumn:
+    """Rule.matches_column() with the '*' wildcard, a pattern, and an explicit column list."""
+    def test_wildcard_matches_all(self):
+        rule = Rule("R", "d", "Low", "min_length", 1, column_pattern="*")
+        assert rule.matches_column("anything")
+
+    def test_pattern_matches(self):
+        rule = Rule("R", "d", "Low", "min_length", 1, column_pattern="phone|tel")
+        assert rule.matches_column("Phone")  # capitalised name vs lowercase pattern
+        assert rule.matches_column("telephone")  # "tel" occurs inside the column name
+        assert not rule.matches_column("email")
+
+    def test_explicit_columns_list(self):
+        rule = Rule("R", "d", "Low", "min_length", 1, columns=["Name", "Email"])
+        assert rule.matches_column("Name")
+        assert rule.matches_column("Email")
+        assert not rule.matches_column("Phone")  # not in the explicit list
+
+
+class TestEvaluateRuleRegex:
+    """evaluate_rule() for the regex_match and not_regex_match conditions."""
+    def test_regex_match_finds_violations(self):
+        rule = Rule("R", "Must be digits", "High", "regex_match", r"^\d+$")
+        series = pd.Series(["123", "456", "abc", "78x"])  # "abc" and "78x" are not all digits
+        result = evaluate_rule(rule, series, "ID")
+        assert result is not None
+        assert result['detail']['violations'] == 2
+        assert "abc" in result['detail']['examples']
+
+    def test_regex_match_no_violations(self):
+        rule = Rule("R", "d", "High", "regex_match", r"^\d+$")
+        series = pd.Series(["123", "456", "789"])
+        result = evaluate_rule(rule, series, "ID")
+        assert result is None  # None signals "no finding"
+
+    def test_not_regex_match_finds_violations(self):
+        rule = Rule("R", "No SSNs", "High", "not_regex_match", r"^\d{3}-\d{2}-\d{4}$")
+        series = pd.Series(["hello", "123-45-6789", "world"])  # one value has the forbidden shape
+        result = evaluate_rule(rule, series, "Notes")
+        assert result is not None
+        assert result['detail']['violations'] == 1
+
+    def test_not_regex_match_no_violations(self):
+        rule = Rule("R", "d", "High", "not_regex_match", r"^\d{3}-\d{2}-\d{4}$")
+        series = pd.Series(["hello", "world"])
+        result = evaluate_rule(rule, series, "Notes")
+        assert result is None
+
+
+class TestEvaluateRuleLength:
+    """evaluate_rule() for the min_length and max_length conditions."""
+    def test_min_length_finds_short_values(self):
+        rule = Rule("R", "Too short", "Medium", "min_length", 5)
+        series = pd.Series(["hello", "hi", "world", "yo"])  # "hi" and "yo" are shorter than 5
+        result = evaluate_rule(rule, series, "Name")
+        assert result is not None
+        assert result['detail']['violations'] == 2
+
+    def test_max_length_finds_long_values(self):
+        rule = Rule("R", "Too long", "Low", "max_length", 5)
+        series = pd.Series(["hi", "toolongvalue", "ok", "another_long"])  # two values exceed 5
+        result = evaluate_rule(rule, series, "Code")
+        assert result is not None
+        assert result['detail']['violations'] == 2
+
+    def test_min_length_all_pass(self):
+        rule = Rule("R", "d", "Low", "min_length", 2)
+        series = pd.Series(["hello", "world", "ok"])  # "ok" is exactly 2 long and must pass
+        result = evaluate_rule(rule, series, "Name")
+        assert result is None
+
+
+class TestEvaluateRuleValues:
+    """evaluate_rule() for the allowed_values and disallowed_values conditions."""
+    def test_allowed_values_finds_violations(self):
+        rule = Rule("R", "Invalid status", "High", "allowed_values", ["active", "inactive", "pending"])
+        series = pd.Series(["Active", "inactive", "UNKNOWN", "deleted"])  # last two are not allowed
+        result = evaluate_rule(rule, series, "Status")
+        assert result is not None
+        assert result['detail']['violations'] == 2  # so "Active" must pass despite its case
+
+    def test_allowed_values_case_insensitive(self):
+        rule = Rule("R", "d", "Low", "allowed_values", ["yes", "no"])
+        series = pd.Series(["Yes", "NO", "yes"])  # mixed case on purpose
+        result = evaluate_rule(rule, series, "Flag")
+        assert result is None
+
+    def test_disallowed_values_finds_matches(self):
+        rule = Rule("R", "No test data", "Medium", "disallowed_values", ["test", "n/a", "tbd"])
+        series = pd.Series(["John", "Test", "N/A", "Jane"])  # "Test" and "N/A" differ only by case
+        result = evaluate_rule(rule, series, "Name")
+        assert result is not None
+        assert result['detail']['violations'] == 2
+
+    def test_disallowed_values_no_matches(self):
+        rule = Rule("R", "d", "Low", "disallowed_values", ["test", "n/a"])
+        series = pd.Series(["John", "Jane", "Bob"])
+        result = evaluate_rule(rule, series, "Name")
+        assert result is None
+
+
+class TestEvaluateRuleMissing:
+    """evaluate_rule() for the max_missing_pct condition."""
+    def test_max_missing_pct_exceeds(self):
+        rule = Rule("R", "Too many missing", "High", "max_missing_pct", 10)
+        series = pd.Series(["a", None, None, "b", "", None])  # three None and one "" among 6 values
+        result = evaluate_rule(rule, series, "Field")
+        assert result is not None
+        assert result['detail']['actual'] > 10  # reported figure must exceed the threshold
+
+    def test_max_missing_pct_within(self):
+        rule = Rule("R", "d", "Low", "max_missing_pct", 50)
+        series = pd.Series(["a", "b", "c", None])  # one None among 4 values
+        result = evaluate_rule(rule, series, "Field")
+        assert result is None
+
+
+class TestEvaluateRuleColumnFilter:
+    """evaluate_rule() column filtering, plus the all-empty-series case."""
+    def test_skips_non_matching_column(self):
+        rule = Rule("R", "d", "High", "min_length", 5, column_pattern="phone")
+        series = pd.Series(["hi"])  # would violate min_length 5 if the rule applied
+        result = evaluate_rule(rule, series, "Email")
+        assert result is None
+
+    def test_applies_to_matching_column(self):
+        rule = Rule("R", "d", "High", "min_length", 5, column_pattern="phone")
+        series = pd.Series(["hi"])
+        result = evaluate_rule(rule, series, "Phone")  # same data, matching column name
+        assert result is not None
+
+    def test_empty_series_returns_none(self):
+        rule = Rule("R", "d", "High", "regex_match", r"\d+")
+        series = pd.Series([None, None, ""])  # nothing but None and empty strings
+        result = evaluate_rule(rule, series, "Col")
+        assert result is None