diff --git a/CLAUDE.md b/CLAUDE.md index ff7de1ac..a51deb20 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -82,6 +82,7 @@ Numerical comparisons use epsilon tolerance — use `AssertCase`/`AssertBatch` r ## Active Technologies - Python 3.8+ + numpy >= 1.20 (sole runtime dependency per constitution) (002-decompose-intervals-pipeline) - Python 3.8+ + numpy >= 1.20 (sole runtime dependency) (002-decompose-intervals-pipeline) +- Python 3.8+ + numpy >= 1.20 (sole runtime dependency; no additions) (004-partials-package) ## Recent Changes - 002-decompose-intervals-pipeline: Added Python 3.8+ + numpy >= 1.20 (sole runtime dependency per constitution) diff --git a/mkdocs.yml b/mkdocs.yml index f1d73a78..3116add7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -79,8 +79,8 @@ nav: - "foapy.order": references/order.md - "foapy.binding": references/binding.md - "foapy.chain_mode": references/chain_mode.md - - "foapy.tuple_mode": references/tuple_mode.md - "foapy.intervals_chain": references/intervals_chain.md + - "foapy.tuple_mode": references/tuple_mode.md - "foapy.intervals_tuple": references/intervals_tuple.md - "foapy.intervals_distribution": references/intervals_distribution.md - "foapy.ma": diff --git a/specs/004-partials-package/checklists/requirements.md b/specs/004-partials-package/checklists/requirements.md new file mode 100644 index 00000000..8d89b64f --- /dev/null +++ b/specs/004-partials-package/checklists/requirements.md @@ -0,0 +1,39 @@ +# Specification Quality Checklist: foapy.partials Package + +**Purpose**: Validate specification completeness and quality before proceeding to planning +**Created**: 2026-04-19 +**Feature**: [spec.md](../spec.md) + +## Content Quality + +- [x] No implementation details (languages, frameworks, APIs) +- [x] Focused on user value and business needs +- [x] Written for non-technical stakeholders +- [x] All mandatory sections completed + +## Requirement Completeness + +- [x] No [NEEDS CLARIFICATION] markers remain +- [x] Requirements are testable 
and unambiguous +- [x] Success criteria are measurable +- [x] Success criteria are technology-agnostic (no implementation details) +- [x] All acceptance scenarios are defined +- [x] Edge cases are identified +- [x] Scope is clearly bounded (4 functions only; intervals_distribution excluded) +- [x] Dependencies and assumptions identified + +## Feature Readiness + +- [x] All functional requirements have clear acceptance criteria +- [x] User scenarios cover primary flows +- [x] Feature meets measurable outcomes defined in Success Criteria +- [x] No implementation details leak into specification + +## Notes + +All items pass. Clarifications session (2026-04-19) resolved 3 open items: +- intervals_distribution excluded from scope +- plain array input accepted (auto-wrapped) +- submodule-only access pattern confirmed + +Spec is ready for `/speckit.plan`. diff --git a/specs/004-partials-package/contracts/alphabet.md b/specs/004-partials-package/contracts/alphabet.md new file mode 100644 index 00000000..1279d5fd --- /dev/null +++ b/specs/004-partials-package/contracts/alphabet.md @@ -0,0 +1,45 @@ +# Contract: partials.alphabet + +## Signature + +```python +foapy.partials.alphabet(X) -> numpy.ndarray +``` + +## Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `X` | `array_like` or `numpy.ma.MaskedArray` | Yes | 1-D sequence (plain or masked). Masked positions are excluded. 
| + +## Returns + +| Return | Type | Shape | dtype | +|--------|------|-------|-------| +| `alphabet` | `numpy.ndarray` | `(p,)` | Same as input element type | + +- `p` = number of unique non-masked values +- Values appear in order of first appearance among non-masked positions +- Returns empty array when all positions are masked or input is empty + +## Raises + +| Exception | Condition | +|-----------|-----------| +| `Not1DArrayException` | Input has more than 1 dimension | + +## Invariants + +- `len(result)` equals the number of unique values in `X.compressed()` +- Result is a plain ndarray (not masked), even when input is masked + +## Examples + +```python +import numpy.ma as ma +import foapy.partials as partials + +X = ma.masked_array(['a', 'b', 'a', 'c'], mask=[False, True, False, False]) +alphabet = partials.alphabet(X) +# alphabet: ['a', 'c'] (b is masked, c appears after a) +``` diff --git a/specs/004-partials-package/contracts/intervals_chain.md b/specs/004-partials-package/contracts/intervals_chain.md new file mode 100644 index 00000000..41008d41 --- /dev/null +++ b/specs/004-partials-package/contracts/intervals_chain.md @@ -0,0 +1,71 @@ +# Contract: partials.intervals_chain + +## Signature + +```python +foapy.partials.intervals_chain(X, binding: int, chain_mode: int) -> numpy.ma.MaskedArray +``` + +## Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `X` | `array_like` or `numpy.ma.MaskedArray` | Yes | 1-D raw sequence (plain or masked). Masked positions are gaps. Pass the original sequence, not the order output. 
| +| `binding` | `int` | Yes | `foapy.binding.start` (1) or `foapy.binding.end` (2) | +| `chain_mode` | `int` | Yes | `foapy.chain_mode.boundary` (1) or `foapy.chain_mode.cycle` (2) | + +## Returns + +| Return | Type | Shape | dtype | +|--------|------|-------|-------| +| `chain` | `numpy.ma.MaskedArray` | `(n,)` | `numpy.intp` | + +- `n` = length of input sequence +- Output mask is identical to input mask +- Non-masked positions hold the interval distance as a positive integer +- **Interval distance uses actual positional indices in the full array** (gap positions count toward distance) +- Minimum interval value: 1. Maximum: n. + +## Raises + +| Exception | Condition | +|-----------|-----------| +| `Not1DArrayException` | Input has more than 1 dimension | +| `ValueError` | `binding` is not `binding.start` or `binding.end` | +| `ValueError` | `chain_mode` is not `chain_mode.boundary` or `chain_mode.cycle` | + +## Invariants + +- `len(result) == len(X)` always +- `result.mask` equals `ma.getmaskarray(X)` always +- For two non-masked occurrences of the same element at actual positions `i < j`: interval at position `j` = `j - i` +- For the first non-masked occurrence of an element at actual position `p` (binding.start, boundary): interval = `p + 1` +- When `X` has no masked positions: result equals `foapy.core.intervals_chain(X, binding, chain_mode)` + +## Key Semantic Difference from `foapy.ma.intervals_chain` + +`foapy.ma.intervals_chain` compresses masked elements first, so intervals measure distances in the compressed sequence. `foapy.partials.intervals_chain` uses actual positional indices, so a gap between two occurrences increases the interval distance. 
+ +| Sequence | foapy.ma result | foapy.partials result | +|----------|-----------------|-----------------------| +| `[A, --, A]` (binding.start, boundary) | `[1, 1]` (compressed) | `[1, --, 2]` (full array) | + +## Examples + +```python +import numpy.ma as ma +import foapy +import foapy.partials as partials + +X = ma.masked_array(['C', 'T', 'C', 'G'], mask=[False, False, False, False]) +chain = partials.intervals_chain(X, foapy.binding.start, foapy.chain_mode.boundary) +# chain: [1, 2, 2, 4] (same as core — no gaps) + +X_partial = ma.masked_array( + ['_', 'C', 'T', 'C', '_', 'G'], + mask=[True, False, False, False, True, False] +) +chain = partials.intervals_chain(X_partial, foapy.binding.start, foapy.chain_mode.boundary) +# chain.data at non-masked positions [1,2,3,5]: [2, 3, 2, 6] +# chain.mask: [True, False, False, False, True, False] +``` diff --git a/specs/004-partials-package/contracts/intervals_tuple.md b/specs/004-partials-package/contracts/intervals_tuple.md new file mode 100644 index 00000000..4815e43c --- /dev/null +++ b/specs/004-partials-package/contracts/intervals_tuple.md @@ -0,0 +1,70 @@ +# Contract: partials.intervals_tuple + +## Signature + +```python +foapy.partials.intervals_tuple(chain, binding: int, tuple_mode: int) -> numpy.ma.MaskedArray +``` + +## Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `chain` | `array_like` or `numpy.ma.MaskedArray` | Yes | 1-D intervals chain produced by `partials.intervals_chain`. | +| `binding` | `int` | Yes | Must match the binding used to produce `chain`. `foapy.binding.start` (1) or `foapy.binding.end` (2). 
| +| `tuple_mode` | `int` | Yes | `foapy.tuple_mode.normal` (2), `foapy.tuple_mode.lossy` (1), or `foapy.tuple_mode.redundant` (3) | + +## Returns + +| `tuple_mode` | Shape | Mask | Description | +|--------------|-------|------|-------------| +| `normal` | `(n,)` | Same as input | Chain returned unchanged | +| `lossy` | `(n,)` | Input mask ∪ boundary positions | Boundary (first-occurrence) intervals additionally masked | +| `redundant` | `(n + k,)` | Input mask + `k` unmasked trailing elements | Trailing complementary boundary intervals appended; `k` = unique symbol count inferred from compressed chain | + +- `n` = length of input chain +- `k` = number of unique elements inferred from the compressed chain + +## Raises + +| Exception | Condition | +|-----------|-----------| +| `ValueError` | `binding` is not `binding.start` or `binding.end` | +| `ValueError` | `tuple_mode` is not a recognised value | + +## Invariants + +- For `normal`: `result.data == chain.data` and `result.mask == chain.mask` +- For `lossy`: `result.mask[i] == True` whenever `chain.mask[i] == True`; additionally masked at boundary positions +- For `lossy`: `len(result) == len(chain)` always +- For `redundant`: `len(result) == len(chain) + k`; trailing k elements are unmasked +- When `chain` has no masked positions: non-masked values in result equal `foapy.core.intervals_tuple(chain, binding, tuple_mode)` + +## Boundary Detection (lossy) + +A non-masked element at compressed-array index `i` is a boundary interval if `compressed_chain[i] > i`. This is the same criterion as `foapy.core.intervals_tuple` applied to the compressed sub-sequence. NOTE(review): partial chains store actual positional distances, so a non-boundary interval that spans gaps can exceed its compressed index (e.g. `[A, --, --, A]` with binding.start/boundary gives chain `[1, --, --, 3]`, compressed `[1, 3]`, and `3 > 1` would also mask the second `A`); confirm whether the comparison should use the actual position `p` (`chain[p] > p`) instead. 
+ +## Examples + +```python +import numpy.ma as ma +import foapy +import foapy.partials as partials + +# Partial chain: positions 0 and 4 are gaps +chain = ma.masked_array( + [0, 2, 3, 2, 0, 6], + mask=[True, False, False, False, True, False] +) + +# normal: unchanged +result = partials.intervals_tuple(chain, foapy.binding.start, foapy.tuple_mode.normal) +# result: same as chain + +# lossy: C's first occurrence at compressed-index 0 has value 2 > 0 → masked +# T's first occurrence at compressed-index 1 has value 3 > 1 → masked +# G's first occurrence at compressed-index 3 has value 6 > 3 → masked +# C's second occurrence at compressed-index 2 has value 2 == 2 → kept +result = partials.intervals_tuple(chain, foapy.binding.start, foapy.tuple_mode.lossy) +# Only position 3 (C second occurrence) remains unmasked +``` diff --git a/specs/004-partials-package/contracts/order.md b/specs/004-partials-package/contracts/order.md new file mode 100644 index 00000000..be1d2728 --- /dev/null +++ b/specs/004-partials-package/contracts/order.md @@ -0,0 +1,54 @@ +# Contract: partials.order + +## Signature + +```python +foapy.partials.order(X, return_alphabet=False) -> numpy.ma.MaskedArray +``` + +## Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `X` | `array_like` or `numpy.ma.MaskedArray` | Yes | 1-D sequence (plain or masked). Masked positions treated as gaps. | +| `return_alphabet` | `bool` | No (default `False`) | If `True`, also return the alphabet of non-masked unique values. 
| + +## Returns + +| Mode | Return | Type | Shape | dtype | +|------|--------|------|-------|-------| +| `return_alphabet=False` | `order` | `numpy.ma.MaskedArray` | `(n,)` | `numpy.intp` | +| `return_alphabet=True` | `(order, alphabet)` | `(numpy.ma.MaskedArray, numpy.ndarray)` | `(n,)`, `(p,)` | `numpy.intp`, same as input | + +- `n` = length of input sequence +- `p` = number of unique non-masked values (alphabet power) +- Output mask is identical to input mask +- Non-masked positions hold the element's 0-based index in the alphabet (first-appearance order) + +## Raises + +| Exception | Condition | +|-----------|-----------| +| `Not1DArrayException` | Input has more than 1 dimension | + +## Invariants + +- `len(result) == len(X)` always +- `result.mask` equals `ma.getmaskarray(X)` always +- When `return_alphabet=True`: `alphabet[result.data[~result.mask]]` reconstructs `X.compressed()` +- When all positions are masked: result is fully masked; alphabet is empty array + +## Examples + +```python +import numpy.ma as ma +import foapy.partials as partials + +X = ma.masked_array(['a', 'b', 'a', 'c'], mask=[False, True, False, False]) +result = partials.order(X) +# result.data (at non-masked): [0, 0, 1] +# result.mask: [False, True, False, False] + +result, alphabet = partials.order(X, return_alphabet=True) +# alphabet: ['a', 'c'] +``` diff --git a/specs/004-partials-package/data-model.md b/specs/004-partials-package/data-model.md new file mode 100644 index 00000000..ddf171de --- /dev/null +++ b/specs/004-partials-package/data-model.md @@ -0,0 +1,87 @@ +# Data Model: foapy.partials + +**Date**: 2026-04-19 | **Branch**: `004-partials-package` + +## Entities + +### Partial Sequence + +A 1-D masked array where each position holds either a symbolic value or a gap marker. 
+ +| Attribute | Type | Constraints | +|-----------|------|-------------| +| data | any element type (str, int, …) | 1-D only; multi-dimensional raises `Not1DArrayException` | +| mask | boolean ndarray | `True` = gap position; `False` = present element | +| length `n` | int | ≥ 0 | +| compressed length `m` | int | 0 ≤ m ≤ n | + +**Construction**: Accept `numpy.ma.MaskedArray`, plain `numpy.ndarray`, or Python list. Plain inputs are auto-wrapped with no mask (`ma.asarray(X)`). + +--- + +### Partial Order + +Output of `partials.order()`. A 1-D masked array of the same length as the Partial Sequence. + +| Attribute | Type | Constraints | +|-----------|------|-------------| +| data | `numpy.intp` | alphabet index at non-masked positions; value at masked positions is a meaningless placeholder (filled with 0) and must not be relied upon | +| mask | boolean ndarray | Identical to input Partial Sequence mask | +| shape | `(n,)` | Same length as input | +| value range | int | 0 ≤ value < alphabet power (number of unique non-masked elements) | + +**Reconstruction**: `partial_order` paired with the `alphabet` array from `return_alphabet=True` allows full reconstruction: `alphabet[partial_order.data[~mask]]` recovers the original non-masked elements. + +--- + +### Partial Intervals Chain + +Output of `partials.intervals_chain()`. A 1-D masked array of the same length as the input. + +| Attribute | Type | Constraints | +|-----------|------|-------------| +| data | `numpy.intp` | positional interval distance at non-masked positions | +| mask | boolean ndarray | Identical to input Partial Sequence mask | +| shape | `(n,)` | Same length as input | +| value range | int | 1 ≤ value ≤ n (actual positional distance, including gap positions) | + +**Semantics**: For two consecutive non-masked occurrences of the same element at actual positions `i` and `j` (i < j), the interval at position `j` is `j - i`. For the first occurrence at position `p`, the boundary interval is `p + 1` (binding.start, chain_mode.boundary). 
+ +--- + +### Partial Intervals Tuple + +Output of `partials.intervals_tuple()`. A 1-D masked array. + +| Attribute | Type | Constraints | +|-----------|------|-------------| +| data | `numpy.intp` | boundary-adjusted interval distances | +| mask | boolean ndarray | input mask ∪ additionally masked boundary positions (lossy) | +| shape | `(n,)` for normal and lossy; `(n + k,)` for redundant | n = chain length; k = unique symbol count inferred from chain | + +**Mode effects on mask**: +- `tuple_mode.normal`: mask unchanged from input +- `tuple_mode.lossy`: first-occurrence boundary positions additionally masked; output length = n +- `tuple_mode.redundant`: original mask unchanged; k trailing elements appended as unmasked; output length = n + k + +## Pipeline Flow + +``` +Partial Sequence (masked 1-D) + │ + ▼ + partials.order() + │ returns Partial Order (masked 1-D, same length) + ▼ + partials.intervals_chain() + │ takes raw Partial Sequence (not the order) + │ returns Partial Intervals Chain (masked 1-D, same length) + ▼ + partials.intervals_tuple() + │ returns Partial Intervals Tuple (masked 1-D) + │ normal/lossy: same length; redundant: n + k length + ▼ + (downstream characteristics or further analysis) +``` + +Note: `intervals_chain` accepts the original Partial Sequence (raw symbols), not the order output. This mirrors `foapy.core.intervals_chain` which also operates on raw sequences. 
diff --git a/specs/004-partials-package/plan.md b/specs/004-partials-package/plan.md new file mode 100644 index 00000000..0200fac4 --- /dev/null +++ b/specs/004-partials-package/plan.md @@ -0,0 +1,221 @@ +# Implementation Plan: foapy.partials Package + +**Branch**: `004-partials-package` | **Date**: 2026-04-19 | **Spec**: [spec.md](./spec.md) +**Input**: Feature specification from `/specs/004-partials-package/spec.md` + +## Summary + +Add `foapy.partials` — a new sub-package providing FOA pipeline operations for partial sequences (sequences with gap/skip positions represented as numpy masked arrays). Unlike `foapy.ma`, which compresses masked elements before computing intervals, `foapy.partials` preserves gap positions in every output and computes interval distances using actual positional indices in the full array. The package exposes four functions: `order`, `alphabet`, `intervals_chain`, `intervals_tuple`. + +## Technical Context + +**Language/Version**: Python 3.8+ +**Primary Dependencies**: numpy >= 1.20 (sole runtime dependency; no additions) +**Storage**: N/A +**Testing**: tox -e default (pytest); tests mirror `tests/test_ma_*.py` naming pattern +**Target Platform**: Any platform supporting Python 3.8+ and numpy >= 1.20 +**Project Type**: Python library sub-package +**Performance Goals**: Sequences up to length 10 000 must complete in < 100 ms on a single CPU core (Constitution Principle IV) +**Constraints**: All operations must be vectorized numpy; no Python loops over array elements; O(n) memory +**Scale/Scope**: 4 source modules + `__init__.py`; 4 test files; 1 `__init__.py` update in `foapy/__init__.py` + +## Constitution Check + +*GATE: Must pass before Phase 0 research. Re-check after Phase 1 design.* + +| Principle | Status | Notes | +|-----------|--------|-------| +| I. Code Quality — single responsibility, pure functions, no new deps | ✅ Pass | Each function is a single module; no new runtime deps; pure numpy | +| I. 
Code Quality — pre-commit (black, isort, flake8) | ✅ Pass | Enforced by existing CI pipeline | +| II. Testing — tox -e default; edge cases for pipeline functions | ✅ Pass | Tests must cover empty, single-element, all-unique, all-same, and realistic datasets; masked-value edge cases required | +| III. API Consistency — `foapy.ma` must mirror `foapy.core` signatures | ⚠️ Note | Constitution says "foapy.ma MUST mirror every public function in foapy.core". `foapy.partials` is a NEW sub-package, not an extension of foapy.ma, so this constraint does not apply. `partials` introduces the same four functions as core with an identical signature contract (masked in, masked out). | +| III. API Consistency — exceptions, docstring shapes | ✅ Pass | Uses `Not1DArrayException`; docstrings document dtype and dimensions | +| IV. Performance — vectorized numpy, < 100 ms for n=10 000 | ✅ Pass | All algorithms are vectorized; see design in Phase 1 | +| V. Simplicity — YAGNI, no base class, extract only at 3+ call sites | ✅ Pass | 4 thin modules; no abstractions beyond what is required | + +No violations requiring Complexity Tracking entries. 
+ +## Project Structure + +### Documentation (this feature) + +```text +specs/004-partials-package/ +├── plan.md # This file +├── research.md # Phase 0 output +├── data-model.md # Phase 1 output +├── quickstart.md # Phase 1 output +├── contracts/ # Phase 1 output +│ ├── order.md +│ ├── alphabet.md +│ ├── intervals_chain.md +│ └── intervals_tuple.md +└── tasks.md # Phase 2 output (/speckit.tasks — not created here) +``` + +### Source Code (repository root) + +```text +src/foapy/ +├── __init__.py # Add 'partials' to __foapy_submodules__ and __getattr__ +├── partials/ +│ ├── __init__.py # Exports: order, alphabet, intervals_chain, intervals_tuple +│ ├── _order.py +│ ├── _alphabet.py +│ ├── _intervals_chain.py +│ └── _intervals_tuple.py + +tests/ +├── test_partials_order.py +├── test_partials_alphabet.py +├── test_partials_intervals_chain.py +└── test_partials_intervals_tuple.py +``` + +**Structure Decision**: Single flat package under `src/foapy/partials/`, mirroring `src/foapy/ma/`. Each function in its own private module (`_name.py`), exported via `__init__.py`. Test files follow the existing `test_<package>_<function>.py` naming convention. + +## Complexity Tracking + +No constitution violations requiring justification. + +--- + +## Phase 0: Research + +*No NEEDS CLARIFICATION items in Technical Context. All design decisions derived from codebase analysis and documentation review.* + +See [research.md](./research.md) for detailed findings. + +--- + +## Phase 1: Design & Contracts + +See [data-model.md](./data-model.md), [contracts/](./contracts/), [quickstart.md](./quickstart.md). + +### Function Algorithm Designs + +#### `partials.order(X, return_alphabet=False)` + +``` +1. ar = ma.asarray(X) # auto-wrap plain arrays +2. Validate ar.ndim == 1 # raise Not1DArrayException otherwise +3. mask = ma.getmaskarray(ar) # boolean mask; False=present, True=gap +4. compressed = ar.compressed() # non-masked values in original order +5. 
If compressed is empty: + return fully masked array of length n (and empty alphabet if requested) +6. order_compressed, alphabet = core.order(compressed, return_alphabet=True) +7. result_data = zeros(n, dtype=intp) +8. result_data[~mask] = order_compressed # map back to full positions +9. result = ma.masked_array(result_data, mask=mask) +10. Return (result, alphabet) if return_alphabet else result +``` + +All operations are fully vectorized. The mask handling and scatter-back steps are O(n); the overall cost is dominated by the delegated `core.order` call (O(n log n), as stated for `partials.alphabet`). + +#### `partials.alphabet(X)` + +``` +1. ar = ma.asarray(X) +2. Validate ar.ndim == 1 +3. return core.order(ar.compressed(), return_alphabet=True)[1] + # equivalent: unique non-masked values in first-appearance order +``` + +Delegates to core after compression; O(n log n). + +#### `partials.intervals_chain(X, binding, chain_mode)` + +Key distinction from `ma.intervals_chain`: intervals measure **actual positional distance** in the full array (gaps count toward distance), not compressed distance. + +``` +1. Validate binding and chain_mode values +2. ar = ma.asarray(X) +3. Validate ar.ndim == 1 +4. full_mask = ma.getmaskarray(ar) +5. non_masked_idx = np.where(~full_mask)[0] # actual positions in full array +6. compressed_values = ar.compressed() +7. m = len(compressed_values); n = len(ar) +8. If m == 0: return fully masked array of length n + +9. If binding == binding.end: + # Reverse both positions and values to treat as binding.start + compressed_values = compressed_values[::-1] + non_masked_idx = (n - 1 - non_masked_idx)[::-1] + +10. Sort compressed_values stably → perm (indices into compressed array) +11. actual_pos = non_masked_idx # actual positions for each compressed element + +12. Build group boundary masks (first_mask, last_mask) over perm: + same as core: compare consecutive sorted values + +13. chain_compressed = empty(m, dtype=intp) + chain_compressed[1:] = actual_pos[perm[1:]] - actual_pos[perm[:-1]] + +14. 
If chain_mode == cycle: + delta = n - actual_pos[perm[last_mask]] # wrap-around distance + else: + delta = 1 # boundary = 1-indexed from edge + + chain_compressed[first_mask] = actual_pos[perm[first_mask]] + delta + +15. Invert perm → result_compressed (restore original order) + +16. If binding == binding.end: result_compressed = result_compressed[::-1] + +17. result_data = zeros(n, dtype=intp) + result_data[~full_mask] = result_compressed # full_mask is untouched by the step-9 reversal +18. Return ma.masked_array(result_data, mask=full_mask) +``` + +Fully vectorized; same algorithmic structure as `core.intervals_chain` but uses `actual_pos` instead of compressed-array indices. + +#### `partials.intervals_tuple(chain, binding, tuple_mode)` + +Receives a masked 1-D chain (output of `partials.intervals_chain`). Output length rules: +- `normal`: same length as input, same mask +- `lossy`: same length as input; positions identified as boundary intervals become **additionally masked** +- `redundant`: length increases by k (number of unique symbols inferred from chain); trailing intervals appended as unmasked elements + +``` +normal(ar_masked): + return ar_masked.copy() + +lossy(ar_masked, binding): + # Work on compressed chain (positions within compressed array) + compressed = ar_masked.compressed() + non_masked_idx = np.where(~ma.getmaskarray(ar_masked))[0] + If binding == end: reverse compressed and non_masked_idx + + positions = np.arange(len(compressed)) + first_mask = compressed > positions # boundary detection (same as core) + boundary_original_idx = non_masked_idx[first_mask if binding==start + else reversed first_mask] + + new_mask = ma.getmaskarray(ar_masked).copy() + new_mask[boundary_original_idx] = True + return ma.masked_array(ar_masked.data, mask=new_mask) + +redundant(ar_masked, binding): + compressed = ar_masked.compressed() + # Apply core redundant on compressed chain + extended_plain = core.intervals_tuple(compressed, binding, tuple_mode.redundant) + # The extra trailing intervals are 
the last (len(extended_plain) - len(compressed)) elements + k = len(extended_plain) - len(compressed) + trailing = extended_plain[len(compressed):] + # Reconstruct masked array: original positions unchanged, trailing appended (unmasked) + result_data = np.concatenate([ar_masked.data, trailing]) + result_mask = np.concatenate([ma.getmaskarray(ar_masked), np.zeros(k, dtype=bool)]) + return ma.masked_array(result_data, mask=result_mask) +``` + +### `foapy/__init__.py` Update + +Add `'partials'` to `__foapy_submodules__` and add a `__getattr__` branch: + +```python +if attr == "partials": + import foapy.partials as partials + return partials +``` + +No functions from `partials` are hoisted to `foapy.*` top-level (same pattern as `foapy.ma`). diff --git a/specs/004-partials-package/quickstart.md b/specs/004-partials-package/quickstart.md new file mode 100644 index 00000000..d17234d4 --- /dev/null +++ b/specs/004-partials-package/quickstart.md @@ -0,0 +1,105 @@ +# Quickstart: foapy.partials + +**Date**: 2026-04-19 | **Branch**: `004-partials-package` + +## What is foapy.partials? + +`foapy.partials` provides the Formal Order Analysis pipeline for **partial sequences** — sequences with gap (empty/skip) positions represented as numpy masked arrays. Unlike `foapy.core`, which operates on dense sequences, and `foapy.ma`, which compresses gaps away before computing intervals, `foapy.partials` preserves gap positions throughout the pipeline and measures interval distances using actual positional indices (gaps count toward distance). + +## Installation + +`foapy.partials` is part of the `foapy` package — no separate installation. + +```python +import foapy.partials as partials +import numpy.ma as ma +import foapy # for binding, chain_mode, tuple_mode enums +``` + +## Basic Usage + +### 1. 
Construct a Partial Sequence + +```python +import numpy.ma as ma + +# Text "INTELLIGENCE" with only the letters at positions 0, 2, 8 and 10 (I, T, E, C) kept; others are gaps +X = ma.masked_array( + list("INTELLIGENCE"), + mask=[False, True, False, True, True, True, True, True, False, True, False, True] +) +# Non-masked positions: I(0), T(2), E(8), C(10) +``` + +### 2. Extract the Alphabet + +```python +alphabet = partials.alphabet(X) +# array(['I', 'T', 'E', 'C']) — unique non-masked values, first-appearance order +``` + +### 3. Compute the Order + +```python +order, alphabet = partials.order(X, return_alphabet=True) +# order: masked 1-D array of same length as X +# Non-masked positions: [0, 1, 2, 3] (I=0, T=1, E=2, C=3) +# Masked positions: same as X +``` + +### 4. Compute the Intervals Chain + +```python +chain = partials.intervals_chain(X, foapy.binding.start, foapy.chain_mode.boundary) +# chain: masked 1-D array of same length as X +# Each non-masked position holds the actual positional distance to the previous +# occurrence of that element (including any gap positions in between) +``` + +### 5. Apply a Boundary Strategy + +```python +# Keep all intervals (normal) +result = partials.intervals_tuple(chain, foapy.binding.start, foapy.tuple_mode.normal) + +# Drop boundary (first-occurrence) intervals — those positions become masked +result = partials.intervals_tuple(chain, foapy.binding.start, foapy.tuple_mode.lossy) + +# Append trailing complementary intervals +result = partials.intervals_tuple(chain, foapy.binding.start, foapy.tuple_mode.redundant) +``` + +### 6. 
Access the Non-Masked Intervals + +```python +# Extract just the interval values for downstream analysis +interval_values = result.compressed() +``` + +## Key Differences at a Glance + +| | `foapy.core` | `foapy.ma` | `foapy.partials` | +|---|---|---|---| +| Input | Plain 1-D array | Masked 1-D array | Masked 1-D array | +| `order()` output | Plain 1-D array | 2-D masked matrix | Masked 1-D array | +| Gaps in intervals | N/A | Compressed out | Count toward distance | +| Output length | Same as input | Varies by function | Same as input (normal/lossy); n+k (redundant) | + +## Plain Array Input + +All functions accept plain numpy arrays or Python lists; they are auto-wrapped as fully unmasked masked arrays: + +```python +result = partials.order(['a', 'b', 'a', 'c']) # same as foapy.core.order(['a','b','a','c']) +``` + +## Enums + +`foapy.partials` does not export `binding`, `chain_mode`, or `tuple_mode`. Import them from `foapy`: + +```python +import foapy +foapy.binding.start # or foapy.binding.end +foapy.chain_mode.boundary # or foapy.chain_mode.cycle +foapy.tuple_mode.normal # or .lossy or .redundant +``` diff --git a/specs/004-partials-package/research.md b/specs/004-partials-package/research.md new file mode 100644 index 00000000..022c94b3 --- /dev/null +++ b/specs/004-partials-package/research.md @@ -0,0 +1,73 @@ +# Research: foapy.partials Package + +**Date**: 2026-04-19 | **Branch**: `004-partials-package` + +## Key Decisions + +### 1. `partials.order()` Returns a 1-D Masked Array + +**Decision**: Return a masked 1-D array of the same length as input (mask preserved). + +**Rationale**: The partial-sequence semantics require output positions to align with input positions so callers can trace which gap position produced which result. 
`foapy.ma.order()` returns a 2-D matrix (one row per alphabet element) because its purpose is to represent the full occurrence matrix; `foapy.partials.order()` returns a 1-D array because the output is a direct replacement of the input sequence with alphabet indices. + +**Alternatives considered**: +- 2-D masked matrix (rejected — not partial-sequence semantics; that is `foapy.ma` territory) +- Plain 1-D ndarray with -1 sentinel for gaps (rejected — forces callers to handle sentinel values; masked arrays are the standard numpy idiom for missing data) + +### 2. Gaps Contribute to Interval Distances + +**Decision**: In `partials.intervals_chain()`, interval distances are computed using actual positional indices in the full (uncompressed) array, so masked gaps between two occurrences of the same element increase the measured interval. + +**Rationale**: This is the defining semantic of partial sequences per the FOA documentation: `[-, C, T, C, -, G]` produces intervals `[-, 2, 3, 2, -, 6]`, where G at position 5 has interval 6 (= 5 + 1), not 4 (which would be the compressed distance); C at position 3 has interval 2 (= 3 - 1) either way because no gap lies between the two C occurrences. + +**Alternatives considered**: +- Compress first, compute intervals on compressed, map back (rejected — this is `foapy.ma` behavior; intervals would be wrong for partial sequences) + +### 3. `intervals_tuple.lossy` Masks Rather Than Removes + +**Decision**: `tuple_mode.lossy` produces an output of the **same length** as the input chain; positions identified as boundary intervals become additionally masked instead of being dropped. + +**Rationale**: Preserving array length maintains positional alignment throughout the pipeline. Callers can always determine which original positions were boundary intervals by inspecting the output mask. + +**Alternatives considered**: +- Drop positions and shrink the array (rejected — breaks positional alignment; core behavior is appropriate for plain arrays but not for positionally-indexed masked arrays) + +### 4. 
`intervals_tuple.redundant` Appends Trailing Intervals + +**Decision**: `tuple_mode.redundant` appends the k trailing intervals at the end of the array (output length = n + k), delegating to `core.intervals_tuple` on the compressed chain. + +**Rationale**: Trailing intervals do not map to any original input position, so they cannot be placed at existing positions. Appending is the only semantically valid option. The output remains a masked array (appended trailing elements are unmasked). + +**Alternatives considered**: +- Raise NotImplementedError (rejected — the spec requires all three tuple_modes to be supported) +- Place trailing at masked positions (rejected — conflates interval values with gap positions) + +### 5. Plain Array Auto-Wrapping + +**Decision**: All four functions normalize their input with `numpy.ma.asarray(X)`, which wraps plain lists and plain ndarrays as fully unmasked masked arrays and passes existing masked arrays through with their mask intact. + +**Rationale**: Follows the `foapy.ma.intervals_chain()` pattern (`ma.asarray(X)`) and means callers who already use `foapy.core` don't need to explicitly construct masked arrays. + +### 6. No `intervals_distribution` in Scope + +**Decision**: `intervals_distribution` is excluded from `foapy.partials`. + +**Rationale**: It operates on plain interval counts (frequency histogram), not on positionally-indexed arrays. It works identically on the output of `intervals_tuple` regardless of whether the chain came from core or partials. Users simply call `foapy.intervals_distribution(partials_chain.compressed())` directly. + +### 7. Submodule Access Only + +**Decision**: `foapy.partials` is exposed as a lazily-loaded submodule (via `__getattr__` in `foapy/__init__.py`); no functions are hoisted to `foapy.*`. + +**Rationale**: Mirrors the `foapy.ma` pattern. Avoids naming conflicts with core functions (both packages have `order`, `alphabet`, etc.). Users import the specific sub-package they need. 
+ +## Implementation Patterns Reused from Existing Code + +| Pattern | Source | Reused In | +|---------|--------|-----------| +| `ma.asarray(X)` auto-wrapping | `foapy/ma/_intervals_chain.py` | All four partials functions | +| `Not1DArrayException` on ndim > 1 | `foapy/core/_intervals_chain.py` | `order`, `intervals_chain` | +| `argsort(kind="mergesort")` for stable sort | `foapy/core/_intervals_chain.py` | `intervals_chain` | +| Boundary first/last group detection via boolean mask | `foapy/core/_intervals_chain.py` | `intervals_chain` | +| Inverse permutation via `inverse_perm[perm] = arange(n)` | `foapy/core/_intervals_chain.py` | `intervals_chain` | +| `ar > positions` boundary detection | `foapy/core/_intervals_tuple.py` | `intervals_tuple` (lossy) | +| Lazy `__getattr__` submodule loading | `foapy/__init__.py` | `foapy/__init__.py` update | diff --git a/specs/004-partials-package/spec.md b/specs/004-partials-package/spec.md new file mode 100644 index 00000000..d29de07a --- /dev/null +++ b/specs/004-partials-package/spec.md @@ -0,0 +1,132 @@ +# Feature Specification: foapy.partials Package + +**Feature Branch**: `004-partials-package` +**Created**: 2026-04-19 +**Status**: Draft +**Input**: User description: "Create a new package - foapy.partials. The package should be equivalent to core package but use masked arrays as inputs and outputs." + +## Clarifications + +### Session 2026-04-19 + +- Q: Should `foapy.partials` include `intervals_distribution`? → A: No — exclude it; it operates on plain interval counts, not positional structure. +- Q: Should `partials` functions accept plain numpy arrays as input? → A: Yes — auto-wrap plain arrays as fully unmasked masked arrays internally. +- Q: Should `foapy.partials` be accessible as a top-level hoist in `foapy.*`? → A: No — submodule access only (`foapy.partials.order()`), matching the `foapy.ma` pattern. 
+ +## User Scenarios & Testing *(mandatory)* + +### User Story 1 - Extract Order of Partial Sequence (Priority: P1) + +A researcher has a symbolic sequence with missing or irrelevant positions (e.g., studying a subset of characters in a text while preserving their original positions). They call `foapy.partials.order()` with a masked 1-D array and receive back a masked 1-D array of the same length, where non-empty positions contain the element's alphabet index and empty positions remain masked. + +**Why this priority**: The `order` function is the entry point to the entire FOA pipeline. Without it, no downstream analysis is possible. It also defines the fundamental contract between partials and the core API: 1-D in, 1-D out, gaps preserved. + +**Independent Test**: Can be fully tested by calling `partials.order()` on a masked array and verifying the returned masked 1-D array has the correct alphabet indices at non-masked positions and masked values at masked positions. + +**Acceptance Scenarios**: + +1. **Given** a masked 1-D array `['a', --, 'b', 'a', --]` (positions 1 and 4 masked), **When** `partials.order()` is called, **Then** the result is a masked 1-D array `[0, --, 1, 0, --]` — same length, gap positions remain masked, non-gap positions hold alphabet indices. +2. **Given** a masked 1-D array where all positions are masked, **When** `partials.order()` is called, **Then** the result is a fully masked 1-D array of the same length. +3. **Given** a masked 1-D array with no masked positions, **When** `partials.order()` is called, **Then** the result matches `foapy.core.order()` applied to the same sequence. +4. **Given** a 2-D masked array, **When** `partials.order()` is called, **Then** a `Not1DArrayException` is raised. +5. **Given** `return_alphabet=True`, **When** `partials.order()` is called, **Then** both the masked 1-D order array and the alphabet of non-masked unique values are returned. 
+ +--- + +### User Story 2 - Compute Intervals Chain for Partial Sequence (Priority: P2) + +A researcher has a partial order (1-D masked array from `partials.order()`) and wants to compute the intervals chain. They call `foapy.partials.intervals_chain()` and receive a masked 1-D array of the same length, where non-empty positions contain the interval distance (counting actual positional distances including gap positions), and empty positions remain masked. + +**Why this priority**: Intervals are the core measurement of FOA. The critical semantic distinction of `foapy.partials` vs `foapy.ma` is that gaps contribute to interval distances — a gap between two occurrences of the same element increases the measured distance. + +**Independent Test**: Can be fully tested by calling `partials.intervals_chain()` on a masked order array and verifying that interval values at non-masked positions reflect actual positional distances (including any masked gap positions between occurrences). + +**Acceptance Scenarios**: + +1. **Given** a partial sequence `[--, C, T, C, --, G]` and its partial order `[--, 0, 1, 0, --, 2]`, **When** `partials.intervals_chain()` is called with `binding.start` and `chain_mode.boundary`, **Then** the result is `[--, 2, 3, 2, --, 6]` — intervals count positional distances including gap positions. +2. **Given** two consecutive non-masked occurrences of the same element with one masked gap between them, **When** `partials.intervals_chain()` is called, **Then** the interval distance is 2 (gap counts as distance), not 1. +3. **Given** a partial order with all non-masked positions, **When** `partials.intervals_chain()` is called, **Then** the result matches `foapy.core.intervals_chain()` applied to the same sequence. +4. **Given** a partial order with all positions masked, **When** `partials.intervals_chain()` is called, **Then** a fully masked 1-D array of the same length is returned. 
+ +--- + +### User Story 3 - Apply Tuple Boundary Strategy to Partial Intervals (Priority: P3) + +A researcher has a partial intervals chain and wants to apply a boundary strategy (lossy, normal, or redundant) to produce the final intervals tuple. They call `foapy.partials.intervals_tuple()` and receive a masked 1-D array with boundary-adjusted interval values: the same length as the input for `normal` and `lossy`, and length n + k for `redundant`, which appends k trailing intervals. + +**Why this priority**: Completes the pipeline to match core's full API. Required for all downstream characteristic computations that depend on boundary-adjusted intervals. + +**Independent Test**: Can be tested by verifying that `partials.intervals_tuple()` produces the same boundary adjustments as `foapy.core.intervals_tuple()` for the non-masked positions of an equivalent chain. + +**Acceptance Scenarios**: + +1. **Given** a partial intervals chain and `tuple_mode.lossy`, **When** `partials.intervals_tuple()` is called, **Then** boundary intervals at non-masked positions are dropped and their positions become masked in the result. +2. **Given** a partial intervals chain and `tuple_mode.normal`, **When** `partials.intervals_tuple()` is called, **Then** the partial structure is preserved and boundary intervals at non-masked positions are retained unchanged. +3. **Given** a partial intervals chain and `tuple_mode.redundant`, **When** `partials.intervals_tuple()` is called, **Then** boundary intervals at non-masked positions are expanded and the output remains a masked 1-D array. + +--- + +### User Story 4 - Extract Alphabet from Partial Sequence (Priority: P4) + +A researcher needs to know which unique elements appear in a partial sequence. They call `foapy.partials.alphabet()` with a masked array and receive a plain 1-D array of the unique non-masked elements in order of first appearance. + +**Why this priority**: A supporting utility used alongside `order()` and needed for result interpretation. 
+ +**Independent Test**: Can be tested by verifying that `partials.alphabet()` returns only unique non-masked values in first-appearance order, ignoring masked positions entirely. + +**Acceptance Scenarios**: + +1. **Given** a masked array `['a', --, 'b', 'a', --]`, **When** `partials.alphabet()` is called, **Then** the result is `['a', 'b']` — masked positions excluded, first-appearance order preserved. +2. **Given** a fully masked array, **When** `partials.alphabet()` is called, **Then** an empty array is returned. + +--- + +### Edge Cases + +- What happens when the partial sequence has only a single unique non-masked element? All intervals reduce to distances from the sequence boundary. +- What happens when adjacent non-masked positions are the same element? Interval distance is 1. +- How does a fully masked sequence propagate through the pipeline? Each stage returns a fully masked output of the same length. +- What happens when a non-masked element appears only once in the sequence? Only the boundary interval exists; under `tuple_mode.lossy` it is dropped. + +## Requirements *(mandatory)* + +### Functional Requirements + +- **FR-001**: `foapy.partials.order(X, return_alphabet=False)` MUST accept a masked 1-D array (or plain array, auto-wrapped as fully unmasked) and return a masked 1-D array of the same length, with alphabet indices at non-masked positions and masked values at masked positions. +- **FR-002**: `foapy.partials.order()` MUST raise `Not1DArrayException` when given a multi-dimensional input. +- **FR-003**: `foapy.partials.order()` with `return_alphabet=True` MUST return a tuple of `(masked_order_array, alphabet_array)` where `alphabet_array` contains only the unique non-masked values in first-appearance order. 
+- **FR-004**: `foapy.partials.intervals_chain(X, binding, chain_mode)` MUST accept a masked 1-D order array (or plain array, auto-wrapped as fully unmasked) and return a masked 1-D array of the same length, where interval distances at non-masked positions reflect actual positional distances including gap positions. +- **FR-005**: `foapy.partials.intervals_chain()` MUST treat masked gap positions as contributing to interval distance, so an element at position i and a matching element at position j with k masked gaps between them produces interval `j - i`. +- **FR-006**: `foapy.partials.intervals_tuple(chain, binding, tuple_mode)` MUST accept a masked 1-D intervals chain (or plain array, auto-wrapped as fully unmasked) and return a masked 1-D array with boundary strategy applied to non-masked positions. +- **FR-007**: `foapy.partials.alphabet(X)` MUST accept a masked 1-D array (or plain array, auto-wrapped as fully unmasked) and return a plain 1-D array of unique non-masked values in first-appearance order. +- **FR-008**: All `foapy.partials` functions MUST preserve the masked positions from input to output — a position that is masked in the input MUST be masked in the output. +- **FR-009**: The `foapy.partials` package MUST expose exactly four functions in its public API: `order`, `alphabet`, `intervals_chain`, `intervals_tuple`. `intervals_distribution` is excluded from scope. +- **FR-010**: When a `foapy.partials` function receives input with no masked positions, its result MUST be equivalent to the corresponding `foapy.core` function applied to the same data (as a plain array). +- **FR-011**: `foapy.partials` MUST be accessible as a submodule (`import foapy.partials` or `foapy.partials.order()`); its functions MUST NOT be hoisted to the `foapy.*` top-level namespace. + +### Key Entities + +- **Partial Sequence**: A 1-D masked array where masked positions represent empty/skip elements and non-masked positions hold symbolic values. 
+- **Partial Order**: A masked 1-D array where non-masked positions hold the alphabet index of the corresponding element and masked positions match the input mask. +- **Partial Intervals Chain**: A masked 1-D array of interval distances where non-masked positions hold the distance to the previous (or next) occurrence of the same element, counting positional distance including any gap positions between them. + +## Success Criteria *(mandatory)* + +### Measurable Outcomes + +- **SC-001**: All four functions (`order`, `alphabet`, `intervals_chain`, `intervals_tuple`) are callable via `import foapy.partials` without errors. +- **SC-002**: For any partial sequence with no masked positions, `foapy.partials` produces results identical to `foapy.core` applied to the same plain sequence. +- **SC-003**: 100% of the `foapy.core` test cases pass an equivalent `foapy.partials` test with unmasked arrays, confirming behavioral parity at zero gaps. +- **SC-004**: Gap positions in the input are preserved as masked positions in every output, verified across all four functions. +- **SC-005**: Interval distances computed by `partials.intervals_chain()` correctly reflect actual positional distances (including gaps), verified against known examples from the partials documentation. +- **SC-006**: The package is fully covered by automated tests with the same structure as existing core and ma test suites. + +## Assumptions + +- The `foapy.partials` package scope is limited to four functions: `order`, `alphabet`, `intervals_chain`, `intervals_tuple`. `intervals_distribution` and all characteristics functions are out of scope for this feature. +- Masked positions represent "empty" or "skip" elements — excluded from alphabet construction and order assignment, but their positions count toward interval distances. +- `foapy.partials.order()` returns a 1-D masked array (not the 2-D matrix returned by `foapy.ma.order()`), which is the defining structural difference between the two packages. 
+- All four functions auto-wrap plain numpy arrays or Python lists as fully unmasked masked arrays; no explicit `numpy.ma.MaskedArray` construction is required by callers. +- `foapy.partials` is accessible as a submodule only (`foapy.partials.order()`), matching the `foapy.ma` pattern — functions are not hoisted to the `foapy.*` top-level namespace. +- `foapy.partials` does not export `binding`, `chain_mode`, or `tuple_mode` — users import those from the top-level `foapy` package (e.g. `foapy.binding.start`). +- The sole runtime dependency remains `numpy >= 1.20`; no new dependencies are introduced. diff --git a/specs/004-partials-package/tasks.md b/specs/004-partials-package/tasks.md new file mode 100644 index 00000000..bd50cec4 --- /dev/null +++ b/specs/004-partials-package/tasks.md @@ -0,0 +1,199 @@ +# Tasks: foapy.partials Package + +**Input**: Design documents from `/specs/004-partials-package/` +**Prerequisites**: plan.md ✓, spec.md ✓, research.md ✓, data-model.md ✓, contracts/ ✓, quickstart.md ✓ + +**Tests**: Included — required by Constitution Principle II and SC-006 (full test coverage). + +**Organization**: Tasks grouped by user story (4 stories → Phases 3–6), each independently testable. + +## Format: `[ID] [P?] [Story] Description` + +- **[P]**: Parallelizable with other [P] tasks (different files, no dependencies) +- **[Story]**: Maps to user story (US1=order, US2=intervals_chain, US3=intervals_tuple, US4=alphabet) + +--- + +## Phase 1: Setup (Package Skeleton) + +**Purpose**: Create the `foapy.partials` package structure and wire it into `foapy`. + +- [X] T001 Create `src/foapy/partials/` directory with empty `__init__.py` shell (exports: `order`, `alphabet`, `intervals_chain`, `intervals_tuple` — stubs filled in per story) +- [X] T002 Update `src/foapy/__init__.py`: add `'partials'` to `__foapy_submodules__` set and add `if attr == "partials": import foapy.partials as partials; return partials` branch in `__getattr__` + +**Checkpoint**: `import foapy.partials` must not raise `AttributeError`. 
+ +--- + +## Phase 3: User Story 1 — `partials.order()` (Priority: P1) 🎯 MVP + +**Goal**: Deliver `partials.order()` — accepts a masked or plain 1-D array, returns a masked 1-D array of the same length with alphabet indices at non-masked positions. + +**Independent Test**: `import foapy.partials as p; import numpy.ma as ma; X = ma.masked_array(['a','b','a'], mask=[0,1,0]); r = p.order(X); assert r.shape == (3,) and list(r.compressed()) == [0, 0]` + +### Tests for User Story 1 + +> **Write these tests FIRST; ensure they FAIL before implementing `_order.py`** + +- [X] T003 [P] [US1] Write tests for `partials.order` in `tests/test_partials_order.py` covering: empty array, single element, all-unique symbols, all-same symbol, realistic dataset, fully masked, partially masked, no-mask passthrough (result equals `foapy.core.order`), multi-dimensional input raises `Not1DArrayException`, `return_alphabet=True` returns correct alphabet + +### Implementation for User Story 1 + +- [X] T004 [US1] Implement `src/foapy/partials/_order.py`: `ma.asarray` wrap, 1-D validation, `ar.compressed()` + `core.order()` + `result_data[~mask] = order_compressed` + `ma.masked_array(result_data, mask)`, `return_alphabet` branch +- [X] T005 [US1] Export `order` from `src/foapy/partials/__init__.py` + +**Checkpoint**: `tox -e default -- tests/test_partials_order.py -v` passes with zero failures. + +--- + +## Phase 4: User Story 2 — `partials.intervals_chain()` (Priority: P2) + +**Goal**: Deliver `partials.intervals_chain()` — accepts a masked or plain 1-D raw sequence, returns a masked 1-D array of the same length where interval distances at non-masked positions reflect actual positional distances including gap positions. 
+ +**Independent Test**: `X = ma.masked_array(['_','C','T','C','_','G'], mask=[1,0,0,0,1,0]); chain = p.intervals_chain(X, foapy.binding.start, foapy.chain_mode.boundary); assert list(chain.compressed()) == [2, 3, 2, 6]` + +### Tests for User Story 2 + +> **Write these tests FIRST; ensure they FAIL before implementing `_intervals_chain.py`** + +- [X] T006 [P] [US2] Write tests for `partials.intervals_chain` in `tests/test_partials_intervals_chain.py` covering: empty array, single element, all-unique symbols, all-same symbol, realistic dataset with gaps, fully masked array, partially masked (gaps count toward distance), no-mask passthrough (equals `foapy.core.intervals_chain`), invalid binding raises `ValueError`, invalid chain_mode raises `ValueError`, multi-dimensional input raises `Not1DArrayException`, both `binding.start` and `binding.end`, both `chain_mode.boundary` and `chain_mode.cycle` + +### Implementation for User Story 2 + +- [X] T007 [US2] Implement `src/foapy/partials/_intervals_chain.py`: `ma.asarray` wrap, parameter validation, `non_masked_idx = np.where(~full_mask)[0]`, binding.end reversal, stable argsort on `compressed_values`, group boundary detection (`first_mask`/`last_mask`), `chain_compressed[1:] = actual_pos[perm[1:]] - actual_pos[perm[:-1]]`, boundary intervals using `actual_pos[perm[first_mask]] + delta`, inverse permutation, `ma.masked_array(result_data, mask=full_mask)` +- [X] T008 [US2] Export `intervals_chain` from `src/foapy/partials/__init__.py` + +**Checkpoint**: `tox -e default -- tests/test_partials_intervals_chain.py -v` passes with zero failures. + +--- + +## Phase 5: User Story 3 — `partials.intervals_tuple()` (Priority: P3) + +**Goal**: Deliver `partials.intervals_tuple()` — applies `normal` / `lossy` / `redundant` boundary strategies to a masked 1-D intervals chain. `lossy` masks boundary positions in-place (same length); `redundant` appends k trailing intervals. 
+ +**Independent Test**: `chain = ma.masked_array([0,2,3,2,0,6], mask=[1,0,0,0,1,0]); result = p.intervals_tuple(chain, foapy.binding.start, foapy.tuple_mode.normal); assert result.shape == (6,) and np.array_equal(result.mask, chain.mask)` + +### Tests for User Story 3 + +> **Write these tests FIRST; ensure they FAIL before implementing `_intervals_tuple.py`** + +- [X] T009 [P] [US3] Write tests for `partials.intervals_tuple` in `tests/test_partials_intervals_tuple.py` covering: `tuple_mode.normal` (mask unchanged, values unchanged), `tuple_mode.lossy` (boundary positions additionally masked, output length == input length), `tuple_mode.redundant` (trailing k intervals appended unmasked, output length == n+k), no-mask passthrough for normal/lossy (non-masked values equal `foapy.core.intervals_tuple` result), empty input, fully masked input, invalid binding raises `ValueError`, invalid tuple_mode raises `ValueError` + +### Implementation for User Story 3 + +- [X] T010 [US3] Implement `src/foapy/partials/_intervals_tuple.py`: `ma.asarray` wrap, parameter validation, `normal` (copy), `lossy` (compress → `ar > positions` boundary detection → map boundary indices back to original positions → set `new_mask[boundary_original_idx] = True`), `redundant` (compress → `core.intervals_tuple` with redundant → split plain result at `len(compressed)` → `np.concatenate` with trailing; append unmasked) +- [X] T011 [US3] Export `intervals_tuple` from `src/foapy/partials/__init__.py` + +**Checkpoint**: `tox -e default -- tests/test_partials_intervals_tuple.py -v` passes with zero failures. + +--- + +## Phase 6: User Story 4 — `partials.alphabet()` (Priority: P4) + +**Goal**: Deliver `partials.alphabet()` — accepts a masked or plain 1-D array, returns a plain 1-D array of unique non-masked values in first-appearance order. 
+ +**Independent Test**: `X = ma.masked_array(['a','b','a','c'], mask=[0,1,0,0]); assert list(p.alphabet(X)) == ['a', 'c']` + +### Tests for User Story 4 + +> **Write these tests FIRST; ensure they FAIL before implementing `_alphabet.py`** + +- [X] T012 [P] [US4] Write tests for `partials.alphabet` in `tests/test_partials_alphabet.py` covering: empty array, single element, all-unique symbols, all-same symbol, realistic dataset, fully masked (returns empty array), partially masked (masked values excluded), no-mask passthrough (equals `foapy.core.alphabet`), multi-dimensional input raises `Not1DArrayException` + +### Implementation for User Story 4 + +- [X] T013 [US4] Implement `src/foapy/partials/_alphabet.py`: `ma.asarray` wrap, 1-D validation, `_, alphabet = core.order(ar.compressed(), return_alphabet=True); return alphabet` +- [X] T014 [US4] Export `alphabet` from `src/foapy/partials/__init__.py` + +**Checkpoint**: `tox -e default -- tests/test_partials_alphabet.py -v` passes with zero failures. + +--- + +## Phase 7: Polish & Cross-Cutting Concerns + +**Purpose**: End-to-end verification, pipeline consistency, and code quality. 
+ +- [X] T015 Add pipeline consistency test in `tests/test_partials_pipeline_consistency.py`: for any plain (unmasked) input, verify `partials.order == core.order`, `partials.intervals_chain == core.intervals_chain`, and `partials.intervals_tuple == core.intervals_tuple` across all binding × chain_mode × tuple_mode combinations +- [X] T016 Run full test suite `tox -e default` and verify zero failures across all tests including existing `foapy.core` and `foapy.ma` tests (no regressions) +- [X] T017 [P] Run linting `pipx run pre-commit run --all-files --show-diff-on-failure` and fix any black/isort/flake8 violations in `src/foapy/partials/` +- [X] T018 [P] Validate quickstart examples from `specs/004-partials-package/quickstart.md` execute without error in a Python REPL + +--- + +## Dependencies & Execution Order + +### Phase Dependencies + +- **Setup (Phase 1)**: No dependencies — start immediately +- **User Stories (Phases 3–6)**: Depend on Phase 1 completion; each story is independent of the others +- **Polish (Phase 7)**: Depends on all desired user stories being complete + +### User Story Dependencies + +- **US1 (P1)**: No dependency on US2/US3/US4 +- **US2 (P2)**: No dependency on US1/US3/US4 +- **US3 (P3)**: No dependency on US1/US2/US4 +- **US4 (P4)**: No dependency on US1/US2/US3 + +All four user stories can proceed in parallel after Phase 1 completes. + +### Within Each User Story + +1. Write tests (T003/T006/T009/T012) → **verify they FAIL** +2. Implement module (T004/T007/T010/T013) +3. Export from `__init__.py` (T005/T008/T011/T014) +4. 
**Verify tests now PASS** + +### Parallel Opportunities + +- T003, T006, T009, T012 (test writing) can all run in parallel — different files +- T004, T007, T010, T013 (implementations) can run in parallel — different files, but wait for respective test task first +- T017, T018 (polish) can run in parallel + +--- + +## Parallel Example: After Phase 1 + +```bash +# All four test tasks can be launched together: +Task T003: "Write tests for partials.order in tests/test_partials_order.py" +Task T006: "Write tests for partials.intervals_chain in tests/test_partials_intervals_chain.py" +Task T009: "Write tests for partials.intervals_tuple in tests/test_partials_intervals_tuple.py" +Task T012: "Write tests for partials.alphabet in tests/test_partials_alphabet.py" +``` + +--- + +## Implementation Strategy + +### MVP First (User Story 1 Only) + +1. Complete Phase 1: Setup (T001–T002) +2. Complete Phase 3: US1 (T003–T005) +3. **STOP and VALIDATE**: `tox -e default -- tests/test_partials_order.py -v` +4. `import foapy.partials; foapy.partials.order(...)` works end-to-end + +### Incremental Delivery + +1. Phase 1 → Foundation ready +2. Phase 3 (US1: order) → MVP: partials.order works +3. Phase 4 (US2: intervals_chain) → Full interval pipeline +4. Phase 5 (US3: intervals_tuple) → Complete boundary handling +5. Phase 6 (US4: alphabet) → Full public API complete +6. 
Phase 7 (Polish) → Test suite green, linting clean + +### Parallel Strategy + +With one developer, recommended order: T001 → T002 → T003 → T004 → T005 → T006 → T007 → T008 → T009 → T010 → T011 → T012 → T013 → T014 → T015 → T016 → T017 → T018 + +--- + +## Notes + +- [P] tasks operate on different files with no incomplete dependencies +- Each test task must produce failing tests BEFORE the paired implementation task runs +- Constitution Principle IV: all implementations must use vectorized numpy — no Python loops over array elements +- `intervals_chain` accepts the **raw sequence** (not the order output) — mirrors `foapy.core.intervals_chain` +- `intervals_tuple.lossy` preserves array length (masks boundary positions rather than removing them) +- `intervals_tuple.redundant` appends k trailing unmasked elements (output length = n + k) +- `foapy/__init__.py` update (T002) is the only modification to an existing file diff --git a/src/foapy/__init__.py b/src/foapy/__init__.py index 6d714b9a..5fc772ea 100644 --- a/src/foapy/__init__.py +++ b/src/foapy/__init__.py @@ -39,7 +39,7 @@ # __getattr__. Note that `distutils` (deprecated) and `array_api` # (experimental label) are not added here, because `from foapy import *` # must not raise any warnings - that's too disruptive. 
- __foapy_submodules__ = {"ma", "exceptions", "core", "characteristics"} + __foapy_submodules__ = {"ma", "exceptions", "core", "characteristics", "partials"} __all__ = list( __foapy_submodules__ @@ -76,6 +76,11 @@ def __getattr__(attr): return ma + if attr == "partials": + import foapy.partials as partials + + return partials + raise AttributeError( "module {!r} has no attribute " "{!r}".format(__name__, attr) ) diff --git a/src/foapy/partials/__init__.py b/src/foapy/partials/__init__.py new file mode 100644 index 00000000..e6b851dd --- /dev/null +++ b/src/foapy/partials/__init__.py @@ -0,0 +1,23 @@ +import sys + +try: + __FOAPY_SETUP__ +except NameError: + __FOAPY_SETUP__ = False + +if __FOAPY_SETUP__: + sys.stderr.write("Running from foapy source directory.\n") +else: + from ._alphabet import alphabet # noqa: F401 + from ._intervals_chain import intervals_chain # noqa: F401 + from ._intervals_tuple import intervals_tuple # noqa: F401 + from ._order import order # noqa: F401 + + __all__ = list( + { + "order", + "alphabet", + "intervals_chain", + "intervals_tuple", + } + ) diff --git a/src/foapy/partials/_alphabet.py b/src/foapy/partials/_alphabet.py new file mode 100644 index 00000000..8185880d --- /dev/null +++ b/src/foapy/partials/_alphabet.py @@ -0,0 +1,42 @@ +import numpy.ma as ma + +from foapy.core._order import order as core_order +from foapy.exceptions import Not1DArrayException + + +def alphabet(X): + """ + Extract the alphabet of a partial sequence. + + Parameters + ---------- + X : array_like or numpy.ma.MaskedArray + 1-D sequence (plain or masked). Masked positions are excluded. + + Returns + ------- + alphabet : numpy.ndarray, shape (p,) + Unique non-masked values in first-appearance order. + p = number of unique non-masked values. Empty array when all positions + are masked or input is empty. + + Raises + ------ + Not1DArrayException + When X has more than one dimension. 
+ """ + ar = ma.asarray(X) + + if ar.ndim > 1: + raise Not1DArrayException( + {"message": f"Incorrect array form. Expected d1 array, exists {ar.ndim}"} + ) + + compressed = ar.compressed() + if len(compressed) == 0: + import numpy as np + + return np.array([], dtype=ar.dtype) + + _, alphabet_values = core_order(compressed, return_alphabet=True) + return alphabet_values diff --git a/src/foapy/partials/_intervals_chain.py b/src/foapy/partials/_intervals_chain.py new file mode 100644 index 00000000..94e1b6df --- /dev/null +++ b/src/foapy/partials/_intervals_chain.py @@ -0,0 +1,121 @@ +import numpy as np +import numpy.ma as ma + +from foapy.core._binding import binding as binding_cls +from foapy.core._chain_mode import chain_mode as chain_mode_cls +from foapy.exceptions import Not1DArrayException + + +def intervals_chain(X, binding: int, chain_mode: int) -> ma.MaskedArray: + """ + Compute the partial intervals chain from a sequence with gaps. + + Unlike foapy.ma.intervals_chain, gap positions (masked values) are NOT + compressed out. Interval distances are measured using actual positional + indices in the full array, so gaps between two occurrences of the same + element increase the measured interval. + + Parameters + ---------- + X : array_like or numpy.ma.MaskedArray + 1-D raw sequence (plain or masked). Pass the original sequence, not + the order output. Masked positions are treated as gaps. + binding : int + ``binding.start`` (1) — intervals extracted left-to-right. + ``binding.end`` (2) — intervals extracted right-to-left. + chain_mode : int + ``chain_mode.boundary`` (1) — finite sequence; boundary intervals are + distances from sequence edges to first/last occurrence. + ``chain_mode.cycle`` (2) — cyclic; wrap-around distance used. + + Returns + ------- + numpy.ma.MaskedArray, shape (n,), dtype numpy.intp + Masked 1-D array of the same length as X. Non-masked positions hold + the interval distance (≥1). Masked positions are identical to the + input mask. 
+ + Raises + ------ + Not1DArrayException + When X has more than one dimension. + ValueError + When ``binding`` or ``chain_mode`` is invalid. + """ + if binding not in {binding_cls.start, binding_cls.end}: + raise ValueError( + {"message": "Invalid binding value. Use binding.start or binding.end."} + ) + + if chain_mode not in {chain_mode_cls.boundary, chain_mode_cls.cycle}: + raise ValueError( + { + "message": ( + "Invalid chain_mode value. " + "Use chain_mode.boundary or chain_mode.cycle." + ) + } + ) + + ar = ma.asarray(X) + + if ar.ndim > 1: + raise Not1DArrayException( + {"message": f"Incorrect array form. Expected d1 array, exists {ar.ndim}"} + ) + + n = len(ar) + full_mask = ma.getmaskarray(ar) + orig_non_masked_idx = np.where(~full_mask)[0] + compressed_values = ar.compressed() + m = len(compressed_values) + + result_data = np.zeros(n, dtype=np.intp) + + if m == 0: + return ma.masked_array(result_data, mask=full_mask) + + # For binding.end, reverse both values and positions to compute right-to-left. + if binding == binding_cls.end: + work_values = compressed_values[::-1] + work_pos = (n - 1 - orig_non_masked_idx)[::-1] + else: + work_values = compressed_values + work_pos = orig_non_masked_idx + + # Stable sort by value — same element group detection as core. + perm = np.argsort(work_values, kind="mergesort") + + # Detect group boundaries (first and last occurrence per unique value). + group_boundary = np.empty(m + 1, dtype=bool) + group_boundary[:1] = True + group_boundary[1:-1] = work_values[perm[1:]] != work_values[perm[:-1]] + group_boundary[-1:] = True + + first_mask_arr = group_boundary[:-1] + last_mask_arr = group_boundary[1:] + + chain_compressed = np.empty(m, dtype=np.intp) + + # Consecutive position differences within each group. + chain_compressed[1:] = work_pos[perm[1:]] - work_pos[perm[:-1]] + + # Boundary intervals for first occurrence of each group. 
+ if chain_mode == chain_mode_cls.cycle: + delta = n - work_pos[perm[last_mask_arr]] + else: + delta = 1 + + chain_compressed[first_mask_arr] = work_pos[perm[first_mask_arr]] + delta + + # Restore original compressed order via inverse permutation. + inverse_perm = np.empty(m, dtype=np.intp) + inverse_perm[perm] = np.arange(m) + result_compressed = chain_compressed[inverse_perm] + + # Reverse result back to original (non-reversed) order for binding.end. + if binding == binding_cls.end: + result_compressed = result_compressed[::-1] + + result_data[orig_non_masked_idx] = result_compressed + return ma.masked_array(result_data, mask=full_mask) diff --git a/src/foapy/partials/_intervals_tuple.py b/src/foapy/partials/_intervals_tuple.py new file mode 100644 index 00000000..ca640794 --- /dev/null +++ b/src/foapy/partials/_intervals_tuple.py @@ -0,0 +1,124 @@ +import numpy as np +import numpy.ma as ma + +from foapy.core._binding import binding as binding_cls +from foapy.core._tuple_mode import tuple_mode as tuple_mode_cls + + +def intervals_tuple(chain, binding: int, tuple_mode: int) -> ma.MaskedArray: + """ + Apply a boundary handling strategy to a partial intervals chain. + + Parameters + ---------- + chain : array_like or numpy.ma.MaskedArray + 1-D intervals chain produced by ``partials.intervals_chain``. + Plain arrays are auto-wrapped (treated as fully unmasked). + binding : int + Must match the binding used to produce the chain. + ``binding.start`` (1) or ``binding.end`` (2). + tuple_mode : int + ``tuple_mode.normal`` (2) — return chain unchanged. + ``tuple_mode.lossy`` (1) — mask boundary (first-occurrence) intervals + in-place; output length equals input length. + ``tuple_mode.redundant`` (3) — append k trailing complementary + boundary intervals; output length = n + k. + + Returns + ------- + numpy.ma.MaskedArray + ``normal`` / ``lossy``: shape (n,), dtype numpy.intp. + ``redundant``: shape (n + k,), dtype numpy.intp. 
+ + Raises + ------ + ValueError + When ``binding`` or ``tuple_mode`` is invalid. + """ + if binding not in {binding_cls.start, binding_cls.end}: + raise ValueError( + {"message": "Invalid binding value. Use binding.start or binding.end."} + ) + + valid_modes = { + tuple_mode_cls.lossy, + tuple_mode_cls.normal, + tuple_mode_cls.redundant, + } + if tuple_mode not in valid_modes: + raise ValueError( + { + "message": ( + "Invalid tuple_mode value. " + "Use tuple_mode.lossy, normal, or redundant." + ) + } + ) + + ar = ma.asarray(chain) + + if tuple_mode == tuple_mode_cls.normal: + return ar.copy() + + chain_mask = ma.getmaskarray(ar) + non_masked_idx = np.where(~chain_mask)[0] + compressed = ar.compressed().astype(np.intp) + m = len(compressed) + + if m == 0: + return ar.copy() + + if tuple_mode == tuple_mode_cls.lossy: + return _lossy(ar, chain_mask, non_masked_idx, compressed, binding) + + return _redundant(ar, chain_mask, non_masked_idx, compressed, binding, len(ar)) + + +def _lossy(ar, chain_mask, non_masked_idx, compressed, binding): + # Identify boundary (first-occurrence) intervals by the same criterion as + # core.intervals_tuple.lossy: ar[i] > i in the compressed chain. + # For binding.end, reverse compressed first (matches core reversal). + work = compressed[::-1] if binding == binding_cls.end else compressed + m = len(work) + + positions = np.arange(m, dtype=np.intp) + first = work > positions # True = boundary interval + + if binding == binding_cls.end: + # Indices in reversed compressed → map back to original compressed order. 
+ boundary_compressed_idx = m - 1 - np.where(first)[0] + else: + boundary_compressed_idx = np.where(first)[0] + + boundary_orig_positions = non_masked_idx[boundary_compressed_idx] + + new_mask = chain_mask.copy() + new_mask[boundary_orig_positions] = True + return ma.masked_array(ar.data.copy(), mask=new_mask) + + +def _redundant(ar, chain_mask, non_masked_idx, compressed, binding, n_full): + # Compute trailing intervals using actual positional distances in the full array. + # Uses the same "last occurrence" detection as core.intervals_tuple.redundant + # but substitutes full-array positions for compressed positions. + if binding == binding_cls.end: + work = compressed[::-1] + work_pos = (n_full - 1 - non_masked_idx)[::-1] + else: + work = compressed + work_pos = non_masked_idx + + m = len(work) + positions = np.arange(m, dtype=np.intp) + prev_pos = positions - work + + last_mask_arr = np.ones(m, dtype=bool) + valid_prev = prev_pos >= 0 + if np.any(valid_prev): + last_mask_arr[prev_pos[valid_prev]] = False + + trailing = n_full - work_pos[last_mask_arr] + + result_data = np.concatenate([ar.data, trailing]) + result_mask = np.concatenate([chain_mask, np.zeros(len(trailing), dtype=bool)]) + return ma.masked_array(result_data, mask=result_mask) diff --git a/src/foapy/partials/_order.py b/src/foapy/partials/_order.py new file mode 100644 index 00000000..73b0d130 --- /dev/null +++ b/src/foapy/partials/_order.py @@ -0,0 +1,61 @@ +import numpy as np +import numpy.ma as ma + +from foapy.core._order import order as core_order +from foapy.exceptions import Not1DArrayException + + +def order(X, return_alphabet=False): + """ + Map a partial sequence to its order, preserving gap positions. + + Parameters + ---------- + X : array_like or numpy.ma.MaskedArray + 1-D sequence (plain or masked). Masked positions are treated as gaps + and are preserved in the output. + return_alphabet : bool, optional + If True, also return the alphabet of non-masked unique values. 
+ + Returns + ------- + result : numpy.ma.MaskedArray, shape (n,), dtype numpy.intp + Masked 1-D array of the same length as X. Non-masked positions hold + the element's 0-based alphabet index (first-appearance order). + Masked positions are identical to the input mask. + alphabet : numpy.ndarray, shape (p,) + Only returned when return_alphabet=True. Unique non-masked values in + first-appearance order. p = number of unique non-masked values. + + Raises + ------ + Not1DArrayException + When X has more than one dimension. + """ + ar = ma.asarray(X) + + if ar.ndim > 1: + raise Not1DArrayException( + {"message": f"Incorrect array form. Expected d1 array, exists {ar.ndim}"} + ) + + n = len(ar) + full_mask = ma.getmaskarray(ar) + compressed = ar.compressed() + + if len(compressed) == 0: + result_data = np.zeros(n, dtype=np.intp) + result = ma.masked_array(result_data, mask=full_mask) + if return_alphabet: + return result, np.array([], dtype=ar.dtype) + return result + + order_compressed, alphabet_values = core_order(compressed, return_alphabet=True) + + result_data = np.zeros(n, dtype=np.intp) + result_data[~full_mask] = order_compressed + result = ma.masked_array(result_data, mask=full_mask) + + if return_alphabet: + return result, alphabet_values + return result diff --git a/tests/test_partials_alphabet.py b/tests/test_partials_alphabet.py new file mode 100644 index 00000000..91e186fc --- /dev/null +++ b/tests/test_partials_alphabet.py @@ -0,0 +1,140 @@ +from unittest import TestCase + +import numpy as np +import numpy.ma as ma +import pytest +from numpy.ma.testutils import assert_equal +from numpy.testing import assert_array_equal + +from foapy import alphabet as core_alphabet +from foapy.exceptions import Not1DArrayException +from foapy.partials import alphabet + + +class TestPartialsAlphabet(TestCase): + """ + Test foapy.partials.alphabet(X) -> ndarray. + + Returns a plain 1-D array of unique non-masked values in first-appearance + order. 
Masked positions are excluded entirely. + """ + + # ------------------------------------------------------------------------- + # Empty input + # ------------------------------------------------------------------------- + + def test_empty_array(self): + X = ma.masked_array([], mask=[]) + result = alphabet(X) + assert len(result) == 0 + + # ------------------------------------------------------------------------- + # Single element + # ------------------------------------------------------------------------- + + def test_single_unmasked_element(self): + X = ma.masked_array(["a"], mask=[0]) + result = alphabet(X) + assert_equal(result, ["a"]) + + def test_single_masked_element_returns_empty(self): + X = ma.masked_array(["a"], mask=[1]) + result = alphabet(X) + assert len(result) == 0 + + # ------------------------------------------------------------------------- + # All unique symbols + # ------------------------------------------------------------------------- + + def test_all_unique_strings(self): + X = ma.masked_array(["a", "b", "c", "d"], mask=[0, 0, 0, 0]) + result = alphabet(X) + assert_equal(result, ["a", "b", "c", "d"]) + + # ------------------------------------------------------------------------- + # All same symbol + # ------------------------------------------------------------------------- + + def test_all_same_symbol(self): + X = ma.masked_array(["a", "a", "a"], mask=[0, 0, 0]) + result = alphabet(X) + assert_equal(result, ["a"]) + + # ------------------------------------------------------------------------- + # Realistic dataset + # ------------------------------------------------------------------------- + + def test_realistic_string_dataset(self): + X = ma.masked_array(["a", "c", "c", "e", "d", "a"], mask=[0, 0, 0, 0, 0, 0]) + result = alphabet(X) + assert_equal(result, ["a", "c", "e", "d"]) + + def test_realistic_integer_dataset(self): + X = ma.masked_array([1, 2, 2, 3, 4, 1], mask=[0, 0, 0, 0, 0, 0]) + result = alphabet(X) + assert_equal(result, 
[1, 2, 3, 4]) + + # ------------------------------------------------------------------------- + # Fully masked + # ------------------------------------------------------------------------- + + def test_fully_masked_returns_empty(self): + X = ma.masked_array(["a", "b", "c"], mask=[1, 1, 1]) + result = alphabet(X) + assert len(result) == 0 + + # ------------------------------------------------------------------------- + # Partially masked — masked values excluded + # ------------------------------------------------------------------------- + + def test_partially_masked_excludes_masked_values(self): + X = ma.masked_array(["a", "b", "a", "c"], mask=[0, 1, 0, 0]) + result = alphabet(X) + assert_equal(result, ["a", "c"]) + + def test_partially_masked_first_appearance_order(self): + # 'b' appears in unmasked positions, but 'a' appears first + X = ma.masked_array(["a", "x", "b", "a", "b"], mask=[0, 1, 0, 0, 0]) + result = alphabet(X) + assert_equal(result, ["a", "b"]) + + # ------------------------------------------------------------------------- + # No-mask passthrough — must equal core.alphabet + # ------------------------------------------------------------------------- + + def test_no_mask_matches_core_alphabet(self): + data = ["a", "b", "a", "c", "d"] + X = ma.masked_array(data, mask=[0, 0, 0, 0, 0]) + core_result = core_alphabet(data) + partial_result = alphabet(X) + assert_array_equal(partial_result, core_result) + + # ------------------------------------------------------------------------- + # Plain array auto-wrapping + # ------------------------------------------------------------------------- + + def test_plain_list_accepted(self): + result = alphabet(["a", "b", "a"]) + assert_equal(result, ["a", "b"]) + + def test_plain_ndarray_accepted(self): + result = alphabet(np.array(["a", "b", "a"])) + assert_equal(result, ["a", "b"]) + + # ------------------------------------------------------------------------- + # Return type is plain ndarray (not masked) + # 
------------------------------------------------------------------------- + + def test_returns_plain_ndarray(self): + X = ma.masked_array(["a", "b"], mask=[0, 1]) + result = alphabet(X) + assert not isinstance(result, ma.MaskedArray) + + # ------------------------------------------------------------------------- + # Error handling + # ------------------------------------------------------------------------- + + def test_2d_array_raises_not1d(self): + X = ma.masked_array([[1, 2], [3, 4]]) + with pytest.raises(Not1DArrayException): + alphabet(X) diff --git a/tests/test_partials_intervals_chain.py b/tests/test_partials_intervals_chain.py new file mode 100644 index 00000000..faded390 --- /dev/null +++ b/tests/test_partials_intervals_chain.py @@ -0,0 +1,217 @@ +from unittest import TestCase + +import numpy as np +import numpy.ma as ma +import pytest +from numpy.testing import assert_array_equal + +from foapy import binding, chain_mode +from foapy.core import intervals_chain as core_intervals_chain +from foapy.exceptions import Not1DArrayException +from foapy.partials import intervals_chain + + +class TestPartialsIntervalsChain(TestCase): + """ + Test foapy.partials.intervals_chain(X, binding, chain_mode) -> masked_array. + + Key semantic: gap positions (masked values) are preserved in output and + their positional distances COUNT toward interval values. 
+ """ + + # ------------------------------------------------------------------------- + # Empty input + # ------------------------------------------------------------------------- + + def test_empty_array(self): + X = ma.masked_array([], mask=[]) + result = intervals_chain(X, binding.start, chain_mode.boundary) + assert result.shape == (0,) + + # ------------------------------------------------------------------------- + # Single element + # ------------------------------------------------------------------------- + + def test_single_unmasked_element_boundary(self): + X = ma.masked_array(["a"], mask=[0]) + result = intervals_chain(X, binding.start, chain_mode.boundary) + assert result.shape == (1,) + assert_array_equal(result.compressed(), [1]) + + def test_single_masked_element_returns_masked(self): + X = ma.masked_array(["a"], mask=[1]) + result = intervals_chain(X, binding.start, chain_mode.boundary) + assert result.shape == (1,) + assert np.all(ma.getmaskarray(result)) + + # ------------------------------------------------------------------------- + # All unique symbols + # ------------------------------------------------------------------------- + + def test_all_unique_boundary_start(self): + X = ma.masked_array(["a", "b", "c"], mask=[0, 0, 0]) + result = intervals_chain(X, binding.start, chain_mode.boundary) + assert result.shape == (3,) + # Each symbol appears once; intervals = position + 1 + assert_array_equal(result.compressed(), [1, 2, 3]) + + # ------------------------------------------------------------------------- + # All same symbol + # ------------------------------------------------------------------------- + + def test_all_same_boundary_start(self): + X = ma.masked_array(["a", "a", "a"], mask=[0, 0, 0]) + result = intervals_chain(X, binding.start, chain_mode.boundary) + # Intervals: first=1, then consecutive diffs = 1, 1 + assert_array_equal(result.compressed(), [1, 1, 1]) + + # 
------------------------------------------------------------------------- + # Realistic dataset — no gaps + # ------------------------------------------------------------------------- + + def test_realistic_no_gaps_boundary_start(self): + X = ma.masked_array(["b", "a", "b", "c", "b"], mask=[0, 0, 0, 0, 0]) + result = intervals_chain(X, binding.start, chain_mode.boundary) + expected = core_intervals_chain( + ["b", "a", "b", "c", "b"], binding.start, chain_mode.boundary + ) + assert_array_equal(result.compressed(), expected) + + # ------------------------------------------------------------------------- + # Gaps count toward distance — core semantic distinction + # ------------------------------------------------------------------------- + + def test_gaps_count_toward_interval_distance(self): + # X = [-, C, T, C, -, G] + X = ma.masked_array( + ["_", "C", "T", "C", "_", "G"], + mask=[1, 0, 0, 0, 1, 0], + ) + result = intervals_chain(X, binding.start, chain_mode.boundary) + # C at position 1: first, interval = 1+1 = 2 + # T at position 2: first, interval = 2+1 = 3 + # C at position 3: second, prev at 1, interval = 3-1 = 2 + # G at position 5: first, interval = 5+1 = 6 + assert result.shape == (6,) + assert_array_equal(ma.getmaskarray(result), [1, 0, 0, 0, 1, 0]) + assert_array_equal(result.compressed(), [2, 3, 2, 6]) + + def test_single_gap_between_same_element(self): + # X = [A, --, A] — gap should increase interval to 2 (not 1) + X = ma.masked_array(["A", "x", "A"], mask=[0, 1, 0]) + result = intervals_chain(X, binding.start, chain_mode.boundary) + assert result.shape == (3,) + assert_array_equal(ma.getmaskarray(result), [0, 1, 0]) + # A at position 0: first, interval = 1; A at position 2: prev=0, interval=2 + assert_array_equal(result.compressed(), [1, 2]) + + def test_multiple_gaps_increase_interval(self): + # X = [A, --, --, A] — two gaps → interval = 3 + X = ma.masked_array(["A", "x", "y", "A"], mask=[0, 1, 1, 0]) + result = intervals_chain(X, binding.start, 
chain_mode.boundary) + assert_array_equal(result.compressed(), [1, 3]) + + # ------------------------------------------------------------------------- + # Fully masked + # ------------------------------------------------------------------------- + + def test_fully_masked_returns_full_mask(self): + X = ma.masked_array(["a", "b", "c"], mask=[1, 1, 1]) + result = intervals_chain(X, binding.start, chain_mode.boundary) + assert result.shape == (3,) + assert np.all(ma.getmaskarray(result)) + + # ------------------------------------------------------------------------- + # No-mask passthrough — must match core + # ------------------------------------------------------------------------- + + def test_no_mask_matches_core_boundary_start(self): + data = ["b", "a", "b", "c", "b"] + X = ma.masked_array(data, mask=[0] * 5) + core_result = core_intervals_chain(data, binding.start, chain_mode.boundary) + partial_result = intervals_chain(X, binding.start, chain_mode.boundary) + assert_array_equal(partial_result.compressed(), core_result) + + def test_no_mask_matches_core_boundary_end(self): + data = ["b", "a", "b", "c", "b"] + X = ma.masked_array(data, mask=[0] * 5) + core_result = core_intervals_chain(data, binding.end, chain_mode.boundary) + partial_result = intervals_chain(X, binding.end, chain_mode.boundary) + assert_array_equal(partial_result.compressed(), core_result) + + def test_no_mask_matches_core_cycle_start(self): + data = ["b", "a", "b", "c", "b"] + X = ma.masked_array(data, mask=[0] * 5) + core_result = core_intervals_chain(data, binding.start, chain_mode.cycle) + partial_result = intervals_chain(X, binding.start, chain_mode.cycle) + assert_array_equal(partial_result.compressed(), core_result) + + def test_no_mask_matches_core_cycle_end(self): + data = ["b", "a", "b", "c", "b"] + X = ma.masked_array(data, mask=[0] * 5) + core_result = core_intervals_chain(data, binding.end, chain_mode.cycle) + partial_result = intervals_chain(X, binding.end, chain_mode.cycle) + 
assert_array_equal(partial_result.compressed(), core_result) + + # ------------------------------------------------------------------------- + # binding.end with gaps + # ------------------------------------------------------------------------- + + def test_binding_end_with_gaps(self): + # X = [A, --, B, A] with binding.end + X = ma.masked_array(["A", "x", "B", "A"], mask=[0, 1, 0, 0]) + result = intervals_chain(X, binding.end, chain_mode.boundary) + assert result.shape == (4,) + assert_array_equal(ma.getmaskarray(result), [0, 1, 0, 0]) + + # ------------------------------------------------------------------------- + # chain_mode.cycle with gaps + # ------------------------------------------------------------------------- + + def test_cycle_mode_with_gaps(self): + X = ma.masked_array(["A", "x", "A"], mask=[0, 1, 0]) + result = intervals_chain(X, binding.start, chain_mode.cycle) + assert result.shape == (3,) + assert_array_equal(ma.getmaskarray(result), [0, 1, 0]) + # n=3; A at positions 0,2 + # first (cycle): pos[0] + (n - pos[last]) = 0 + (3 - 2) = 1 + # second: pos[2] - pos[0] = 2 + assert_array_equal(result.compressed(), [1, 2]) + + # ------------------------------------------------------------------------- + # Output mask identical to input mask + # ------------------------------------------------------------------------- + + def test_output_mask_identical_to_input_mask(self): + mask = [0, 1, 0, 1, 0] + X = ma.masked_array(["a", "b", "c", "d", "a"], mask=mask) + result = intervals_chain(X, binding.start, chain_mode.boundary) + assert result.shape == (5,) + assert_array_equal(ma.getmaskarray(result), mask) + + # ------------------------------------------------------------------------- + # Plain array auto-wrapping + # ------------------------------------------------------------------------- + + def test_plain_list_accepted(self): + result = intervals_chain(["a", "b", "a"], binding.start, chain_mode.boundary) + assert result.shape == (3,) + + # 
------------------------------------------------------------------------- + # Error handling + # ------------------------------------------------------------------------- + + def test_invalid_binding_raises_value_error(self): + X = ma.masked_array(["a"], mask=[0]) + with pytest.raises(ValueError): + intervals_chain(X, 99, chain_mode.boundary) + + def test_invalid_chain_mode_raises_value_error(self): + X = ma.masked_array(["a"], mask=[0]) + with pytest.raises(ValueError): + intervals_chain(X, binding.start, 99) + + def test_2d_array_raises_not1d(self): + X = ma.masked_array([[1, 2], [3, 4]]) + with pytest.raises(Not1DArrayException): + intervals_chain(X, binding.start, chain_mode.boundary) diff --git a/tests/test_partials_intervals_tuple.py b/tests/test_partials_intervals_tuple.py new file mode 100644 index 00000000..45521f04 --- /dev/null +++ b/tests/test_partials_intervals_tuple.py @@ -0,0 +1,205 @@ +from unittest import TestCase + +import numpy as np +import numpy.ma as ma +import pytest +from numpy.testing import assert_array_equal + +from foapy import binding +from foapy.core import intervals_tuple as core_intervals_tuple +from foapy.core import tuple_mode +from foapy.partials import intervals_tuple + + +class TestPartialsIntervalsTuple(TestCase): + """ + Test foapy.partials.intervals_tuple(chain, binding, tuple_mode) -> masked_array. + + tuple_mode.normal : same length, same mask, values unchanged. + tuple_mode.lossy : same length, boundary positions additionally masked. + tuple_mode.redundant : length n+k, k trailing unmasked intervals appended. 
+ """ + + # ------------------------------------------------------------------------- + # Empty input + # ------------------------------------------------------------------------- + + def test_empty_normal(self): + chain = ma.masked_array([], mask=[], dtype=np.intp) + result = intervals_tuple(chain, binding.start, tuple_mode.normal) + assert result.shape == (0,) + + def test_empty_lossy(self): + chain = ma.masked_array([], mask=[], dtype=np.intp) + result = intervals_tuple(chain, binding.start, tuple_mode.lossy) + assert result.shape == (0,) + + def test_empty_redundant(self): + chain = ma.masked_array([], mask=[], dtype=np.intp) + result = intervals_tuple(chain, binding.start, tuple_mode.redundant) + assert result.shape == (0,) + + # ------------------------------------------------------------------------- + # Fully masked input + # ------------------------------------------------------------------------- + + def test_fully_masked_normal(self): + chain = ma.masked_array([2, 3, 2, 6], mask=[1, 1, 1, 1]) + result = intervals_tuple(chain, binding.start, tuple_mode.normal) + assert result.shape == (4,) + assert np.all(ma.getmaskarray(result)) + + def test_fully_masked_lossy(self): + chain = ma.masked_array([2, 3, 2, 6], mask=[1, 1, 1, 1]) + result = intervals_tuple(chain, binding.start, tuple_mode.lossy) + assert result.shape == (4,) + assert np.all(ma.getmaskarray(result)) + + # ------------------------------------------------------------------------- + # tuple_mode.normal — mask and values unchanged + # ------------------------------------------------------------------------- + + def test_normal_preserves_mask(self): + mask = [1, 0, 0, 0, 1, 0] + chain = ma.masked_array([0, 2, 3, 2, 0, 6], mask=mask, dtype=np.intp) + result = intervals_tuple(chain, binding.start, tuple_mode.normal) + assert result.shape == (6,) + assert_array_equal(ma.getmaskarray(result), mask) + + def test_normal_preserves_values(self): + chain = ma.masked_array( + [0, 2, 3, 2, 0, 6], mask=[1, 0, 
0, 0, 1, 0], dtype=np.intp + ) + result = intervals_tuple(chain, binding.start, tuple_mode.normal) + assert_array_equal(result.compressed(), [2, 3, 2, 6]) + + # ------------------------------------------------------------------------- + # tuple_mode.lossy — boundary positions additionally masked, same length + # ------------------------------------------------------------------------- + + def test_lossy_same_length_as_input(self): + chain = ma.masked_array( + [0, 2, 3, 2, 0, 6], mask=[1, 0, 0, 0, 1, 0], dtype=np.intp + ) + result = intervals_tuple(chain, binding.start, tuple_mode.lossy) + assert len(result) == len(chain) + + def test_lossy_masks_boundary_positions(self): + # compressed = [2, 3, 2, 6] at positions [1,2,3,5] + # compressed indices [0,1,2,3], boundary: compressed[i] > i + # 2>0=T, 3>1=T, 2>2=F, 6>3=T + # boundary compressed indices: [0,1,3] → original positions [1,2,5] + # non-boundary: compressed index [2] → original position [3] + chain = ma.masked_array( + [0, 2, 3, 2, 0, 6], mask=[1, 0, 0, 0, 1, 0], dtype=np.intp + ) + result = intervals_tuple(chain, binding.start, tuple_mode.lossy) + # positions [1,2,5] additionally masked; [3] remains unmasked + expected_mask = [1, 1, 1, 0, 1, 1] + assert_array_equal(ma.getmaskarray(result), expected_mask) + assert_array_equal(result.compressed(), [2]) + + def test_lossy_original_mask_preserved(self): + # Input mask positions must remain masked in output + chain = ma.masked_array( + [0, 2, 3, 2, 0, 6], mask=[1, 0, 0, 0, 1, 0], dtype=np.intp + ) + result = intervals_tuple(chain, binding.start, tuple_mode.lossy) + input_mask = ma.getmaskarray(chain) + output_mask = ma.getmaskarray(result) + # All originally masked positions must still be masked + assert np.all(output_mask[input_mask]) + + def test_lossy_no_mask_non_boundary_values_kept(self): + # For an unmasked chain, lossy should keep same non-boundary values as core + plain_chain = np.array([1, 2, 2, 4, 2], dtype=np.intp) + masked_chain = 
ma.masked_array(plain_chain, mask=[0] * 5) + core_result = core_intervals_tuple(plain_chain, binding.start, tuple_mode.lossy) + partial_result = intervals_tuple(masked_chain, binding.start, tuple_mode.lossy) + assert_array_equal(partial_result.compressed(), core_result) + + # ------------------------------------------------------------------------- + # tuple_mode.redundant — trailing intervals appended + # ------------------------------------------------------------------------- + + def test_redundant_length_greater_than_input(self): + chain = ma.masked_array( + [0, 2, 3, 2, 0, 6], mask=[1, 0, 0, 0, 1, 0], dtype=np.intp + ) + result = intervals_tuple(chain, binding.start, tuple_mode.redundant) + assert len(result) > len(chain) + + def test_redundant_trailing_elements_unmasked(self): + chain = ma.masked_array( + [0, 2, 3, 2, 0, 6], mask=[1, 0, 0, 0, 1, 0], dtype=np.intp + ) + result = intervals_tuple(chain, binding.start, tuple_mode.redundant) + # Trailing elements (beyond len(chain)) must be unmasked + assert not np.any(ma.getmaskarray(result)[len(chain) :]) + + def test_redundant_input_portion_mask_unchanged(self): + chain = ma.masked_array( + [0, 2, 3, 2, 0, 6], mask=[1, 0, 0, 0, 1, 0], dtype=np.intp + ) + result = intervals_tuple(chain, binding.start, tuple_mode.redundant) + # The first n positions must preserve the original mask + assert_array_equal( + ma.getmaskarray(result)[: len(chain)], ma.getmaskarray(chain) + ) + + def test_redundant_no_mask_matches_core_values(self): + plain_chain = np.array([1, 2, 2, 4, 2], dtype=np.intp) + masked_chain = ma.masked_array(plain_chain, mask=[0] * 5) + core_result = core_intervals_tuple( + plain_chain, binding.start, tuple_mode.redundant + ) + partial_result = intervals_tuple( + masked_chain, binding.start, tuple_mode.redundant + ) + # All values (original + trailing) should match core + assert_array_equal(partial_result.data, core_result) + + # ------------------------------------------------------------------------- + # 
binding.end + # ------------------------------------------------------------------------- + + def test_lossy_binding_end_no_mask_same_values_as_core(self): + # For binding.end, position-preserving partials returns non-boundary values + # in left-to-right order; core returns them in right-to-left order. + # The multisets must be equal. + plain_chain = np.array([1, 2, 2, 4, 2], dtype=np.intp) + masked_chain = ma.masked_array(plain_chain, mask=[0] * 5) + core_result = core_intervals_tuple(plain_chain, binding.end, tuple_mode.lossy) + partial_result = intervals_tuple(masked_chain, binding.end, tuple_mode.lossy) + assert_array_equal( + np.sort(partial_result.compressed()), + np.sort(core_result), + ) + + def test_normal_binding_end_preserves_mask(self): + chain = ma.masked_array([2, 0, 3, 2], mask=[0, 1, 0, 0], dtype=np.intp) + result = intervals_tuple(chain, binding.end, tuple_mode.normal) + assert_array_equal(ma.getmaskarray(result), [0, 1, 0, 0]) + + # ------------------------------------------------------------------------- + # Error handling + # ------------------------------------------------------------------------- + + def test_invalid_binding_raises_value_error(self): + chain = ma.masked_array([1, 2], mask=[0, 0], dtype=np.intp) + with pytest.raises(ValueError): + intervals_tuple(chain, 99, tuple_mode.normal) + + def test_invalid_tuple_mode_raises_value_error(self): + chain = ma.masked_array([1, 2], mask=[0, 0], dtype=np.intp) + with pytest.raises(ValueError): + intervals_tuple(chain, binding.start, 99) + + # ------------------------------------------------------------------------- + # Plain array auto-wrapping + # ------------------------------------------------------------------------- + + def test_plain_array_accepted(self): + plain = np.array([1, 2, 2, 4, 2], dtype=np.intp) + result = intervals_tuple(plain, binding.start, tuple_mode.normal) + assert result.shape == plain.shape diff --git a/tests/test_partials_order.py b/tests/test_partials_order.py new file 
mode 100644
index 00000000..c45fa3c3
--- /dev/null
+++ b/tests/test_partials_order.py
@@ -0,0 +1,196 @@
+from unittest import TestCase
+
+import numpy as np
+import numpy.ma as ma
+import pytest
+from numpy.ma.testutils import assert_equal
+from numpy.testing import assert_array_equal
+
+from foapy import order as core_order
+from foapy.exceptions import Not1DArrayException
+from foapy.partials import order
+
+
+class TestPartialsOrder(TestCase):
+    """
+    Test foapy.partials.order(X, return_alphabet=False) -> masked_array.
+
+    Returns a 1-D masked array of the same length as the input.
+    Non-masked positions hold the element's 0-based alphabet index.
+    Masked positions remain masked in the output.
+
+    Whole-array comparisons go through ``_assert_masked_equal`` because
+    ``numpy.ma.testutils.assert_equal`` alone treats a position masked in
+    either operand as equal, so it cannot detect a wrong output mask.
+    """
+
+    def _assert_masked_equal(self, result, expected):
+        """
+        Assert that result and expected agree on both mask and data.
+
+        The masks are compared first, element by element; once they are
+        known to be identical, ``assert_equal`` checks the values at the
+        positions that are not masked.
+        """
+        assert_array_equal(ma.getmaskarray(result), ma.getmaskarray(expected))
+        assert_equal(result, expected)
+
+    # -------------------------------------------------------------------------
+    # Empty input
+    # -------------------------------------------------------------------------
+
+    def test_empty_array(self):
+        X = ma.masked_array([], mask=[])
+        result = order(X)
+        assert result.shape == (0,)
+        self._assert_masked_equal(result, ma.masked_array([], mask=[]))
+
+    # -------------------------------------------------------------------------
+    # Single element
+    # -------------------------------------------------------------------------
+
+    def test_single_unmasked_element(self):
+        X = ma.masked_array(["a"], mask=[0])
+        result = order(X)
+        expected = ma.masked_array([0], mask=[0])
+        self._assert_masked_equal(result, expected)
+
+    def test_single_masked_element(self):
+        X = ma.masked_array(["a"], mask=[1])
+        result = order(X)
+        expected = ma.masked_array([0], mask=[1])
+        # Only the mask is verified here; data under a mask is not compared.
+        self._assert_masked_equal(result, expected)
+
+    # -------------------------------------------------------------------------
+    # All unique symbols
+    # -------------------------------------------------------------------------
+
+    def test_all_unique_strings(self):
+        X = ma.masked_array(["a", "b", "c", "d"], mask=[0, 0, 0, 0])
+        result = order(X)
+        expected = ma.masked_array([0, 1, 2, 3], mask=[0, 0, 0, 0])
+        self._assert_masked_equal(result, expected)
+
+    def test_all_unique_integers(self):
+        X = ma.masked_array([10, 20, 30], mask=[0, 0, 0])
+        result = order(X)
+        expected = ma.masked_array([0, 1, 2], mask=[0, 0, 0])
+        self._assert_masked_equal(result, expected)
+
+    # -------------------------------------------------------------------------
+    # All same symbol
+    # -------------------------------------------------------------------------
+
+    def test_all_same_symbol(self):
+        X = ma.masked_array(["a", "a", "a"], mask=[0, 0, 0])
+        result = order(X)
+        expected = ma.masked_array([0, 0, 0], mask=[0, 0, 0])
+        self._assert_masked_equal(result, expected)
+
+    # -------------------------------------------------------------------------
+    # Realistic dataset
+    # -------------------------------------------------------------------------
+
+    def test_realistic_string_dataset(self):
+        X = ma.masked_array(["a", "b", "a", "c", "d"], mask=[0, 0, 0, 0, 0])
+        result = order(X)
+        expected = ma.masked_array([0, 1, 0, 2, 3], mask=[0, 0, 0, 0, 0])
+        self._assert_masked_equal(result, expected)
+
+    def test_realistic_integer_dataset(self):
+        X = ma.masked_array([1, 2, 2, 3, 4, 1], mask=[0, 0, 0, 0, 0, 0])
+        result = order(X)
+        expected = ma.masked_array([0, 1, 1, 2, 3, 0], mask=[0, 0, 0, 0, 0, 0])
+        self._assert_masked_equal(result, expected)
+
+    # -------------------------------------------------------------------------
+    # Fully masked
+    # -------------------------------------------------------------------------
+
+    def test_fully_masked_returns_full_mask(self):
+        X = ma.masked_array(["a", "b", "c"], mask=[1, 1, 1])
+        result = order(X)
+        assert result.shape == (3,)
+        assert np.all(ma.getmaskarray(result))
+
+    # -------------------------------------------------------------------------
+    # Partially masked — gap positions preserved
+    # -------------------------------------------------------------------------
+
+    def test_partially_masked_preserves_gap_positions(self):
+        # X = ['a', --, 'b', 'a', --]
+        X = ma.masked_array(["a", "x", "b", "a", "y"], mask=[0, 1, 0, 0, 1])
+        result = order(X)
+        expected_data_at_non_masked = [0, 1, 0]
+        assert result.shape == (5,)
+        assert_array_equal(ma.getmaskarray(result), [0, 1, 0, 0, 1])
+        assert_array_equal(result.compressed(), expected_data_at_non_masked)
+
+    def test_partial_mask_output_same_length_as_input(self):
+        X = ma.masked_array(["a", "b", "a"], mask=[0, 1, 0])
+        result = order(X)
+        assert len(result) == len(X)
+
+    def test_partial_mask_output_mask_identical_to_input(self):
+        mask = [0, 1, 0, 1, 0]
+        X = ma.masked_array(["a", "b", "c", "d", "a"], mask=mask)
+        result = order(X)
+        assert_array_equal(ma.getmaskarray(result), mask)
+
+    # -------------------------------------------------------------------------
+    # No-mask passthrough — must equal core.order
+    # -------------------------------------------------------------------------
+
+    def test_no_mask_matches_core_order(self):
+        data = ["a", "b", "a", "c", "d"]
+        X = ma.masked_array(data, mask=[0, 0, 0, 0, 0])
+        core_result = core_order(data)
+        partial_result = order(X)
+        assert_array_equal(partial_result.compressed(), core_result)
+
+    # -------------------------------------------------------------------------
+    # Plain array auto-wrapping
+    # -------------------------------------------------------------------------
+
+    def test_plain_list_accepted(self):
+        result = order(["a", "b", "a"])
+        assert result.shape == (3,)
+        assert_array_equal(result.compressed(), [0, 1, 0])
+
+    def test_plain_ndarray_accepted(self):
+        result = order(np.array(["a", "b", "a"]))
+        assert result.shape == (3,)
+        assert_array_equal(result.compressed(), [0, 1, 0])
+
+    # -------------------------------------------------------------------------
+    # return_alphabet=True
+    # -------------------------------------------------------------------------
+
+    def test_return_alphabet_unmasked(self):
+        X = ma.masked_array(["a", "c", "c", "e", "d", "a"], mask=[0, 0, 0, 0, 0, 0])
+        result, alph = order(X, return_alphabet=True)
+        assert_array_equal(result.compressed(), [0, 1, 1, 2, 3, 0])
+        assert_equal(alph, ["a", "c", "e", "d"])
+
+    def test_return_alphabet_with_mask(self):
+        # X = ['a', --, 'b', 'a', --]
+        X = ma.masked_array(["a", "x", "b", "a", "y"], mask=[0, 1, 0, 0, 1])
+        result, alph = order(X, return_alphabet=True)
+        assert_array_equal(result.compressed(), [0, 1, 0])
+        assert_equal(alph, ["a", "b"])
+
+    def test_return_alphabet_fully_masked(self):
+        X = ma.masked_array(["a", "b"], mask=[1, 1])
+        result, alph = order(X, return_alphabet=True)
+        assert np.all(ma.getmaskarray(result))
+        assert len(alph) == 0
+
+    # -------------------------------------------------------------------------
+    # Error handling
+    # -------------------------------------------------------------------------
+
+    def test_2d_array_raises_not1d(self):
+        X = ma.masked_array([[1, 2], [3, 4]])
+        with pytest.raises(Not1DArrayException):
+            order(X)
diff --git a/tests/test_partials_pipeline_consistency.py b/tests/test_partials_pipeline_consistency.py
new file mode 100644
index 00000000..54c8b832
--- /dev/null
+++ b/tests/test_partials_pipeline_consistency.py
@@ -0,0 +1,160 @@
+"""
+Pipeline consistency tests: partials functions must equal core functions
+when input has no masked positions (SC-003).
+"""
+
+from unittest import TestCase
+
+import numpy as np
+import numpy.ma as ma
+from numpy.testing import assert_array_equal
+
+from foapy import binding, chain_mode
+from foapy.core import alphabet as core_alphabet
+from foapy.core import intervals_chain as core_intervals_chain
+from foapy.core import intervals_tuple as core_intervals_tuple
+from foapy.core import order as core_order
+from foapy.core import (
+    tuple_mode,
+)
+from foapy.partials import alphabet as partials_alphabet
+from foapy.partials import intervals_chain as partials_intervals_chain
+from foapy.partials import intervals_tuple as partials_intervals_tuple
+from foapy.partials import order as partials_order
+
+_DATASETS = [
+    ["a", "b", "a", "c", "b"],
+    ["a", "a", "a"],
+    ["a", "b", "c", "d"],
+    [1, 2, 3, 2, 1],
+]
+
+
+class TestPartialsCoreConsistency(TestCase):
+    """
+    Check foapy.partials against foapy.core on fully unmasked input.
+
+    Every test wraps plain data in a masked array whose mask is all
+    zeros, runs the core and the partials variant of one function and
+    asserts that both produce the same values. The lossy and redundant
+    intervals_tuple tests sort before comparing (see comments there).
+    """
+
+    # -------------------------------------------------------------------------
+    # Helpers
+    # -------------------------------------------------------------------------
+
+    def _unmasked(self, data):
+        """
+        Wrap data in a masked array whose mask is all zeros.
+        """
+        no_mask = [0] * len(data)
+        return ma.masked_array(data, mask=no_mask)
+
+    def _chain_cases(self):
+        """
+        Yield each (data, binding, chain_mode) combination, iterating
+        datasets outermost and chain modes innermost.
+        """
+        bindings = (binding.start, binding.end)
+        chain_modes = (chain_mode.boundary, chain_mode.cycle)
+        for data in _DATASETS:
+            for b in bindings:
+                for cm in chain_modes:
+                    yield data, b, cm
+
+    def _tuple_results(self, data, b, cm, mode):
+        """
+        Build the core intervals chain for one case and pass it, plain
+        and wrapped unmasked, to the core and partials intervals_tuple.
+
+        Returns the chain length, the core result and the partials
+        result, in that order.
+        """
+        chain = core_intervals_chain(data, b, cm)
+        masked_chain = self._unmasked(chain)
+        core_result = core_intervals_tuple(chain, b, mode)
+        partial_result = partials_intervals_tuple(masked_chain, b, mode)
+        return len(chain), core_result, partial_result
+
+    # -------------------------------------------------------------------------
+    # order
+    # -------------------------------------------------------------------------
+
+    def test_order_matches_core_for_all_datasets(self):
+        for data in _DATASETS:
+            with self.subTest(data=data):
+                wrapped = self._unmasked(data)
+                expected = core_order(data)
+                # Nothing is masked, so compressed() keeps every entry.
+                actual = partials_order(wrapped).compressed()
+                message = f"order mismatch for {data}"
+                assert_array_equal(actual, expected, err_msg=message)
+
+    # -------------------------------------------------------------------------
+    # alphabet
+    # -------------------------------------------------------------------------
+
+    def test_alphabet_matches_core_for_all_datasets(self):
+        for data in _DATASETS:
+            with self.subTest(data=data):
+                wrapped = self._unmasked(data)
+                expected = core_alphabet(data)
+                # The alphabet is compared as returned, without compressed().
+                actual = partials_alphabet(wrapped)
+                message = f"alphabet mismatch for {data}"
+                assert_array_equal(actual, expected, err_msg=message)
+
+    # -------------------------------------------------------------------------
+    # intervals_chain — all binding × chain_mode combinations
+    # -------------------------------------------------------------------------
+
+    def test_intervals_chain_matches_core_all_combinations(self):
+        for data, b, cm in self._chain_cases():
+            with self.subTest(data=data, binding=b, chain_mode=cm):
+                wrapped = self._unmasked(data)
+                expected = core_intervals_chain(data, b, cm)
+                # Nothing is masked, so compressed() keeps every entry.
+                actual = partials_intervals_chain(wrapped, b, cm).compressed()
+                message = f"intervals_chain mismatch for {data}, b={b}, cm={cm}"
+                assert_array_equal(actual, expected, err_msg=message)
+
+    # -------------------------------------------------------------------------
+    # intervals_tuple — all binding × tuple_mode combinations
+    # -------------------------------------------------------------------------
+
+    def test_intervals_tuple_normal_matches_core(self):
+        for data, b, cm in self._chain_cases():
+            with self.subTest(data=data, binding=b, chain_mode=cm):
+                _, expected, actual = self._tuple_results(
+                    data, b, cm, tuple_mode.normal
+                )
+                # Normal mode is compared element by element, in order.
+                assert_array_equal(actual.compressed(), expected)
+
+    def test_intervals_tuple_lossy_matches_core(self):
+        for data, b, cm in self._chain_cases():
+            with self.subTest(data=data, binding=b, chain_mode=cm):
+                _, expected, actual = self._tuple_results(
+                    data, b, cm, tuple_mode.lossy
+                )
+                # With binding.end the partials result keeps positional
+                # (left-to-right) order whereas core yields the values
+                # reversed, so the two are compared as multisets.
+                assert_array_equal(np.sort(actual.compressed()), np.sort(expected))
+
+    def test_intervals_tuple_redundant_matches_core(self):
+        for data, b, cm in self._chain_cases():
+            with self.subTest(data=data, binding=b, chain_mode=cm):
+                n, expected, actual = self._tuple_results(
+                    data, b, cm, tuple_mode.redundant
+                )
+                head_actual = actual.data[:n]
+                head_expected = expected[:n]
+                # Leading n values: positional order differs for
+                # binding.end, so they are compared as multisets.
+                assert_array_equal(np.sort(head_actual), np.sort(head_expected))
+                tail_actual = actual.data[n:]
+                tail_expected = expected[n:]
+                # The trailing intervals must match exactly for any binding.
+                assert_array_equal(tail_actual, tail_expected)