diff --git a/CHANGELOG.md b/CHANGELOG.md index ff61f15..ebd234c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,29 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- `DictionaryBuilder` for build-time dictionary construction with `normalizeKey` injection +- `Dictionary::fromRows()` as the canonical dictionary factory +- `tests/DictionaryTest.php` unit tests for dictionary authoring and validation + +### Changed + +- Dictionary author format is now `term`, `category`, and `severity` only; `normalized` is derived at build time +- `Entry::fromRow()` replaces `Entry::fromArray()` for internal row construction +- `data/tr.php` seed dictionary migrated to author-only rows +- `TurkishProfile` builds its dictionary via `NormalizationPipeline` so index keys match runtime normalization + +### Removed + +- `Dictionary::fromArray()` — use `Dictionary::fromRows($rows, $normalizeKey)` instead +- `Entry::fromArray()` + +### Breaking changes (v0.2) + +- `Dictionary::fromArray()` removed; custom profiles must use `Dictionary::fromRows()` with a `normalizeKey` callable +- Author dictionary rows no longer accept a `normalized` field + ## [0.1.0] - 2026-07-01 Initial public release of VerbaGuard — a framework-independent PHP moderation engine for language-aware text analysis. diff --git a/README.md b/README.md index 0962959..b99a1c4 100644 --- a/README.md +++ b/README.md @@ -178,12 +178,15 @@ Use `VerbaGuard::forLanguages()` with a custom profile when you need a productio ## Language Profiles -A language profile bundles a dictionary and profile-specific normalizers: +A language profile bundles a dictionary and profile-specific normalizers. + +**v0.2 dictionary authoring:** write only `term`, `category`, and `severity` in dictionary rows. Do not author `normalized` — it is derived at build time via `Dictionary::fromRows()` and a `normalizeKey` callable that must match the profile's runtime normalization chain. 
```php use VerbaGuard\Contracts\LanguageProfile; use VerbaGuard\Dictionary\Dictionary; use VerbaGuard\Normalizer\Normalizer; +use VerbaGuard\Pipeline\NormalizationPipeline; use VerbaGuard\VerbaGuard; final class ExampleProfile implements LanguageProfile @@ -195,14 +198,20 @@ final class ExampleProfile implements LanguageProfile public function dictionary(): Dictionary { - return Dictionary::fromArray([ + $rows = [ [ 'term' => 'badword', - 'normalized' => 'badword', 'category' => 'profanity', 'severity' => 'medium', ], - ]); + ]; + + $normalization = new NormalizationPipeline($this->normalizers()); + + return Dictionary::fromRows( + $rows, + static fn (string $term): string => $normalization->normalize($term), + ); } public function normalizers(): array diff --git a/data/tr.php b/data/tr.php index 6a70453..f552975 100644 --- a/data/tr.php +++ b/data/tr.php @@ -3,7 +3,8 @@ declare(strict_types=1); /** - * Turkish seed dictionary for VerbaGuard v0.1. + * Turkish seed dictionary for VerbaGuard. Author rows contain term, category, + * and severity only; normalized keys are derived at build time. * * Contains a minimal set of offensive terms for testing purposes only. * See README.md offensive language notice. 
@@ -11,31 +12,26 @@ return [ [ 'term' => 'amk', - 'normalized' => 'amk', 'category' => 'profanity', 'severity' => 'medium', ], [ 'term' => 'aq', - 'normalized' => 'aq', 'category' => 'profanity', 'severity' => 'low', ], [ 'term' => 'siktir', - 'normalized' => 'siktir', 'category' => 'profanity', 'severity' => 'high', ], [ 'term' => 'orospu', - 'normalized' => 'orospu', 'category' => 'profanity', 'severity' => 'high', ], [ 'term' => 'mal', - 'normalized' => 'mal', 'category' => 'insult', 'severity' => 'low', ], diff --git a/docs/specification.md b/docs/specification.md index ed5166f..1b26080 100644 --- a/docs/specification.md +++ b/docs/specification.md @@ -86,26 +86,55 @@ interface LanguageProfile Dictionaries are plain PHP arrays loaded from files such as `data/tr.php`. -Each entry contains: +### Author format (v0.2+) -| Field | Description | -|-------------|--------------------------------------------------| -| `term` | Canonical dictionary term | -| `normalized`| Normalized form used for matching | -| `category` | Semantic category, e.g. `profanity`, `insult` | -| `severity` | One of `clean`, `low`, `medium`, `high` | +Each author row contains only user-written canonical fields: -Example: +| Field | Description | +|------------|-----------------------------------------------| +| `term` | Canonical dictionary term | +| `category` | Semantic category, e.g. `profanity`, `insult` | +| `severity` | One of `clean`, `low`, `medium`, `high` | + +Do **not** include `normalized` in author rows. It is derived at dictionary build time. + +Example author row: ```php [ 'term' => 'amk', - 'normalized' => 'amk', 'category' => 'profanity', 'severity' => 'medium', ] ``` +### Build-time construction + +Use `Dictionary::fromRows()` with a `normalizeKey` callable. The callable must apply the same normalization chain the matcher uses at runtime (typically the profile's `NormalizationPipeline`). 
+ +```php +Dictionary::fromRows( + rows: $rows, + normalizeKey: fn (string $term): string => $normalization->normalize($term), +); +``` + +At build time, each `term` is passed through `normalizeKey` to produce the derived `normalized` lookup key stored on `Entry`. + +### Runtime `Entry` fields + +| Field | Source | Description | +|-------------|----------|--------------------------------------------------| +| `term` | Author | Canonical dictionary term | +| `category` | Author | Semantic category | +| `severity` | Author | Severity level | +| `normalized`| Derived | Build-time normalized form used for matching | + +### Breaking changes in v0.2 + +- `Dictionary::fromArray()` removed — use `Dictionary::fromRows()` instead. +- Author dictionary rows no longer accept a `normalized` field. + --- ## Normalization stages @@ -257,7 +286,8 @@ The final score is the sum of all unique match severities. ## Future compatibility notes - New normalization stages belong in the global pipeline unless language-specific. -- Dictionary entries should remain array-based so existing language files keep working. +- Dictionary author rows remain plain PHP arrays with `term`, `category`, and `severity`. +- Derived fields such as `normalized` are produced at build time via `Dictionary::fromRows()`. - Additional severity levels or scoring policies require explicit interfaces in future versions. - Framework adapters should live in separate packages depending on this core library. - Matcher changes are bug-fix only while frozen; see `FOUNDATION.md`. 
diff --git a/src/Dictionary/Dictionary.php b/src/Dictionary/Dictionary.php
index e1eb363..c6fc921 100644
--- a/src/Dictionary/Dictionary.php
+++ b/src/Dictionary/Dictionary.php
@@ -20,16 +20,11 @@ public function __construct(array $entries)
     }
 
     /**
-     * @param list<array{term: string, normalized: string, category: string, severity: string}> $rows
+     * @param list<array{term: string, category: string, severity: string}> $rows
      */
-    public static function fromArray(array $rows): self
+    public static function fromRows(array $rows, callable $normalizeKey): self
     {
-        $entries = array_map(
-            static fn (array $row): Entry => Entry::fromArray($row),
-            $rows,
-        );
-
-        return new self($entries);
+        return (new DictionaryBuilder($normalizeKey))->build($rows);
     }
 
     public function find(string $normalized): ?Entry
diff --git a/src/Dictionary/DictionaryBuilder.php b/src/Dictionary/DictionaryBuilder.php
new file mode 100644
index 0000000..72e1dfb
--- /dev/null
+++ b/src/Dictionary/DictionaryBuilder.php
@@ -0,0 +1,167 @@
+    /** @var list<string> */
+    private const AUTHOR_FIELDS = ['term', 'category', 'severity'];
+
+    /** @var callable(string): string */
+    private $normalizeKey;
+
+    /**
+     * @param callable(string): string $normalizeKey Derives the lookup key from an author term.
+     */
+    public function __construct(callable $normalizeKey)
+    {
+        $this->normalizeKey = $normalizeKey;
+    }
+
+    /**
+     * Validates author rows and builds a dictionary of entries keyed by normalized term.
+     *
+     * @param list<array<string, mixed>> $rows
+     *
+     * @throws InvalidArgumentException When a row is malformed, a value has the wrong type,
+     *                                  a term normalizes to '', or two terms share a key.
+     */
+    public function build(array $rows): Dictionary
+    {
+        /** @var array<string, Entry> $byNormalized */
+        $byNormalized = [];
+
+        // Re-index so row numbers in error messages are always positional integers.
+        foreach (array_values($rows) as $index => $row) {
+            if (! is_array($row)) {
+                throw new InvalidArgumentException(
+                    sprintf('Author row %d must be an array, %s given.', $index, get_debug_type($row)),
+                );
+            }
+
+            $this->assertAuthorRowShape($row, $index);
+
+            $term = $this->requireNonEmptyString($row['term'], 'term', $index);
+            $category = $this->requireNonEmptyString($row['category'], 'category', $index);
+            $severity = $this->requireValidSeverity($row['severity'], $index);
+
+            $normalized = ($this->normalizeKey)($term);
+
+            // An empty key would be indexed under '' and so match an empty lookup.
+            if ($normalized === '') {
+                throw new InvalidArgumentException(
+                    sprintf('Term "%s" normalizes to an empty key at row %d.', $term, $index),
+                );
+            }
+
+            if (isset($byNormalized[$normalized])) {
+                throw new InvalidArgumentException(
+                    sprintf('Duplicate normalized key "%s" at row %d.', $normalized, $index),
+                );
+            }
+
+            $byNormalized[$normalized] = Entry::fromRow(
+                [
+                    'term' => $term,
+                    'category' => $category,
+                    'severity' => $severity,
+                ],
+                $normalized,
+            );
+        }
+
+        return new Dictionary(array_values($byNormalized));
+    }
+
+    /**
+     * Rejects rows that carry "normalized", an unknown field, or lack a required field.
+     *
+     * @param array<string, mixed> $row
+     */
+    private function assertAuthorRowShape(array $row, int $index): void
+    {
+        if (array_key_exists('normalized', $row)) {
+            throw new InvalidArgumentException(
+                sprintf('Author dictionary rows must not include "normalized" (row %d).', $index),
+            );
+        }
+
+        foreach (array_keys($row) as $field) {
+            if (! in_array($field, self::AUTHOR_FIELDS, true)) {
+                throw new InvalidArgumentException(
+                    sprintf('Unknown author field "%s" at row %d.', $field, $index),
+                );
+            }
+        }
+
+        foreach (self::AUTHOR_FIELDS as $field) {
+            if (! array_key_exists($field, $row)) {
+                throw new InvalidArgumentException(
+                    sprintf('Missing required author field "%s" at row %d.', $field, $index),
+                );
+            }
+        }
+    }
+
+    /**
+     * Returns $value once it is known to be a non-empty string.
+     *
+     * Takes mixed so wrongly typed author data is reported as an
+     * InvalidArgumentException instead of escaping as a TypeError.
+     */
+    private function requireNonEmptyString(mixed $value, string $field, int $index): string
+    {
+        if (! is_string($value)) {
+            throw new InvalidArgumentException(
+                sprintf(
+                    'Author field "%s" must be a string at row %d, %s given.',
+                    $field,
+                    $index,
+                    get_debug_type($value),
+                ),
+            );
+        }
+
+        if ($value === '') {
+            throw new InvalidArgumentException(
+                sprintf('Author field "%s" must not be empty at row %d.', $field, $index),
+            );
+        }
+
+        return $value;
+    }
+
+    /**
+     * Returns $severity once Severity::fromString() has accepted it.
+     */
+    private function requireValidSeverity(mixed $severity, int $index): string
+    {
+        if (! is_string($severity)) {
+            throw new InvalidArgumentException(
+                sprintf(
+                    'Author field "severity" must be a string at row %d, %s given.',
+                    $index,
+                    get_debug_type($severity),
+                ),
+            );
+        }
+
+        try {
+            Severity::fromString($severity);
+        } catch (ValueError $exception) {
+            throw new InvalidArgumentException(
+                sprintf('Invalid severity "%s" at row %d.', $severity, $index),
+                0,
+                $exception,
+            );
+        }
+
+        return $severity;
+    }
+}
diff --git a/src/Dictionary/Entry.php b/src/Dictionary/Entry.php
index cab3f0c..f398acc 100644
--- a/src/Dictionary/Entry.php
+++ b/src/Dictionary/Entry.php
@@ -6,6 +6,12 @@
 
 final class Entry
 {
+    /**
+     * @param string $term Author field — canonical dictionary term.
+     * @param string $normalized Derived field — build-time normalized lookup key.
+     * @param string $category Author field — semantic category.
+     * @param string $severity Author field — one of clean, low, medium, high.
+ */ public function __construct( public readonly string $term, public readonly string $normalized, @@ -15,15 +21,15 @@ public function __construct( } /** - * @param array{term: string, normalized: string, category: string, severity: string} $data + * @param array{term: string, category: string, severity: string} $row */ - public static function fromArray(array $data): self + public static function fromRow(array $row, string $normalized): self { return new self( - $data['term'], - $data['normalized'], - $data['category'], - $data['severity'], + $row['term'], + $normalized, + $row['category'], + $row['severity'], ); } } diff --git a/src/Language/TurkishProfile.php b/src/Language/TurkishProfile.php index f6b5a0e..255ee4c 100644 --- a/src/Language/TurkishProfile.php +++ b/src/Language/TurkishProfile.php @@ -8,6 +8,7 @@ use VerbaGuard\Dictionary\Dictionary; use VerbaGuard\Normalizer\Normalizer; use VerbaGuard\Normalizer\TurkishNormalizer; +use VerbaGuard\Pipeline\NormalizationPipeline; final class TurkishProfile implements LanguageProfile { @@ -21,9 +22,13 @@ public function code(): string public function dictionary(): Dictionary { if ($this->dictionary === null) { - /** @var list $rows */ + /** @var list $rows */ $rows = require dirname(__DIR__, 2) . 
'/data/tr.php'; - $this->dictionary = Dictionary::fromArray($rows); + $normalization = new NormalizationPipeline($this->normalizers()); + $this->dictionary = Dictionary::fromRows( + $rows, + static fn (string $term): string => $normalization->normalize($term), + ); } return $this->dictionary; diff --git a/tests/DictionaryTest.php b/tests/DictionaryTest.php new file mode 100644 index 0000000..85f953b --- /dev/null +++ b/tests/DictionaryTest.php @@ -0,0 +1,150 @@ + $term, + 'category' => $category, + 'severity' => $severity, + ]; +} + +test('fromRows minimal format produces Entry', function () { + $dictionary = Dictionary::fromRows( + [dictionaryRow()], + static fn (string $term): string => $term, + ); + + $entry = $dictionary->entries()[0]; + + expect($entry)->toBeInstanceOf(Entry::class) + ->and($entry->term)->toBe('amk') + ->and($entry->category)->toBe('profanity') + ->and($entry->severity)->toBe('medium'); +}); + +test('fromRows derives normalized at build time', function () { + $dictionary = Dictionary::fromRows( + [dictionaryRow(term: 'AMK')], + static fn (string $term): string => mb_strtolower($term, 'UTF-8'), + ); + + expect($dictionary->entries()[0]->normalized)->toBe('amk'); +}); + +test('fromRows uses normalizeKey callable', function () { + $dictionary = Dictionary::fromRows( + [dictionaryRow(term: 'test')], + static fn (string $term): string => 'normalized:' . 
$term, + ); + + expect($dictionary->entries()[0]->normalized)->toBe('normalized:test') + ->and($dictionary->find('normalized:test'))->not->toBeNull() + ->and($dictionary->find('test'))->toBeNull(); +}); + +test('fromRows rejects invalid severity', function () { + Dictionary::fromRows( + [dictionaryRow(severity: 'extreme')], + static fn (string $term): string => $term, + ); +})->throws(InvalidArgumentException::class); + +test('fromRows rejects empty term', function () { + Dictionary::fromRows( + [dictionaryRow(term: '')], + static fn (string $term): string => $term, + ); +})->throws(InvalidArgumentException::class); + +test('fromRows rejects empty category', function () { + Dictionary::fromRows( + [dictionaryRow(category: '')], + static fn (string $term): string => $term, + ); +})->throws(InvalidArgumentException::class); + +test('fromRows rejects normalized in author input', function () { + Dictionary::fromRows( + [ + [ + 'term' => 'amk', + 'normalized' => 'amk', + 'category' => 'profanity', + 'severity' => 'medium', + ], + ], + static fn (string $term): string => $term, + ); +})->throws(InvalidArgumentException::class, 'Author dictionary rows must not include "normalized"'); + +test('fromRows rejects unknown author field', function () { + Dictionary::fromRows( + [ + [ + 'term' => 'amk', + 'category' => 'profanity', + 'severity' => 'medium', + 'alias' => 'amq', + ], + ], + static fn (string $term): string => $term, + ); +})->throws(InvalidArgumentException::class, 'Unknown author field'); + +test('fromRows rejects missing required field', function () { + Dictionary::fromRows( + [ + [ + 'term' => 'amk', + 'category' => 'profanity', + ], + ], + static fn (string $term): string => $term, + ); +})->throws(InvalidArgumentException::class, 'Missing required author field'); + +test('fromRows rejects duplicate normalized key', function () { + Dictionary::fromRows( + [ + dictionaryRow(term: 'foo'), + dictionaryRow(term: 'FOO'), + ], + static fn (string $term): string => 
mb_strtolower($term, 'UTF-8'), + ); +})->throws(InvalidArgumentException::class, 'Duplicate normalized key'); + +test('find works with derived normalized key', function () { + $dictionary = Dictionary::fromRows( + [dictionaryRow(term: 'AMK')], + static fn (string $term): string => mb_strtolower($term, 'UTF-8'), + ); + + $entry = $dictionary->find('amk'); + + expect($entry)->not->toBeNull() + ->and($entry->term)->toBe('AMK') + ->and($entry->normalized)->toBe('amk'); +}); + +test('entries returns author and derived fields', function () { + $dictionary = Dictionary::fromRows( + [dictionaryRow(term: 'siktir', category: 'profanity', severity: 'high')], + static fn (string $term): string => mb_strtolower($term, 'UTF-8'), + ); + + $entry = $dictionary->entries()[0]; + + expect($entry->term)->toBe('siktir') + ->and($entry->category)->toBe('profanity') + ->and($entry->severity)->toBe('high') + ->and($entry->normalized)->toBe('siktir'); +});