diff --git a/.gitignore b/.gitignore index 4e24b5f..b8abfc1 100644 --- a/.gitignore +++ b/.gitignore @@ -20,10 +20,12 @@ dist/ Thumbs.db .idea/ .vscode/ +.claude/ # Local runtime/config artifacts config.yaml data/ logs/ +outputs/ clawhub-zaomeng-skill/runtime/data/ .tools/ diff --git a/src/skill_support/novel_preparation.py b/src/skill_support/novel_preparation.py index b91ae93..b31e089 100644 --- a/src/skill_support/novel_preparation.py +++ b/src/skill_support/novel_preparation.py @@ -3,6 +3,7 @@ from __future__ import annotations +import json import re import unicodedata from pathlib import Path @@ -10,6 +11,10 @@ from src.utils.text_parser import load_novel_text, split_sentences +_ALIAS_REGISTRY_CACHE: dict[str, "_AliasRegistry"] = {} + +_DEFAULT_ALIAS_FILE = Path(__file__).resolve().parent.parent.parent / "zaomeng-skill" / "character_aliases.json" + MIXED_EXCERPT_MIN_CHARS = 3_000 MIXED_EXCERPT_MIN_SENTENCES = 40 @@ -56,18 +61,113 @@ MATCH_IGNORED_PATTERN = re.compile(r"[\s\u3000\u00b7\u2027\u30fb'\"`~!@#$%^&*()_+\-=\[\]{}\\|;:,.<>/?,。!?:;、“”‘’《》【】()]") +class _AliasRegistry: + __slots__ = ("canonical_to_spec", "alias_to_canonical") + + def __init__(self, canonical_to_spec: dict[str, str], alias_to_canonical: dict[str, str]): + self.canonical_to_spec = canonical_to_spec + self.alias_to_canonical = alias_to_canonical + + @property + def empty(self) -> bool: + return not self.canonical_to_spec + + +_EMPTY_REGISTRY = _AliasRegistry({}, {}) + + +def _load_alias_registry(alias_file: str | Path | None = None) -> _AliasRegistry: + path = Path(alias_file) if alias_file else _DEFAULT_ALIAS_FILE + cache_key = str(path) + if cache_key in _ALIAS_REGISTRY_CACHE: + return _ALIAS_REGISTRY_CACHE[cache_key] + + if not path.exists(): + _ALIAS_REGISTRY_CACHE[cache_key] = _EMPTY_REGISTRY + return _EMPTY_REGISTRY + + try: + data = json.loads(path.read_text("utf-8")) + except (json.JSONDecodeError, OSError): + _ALIAS_REGISTRY_CACHE[cache_key] = _EMPTY_REGISTRY + return _EMPTY_REGISTRY + + canonical_to_spec: dict[str, str] = {} + alias_to_canonical: dict[str, str] = {} + for canonical, aliases in data.items(): + if canonical.startswith("_") or not isinstance(aliases, list): + continue + all_names = [canonical] + [str(a).strip() for a in aliases if str(a).strip()] + canonical_to_spec[canonical] = "|".join(all_names) + for name in all_names: + alias_to_canonical[name] = canonical + normalized = _normalize_match_text(name) + if normalized and normalized not in alias_to_canonical: + alias_to_canonical[normalized] = canonical + + registry = _AliasRegistry(canonical_to_spec, alias_to_canonical) + _ALIAS_REGISTRY_CACHE[cache_key] = registry + return registry + + +def _resolve_character_aliases( + characters: list[str] | None, + registry: _AliasRegistry, +) -> list[str] | None: + if not characters or registry.empty: + return characters + resolved: list[str] = [] + for name in characters: + clean = str(name or "").strip() + if not clean: + resolved.append(clean) + continue + if "|" in clean: + resolved.append(clean) + continue + if clean in registry.canonical_to_spec: + resolved.append(registry.canonical_to_spec[clean]) + continue + if clean in registry.alias_to_canonical: + canonical = registry.alias_to_canonical[clean] + resolved.append(registry.canonical_to_spec[canonical]) + continue + normalized = _normalize_match_text(clean) + if normalized and normalized in registry.alias_to_canonical: + canonical = registry.alias_to_canonical[normalized] + resolved.append(registry.canonical_to_spec[canonical]) + continue + if normalized and normalized in registry.canonical_to_spec: + resolved.append(registry.canonical_to_spec[normalized]) + continue + resolved.append(clean) + return resolved + + +def _canonical_name(character_spec: str) -> str: + clean = str(character_spec or "").strip() + return clean.split("|", 1)[0].strip() + + +def _split_aliases(character_spec: str) -> list[str]: + parts = [a.strip() for a in str(character_spec or "").split("|")] + return [a for a in parts if a] + + def prepare_novel_excerpt( text: str, *, max_sentences: int = 80, max_chars: int = 12_000, characters: list[str] | None = None, + alias_file: str | Path | None = None, ) -> str: return build_excerpt_payload_from_text( text, max_sentences=max_sentences, max_chars=max_chars, characters=characters, + alias_file=alias_file, )["excerpt"] @@ -77,14 +177,19 @@ def build_excerpt_payload_from_text( max_sentences: int = 80, max_chars: int = 12_000, characters: list[str] | None = None, + alias_file: str | Path | None = None, ) -> dict[str, Any]: + registry = _load_alias_registry(alias_file) + characters = _resolve_character_aliases(characters, registry) clean = str(text or "").strip() if not clean: + requested = _normalize_characters(characters) + canonical = [_canonical_name(c) for c in requested] return { "excerpt": "", - "requested_characters": _normalize_characters(characters), + "requested_characters": canonical, "matched_characters": [], - "missing_characters": _normalize_characters(characters), + "missing_characters": canonical, "excerpt_strategy": "empty", "excerpt_stages": _empty_stage_blocks(), } @@ -102,12 +207,13 @@ def build_excerpt_payload_from_text( return payload selected_indices = _select_leading_indices(sentences, max_sentences=max_sentences, max_chars=max_chars) + canonical_requested = [_canonical_name(c) for c in requested] return _build_excerpt_result( sentences, selected_indices, - requested_characters=requested, + requested_characters=canonical_requested, matched_characters=[], - missing_characters=requested, + missing_characters=canonical_requested, excerpt_strategy="leading_sentences", max_chars=max_chars, ) @@ -119,12 +225,14 @@ def load_prepared_novel_excerpt( max_sentences: int = 80, max_chars: int = 12_000, characters: list[str] | None = None, + alias_file: str | Path | None = None, ) -> str: return prepare_novel_excerpt( load_novel_text(str(novel_path)), max_sentences=max_sentences, max_chars=max_chars, characters=characters, + alias_file=alias_file, ) @@ -134,6 +242,7 @@ def build_excerpt_payload( max_sentences: int = 80, max_chars: int = 12_000, characters: list[str] | None = None, + alias_file: str | Path | None = None, ) -> dict[str, object]: path = Path(novel_path) excerpt_payload = build_excerpt_payload_from_text( @@ -141,6 +250,7 @@ def build_excerpt_payload( max_sentences=max_sentences, max_chars=max_chars, characters=characters, + alias_file=alias_file, ) return { "source_path": str(path), @@ -161,9 +271,12 @@ def _normalize_characters(characters: list[str] | None) -> list[str]: seen: set[str] = set() for item in list(characters or []): name = str(item or "").strip() - if not name or name in seen: + if not name: continue - seen.add(name) + canonical = _canonical_name(name) + if canonical in seen: + continue + seen.add(canonical) ordered.append(name) return ordered @@ -174,10 +287,12 @@ def _normalize_match_text(text: str) -> str: def _sentence_mentions_character(sentence: str, character: str) -> bool: - normalized_character = _normalize_match_text(character) - if not normalized_character: - return False - return normalized_character in _normalize_match_text(sentence) + normalized_sentence = _normalize_match_text(sentence) + for alias in _split_aliases(character): + normalized_alias = _normalize_match_text(alias) + if normalized_alias and normalized_alias in normalized_sentence: + return True + return False def _leading_excerpt(sentences: list[str], *, max_sentences: int, max_chars: int) -> str: @@ -213,19 +328,25 @@ def _character_focused_excerpt( max_sentences: int, max_chars: int, ) -> dict[str, Any]: - character_hits: dict[str, list[int]] = {name: [] for name in characters} + canonical_to_spec = {_canonical_name(name): name for name in characters} + character_hits: dict[str, list[int]] = {canon: [] for canon in canonical_to_spec} + all_hit_indices: list[int] = [] + seen_hits: set[int] = set() for idx, sentence in enumerate(sentences): - for name in characters: - if _sentence_mentions_character(sentence, name): - character_hits[name].append(idx) - - matched = [name for name, hits in character_hits.items() if hits] - missing = [name for name in characters if not character_hits[name]] + for canon, spec in canonical_to_spec.items(): + if _sentence_mentions_character(sentence, spec): + character_hits[canon].append(idx) + if idx not in seen_hits: + seen_hits.add(idx) + all_hit_indices.append(idx) + + matched = [canon for canon, hits in character_hits.items() if hits] + missing = [canon for canon in canonical_to_spec if not character_hits[canon]] if not matched: return { "excerpt": "", - "requested_characters": characters, + "requested_characters": list(canonical_to_spec.keys()), "matched_characters": [], "missing_characters": missing, "excerpt_strategy": "leading_sentences", @@ -268,6 +389,7 @@ def _character_focused_excerpt( selected_indices, character_hits=character_hits, matched_characters=matched, + character_specs=list(characters), max_sentences=max_sentences, max_chars=max_chars, ) @@ -275,7 +397,7 @@ def _character_focused_excerpt( return _build_excerpt_result( sentences, selected_indices, - requested_characters=characters, + requested_characters=list(canonical_to_spec.keys()), matched_characters=matched, missing_characters=missing, excerpt_strategy="character_windows_mixed" if augmented else "character_windows", @@ -305,6 +427,7 @@ def _augment_character_excerpt_indices( *, character_hits: dict[str, list[int]], matched_characters: list[str], + character_specs: list[str] | None = None, max_sentences: int, max_chars: int, ) -> list[int]: @@ -350,11 +473,12 @@ def add_candidates(indices: list[int], *, radius: int = 0) -> None: if enough(): return ordered - add_candidates(_dialogue_candidate_indices(sentences, matched_characters), radius=0) + spec_list = character_specs if character_specs else matched_characters + add_candidates(_dialogue_candidate_indices(sentences, spec_list), radius=0) if enough(): return ordered - add_candidates(_thought_or_evaluation_indices(sentences, matched_characters), radius=0) + add_candidates(_thought_or_evaluation_indices(sentences, spec_list), radius=0) return ordered @@ -505,7 +629,13 @@ def _build_representative_hit_plan( *, center_budget: int, ) -> list[int]: - per_character = {name: _spread_sample_indices(character_hits.get(name, []), sample_cap=3) for name in matched_characters} + per_character = { + name: _spread_sample_indices( + character_hits.get(name, []), + sample_cap=max(1, min(len(sorted(set(character_hits.get(name, [])))), center_budget)), + ) + for name in matched_characters + } ordered_centers: list[int] = [] seen: set[int] = set() diff --git a/tests/test_novel_preparation.py b/tests/test_novel_preparation.py index 4ba228e..0777341 100644 --- a/tests/test_novel_preparation.py +++ b/tests/test_novel_preparation.py @@ -6,6 +6,7 @@ from src.skill_support.novel_preparation import ( build_excerpt_payload, + build_excerpt_payload_from_text, load_prepared_novel_excerpt, prepare_novel_excerpt, ) @@ -159,5 +160,82 @@ def test_prepare_novel_excerpt_uses_mixed_character_strategy_when_window_is_too_ self.assertIn("结尾余波未散", payload["excerpt"]) +class AliasKnowledgeBaseTests(unittest.TestCase): + def setUp(self): + import sys + sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "zaomeng-skill" / "tools")) + from _skill_support.novel_preparation import ( + _load_alias_registry, + _resolve_character_aliases, + _ALIAS_REGISTRY_CACHE, + ) + self._load_alias_registry = _load_alias_registry + self._resolve_character_aliases = _resolve_character_aliases + self._cache = _ALIAS_REGISTRY_CACHE + self._cache.clear() + + def tearDown(self): + self._cache.clear() + + def _alias_file(self): + return Path(__file__).resolve().parent.parent / "zaomeng-skill" / "character_aliases.json" + + def test_canonical_name_resolves_all_aliases(self): + reg = self._load_alias_registry(self._alias_file()) + result = self._resolve_character_aliases(["孙悟空"], reg) + self.assertEqual(len(result), 1) + self.assertIn("齐天大圣", result[0]) + self.assertIn("孙行者", result[0]) + self.assertIn("孙猴子", result[0]) + + def test_alias_reverse_lookup(self): + reg = self._load_alias_registry(self._alias_file()) + result = self._resolve_character_aliases(["齐天大圣"], reg) + self.assertEqual(len(result), 1) + self.assertTrue(result[0].startswith("孙悟空|")) + + def test_unknown_name_passes_through(self): + reg = self._load_alias_registry(self._alias_file()) + result = self._resolve_character_aliases(["张三丰"], reg) + self.assertEqual(result, ["张三丰"]) + + def test_manual_pipe_not_overridden(self): + reg = self._load_alias_registry(self._alias_file()) + result = self._resolve_character_aliases(["李四|李老四"], reg) + self.assertEqual(result, ["李四|李老四"]) + + def test_missing_file_graceful_degradation(self): + reg = self._load_alias_registry("/nonexistent/path/aliases.json") + self.assertTrue(reg.empty) + result = self._resolve_character_aliases(["孙悟空"], reg) + self.assertEqual(result, ["孙悟空"]) + + def test_alias_matching_in_excerpt(self): + text = "那齐天大圣一个筋斗翻了十万八千里。唐僧在后面叫道悟空快回来。孙行者笑道师父莫急。" + payload = build_excerpt_payload_from_text( + text, + characters=["孙悟空"], + max_sentences=10, + max_chars=500, + alias_file=self._alias_file(), + ) + matched = payload["matched_characters"] + self.assertTrue(any("孙悟空" in m for m in matched)) + self.assertIn("齐天大圣", payload["excerpt"]) + + def test_reverse_alias_matching_in_excerpt(self): + text = "猪八戒扛着钉耙走在前面。天蓬元帅当年何等威风。" + payload = build_excerpt_payload_from_text( + text, + characters=["天蓬元帅"], + max_sentences=10, + max_chars=500, + alias_file=self._alias_file(), + ) + matched = payload["matched_characters"] + self.assertTrue(any("猪八戒" in m for m in matched)) + self.assertIn("猪八戒", payload["excerpt"]) + + if __name__ == "__main__": unittest.main() diff --git a/zaomeng-skill/character_aliases.json b/zaomeng-skill/character_aliases.json new file mode 100644 index 0000000..0692ebb --- /dev/null +++ b/zaomeng-skill/character_aliases.json @@ -0,0 +1,17 @@ +{ + "_meta": { + "description": "角色别名知识库:canonical_name → [alias1, alias2, ...]。支持双向查找。" + }, + "孙悟空": ["齐天大圣", "孙行者", "孙猴子", "美猴王", "弼马温"], + "猪八戒": ["天蓬元帅", "猪悟能", "呆子"], + "唐僧": ["唐三藏", "玄奘", "金蝉子"], + "沙悟净": ["沙僧", "沙和尚", "卷帘大将"], + "贾宝玉": ["宝二爷", "怡红公子", "绛洞花主"], + "林黛玉": ["颦儿", "潇湘妃子", "林妹妹"], + "薛宝钗": ["宝姐姐", "蘅芜君"], + "王熙凤": ["凤姐", "凤辣子", "琏二奶奶"], + "武松": ["武二郎", "行者武松"], + "林冲": ["豹子头", "林教头"], + "鲁智深": ["鲁达", "花和尚"], + "宋江": ["及时雨", "呼保义", "宋公明"] +} diff --git a/zaomeng-skill/tools/_skill_support/novel_preparation.py b/zaomeng-skill/tools/_skill_support/novel_preparation.py index 3e13a81..b394787 100644 --- a/zaomeng-skill/tools/_skill_support/novel_preparation.py +++ b/zaomeng-skill/tools/_skill_support/novel_preparation.py @@ -3,11 +3,16 @@ from __future__ import annotations +import json import re import unicodedata from pathlib import Path from typing import Any +_ALIAS_REGISTRY_CACHE: dict[str, "_AliasRegistry"] = {} + +_DEFAULT_ALIAS_FILE = Path(__file__).resolve().parent.parent.parent / "character_aliases.json" + TEXT_ENCODINGS = ( "utf-8-sig", "utf-8", @@ -64,6 +69,89 @@ MATCH_IGNORED_PATTERN = re.compile(r"[\s\u3000\u00b7\u2027\u30fb'\"`~!@#$%^&*()_+\-=\[\]{}\\|;:,.<>/?,。!?:;、“”‘’《》【】()]") +class _AliasRegistry: + __slots__ = ("canonical_to_spec", "alias_to_canonical") + + def __init__(self, canonical_to_spec: dict[str, str], alias_to_canonical: dict[str, str]): + self.canonical_to_spec = canonical_to_spec + self.alias_to_canonical = alias_to_canonical + + @property + def empty(self) -> bool: + return not self.canonical_to_spec + + +_EMPTY_REGISTRY = _AliasRegistry({}, {}) + + +def _load_alias_registry(alias_file: str | Path | None = None) -> _AliasRegistry: + path = Path(alias_file) if alias_file else _DEFAULT_ALIAS_FILE + cache_key = str(path) + if cache_key in _ALIAS_REGISTRY_CACHE: + return _ALIAS_REGISTRY_CACHE[cache_key] + + if not path.exists(): + _ALIAS_REGISTRY_CACHE[cache_key] = _EMPTY_REGISTRY + return _EMPTY_REGISTRY + + try: + data = json.loads(path.read_text("utf-8")) + except (json.JSONDecodeError, OSError): + _ALIAS_REGISTRY_CACHE[cache_key] = _EMPTY_REGISTRY + return _EMPTY_REGISTRY + + canonical_to_spec: dict[str, str] = {} + alias_to_canonical: dict[str, str] = {} + for canonical, aliases in data.items(): + if canonical.startswith("_") or not isinstance(aliases, list): + continue + all_names = [canonical] + [str(a).strip() for a in aliases if str(a).strip()] + canonical_to_spec[canonical] = "|".join(all_names) + for name in all_names: + alias_to_canonical[name] = canonical + normalized = _normalize_match_text(name) + if normalized and normalized not in alias_to_canonical: + alias_to_canonical[normalized] = canonical + + registry = _AliasRegistry(canonical_to_spec, alias_to_canonical) + _ALIAS_REGISTRY_CACHE[cache_key] = registry + return registry + + +def _resolve_character_aliases( + characters: list[str] | None, + registry: _AliasRegistry, +) -> list[str] | None: + if not characters or registry.empty: + return characters + resolved: list[str] = [] + for name in characters: + clean = str(name or "").strip() + if not clean: + resolved.append(clean) + continue + if "|" in clean: + resolved.append(clean) + continue + if clean in registry.canonical_to_spec: + resolved.append(registry.canonical_to_spec[clean]) + continue + if clean in registry.alias_to_canonical: + canonical = registry.alias_to_canonical[clean] + resolved.append(registry.canonical_to_spec[canonical]) + continue + normalized = _normalize_match_text(clean) + if normalized and normalized in registry.alias_to_canonical: + canonical = registry.alias_to_canonical[normalized] + resolved.append(registry.canonical_to_spec[canonical]) + continue + if normalized and normalized in registry.canonical_to_spec: + resolved.append(registry.canonical_to_spec[normalized]) + continue + resolved.append(clean) + return resolved + + def _decode_score(text: str) -> tuple[int, int, int, int]: if not text: return (-10_000, 0, 0, 0) @@ -160,12 +248,14 @@ def prepare_novel_excerpt( max_sentences: int = 80, max_chars: int = 12_000, characters: list[str] | None = None, + alias_file: str | Path | None = None, ) -> str: return build_excerpt_payload_from_text( text, max_sentences=max_sentences, max_chars=max_chars, characters=characters, + alias_file=alias_file, )["excerpt"] @@ -175,14 +265,19 @@ def build_excerpt_payload_from_text( max_sentences: int = 80, max_chars: int = 12_000, characters: list[str] | None = None, + alias_file: str | Path | None = None, ) -> dict[str, Any]: + registry = _load_alias_registry(alias_file) + characters = _resolve_character_aliases(characters, registry) clean = str(text or "").strip() if not clean: + requested = _normalize_characters(characters) + canonical = [_canonical_name(c) for c in requested] return { "excerpt": "", - "requested_characters": _normalize_characters(characters), + "requested_characters": canonical, "matched_characters": [], - "missing_characters": _normalize_characters(characters), + "missing_characters": canonical, "excerpt_strategy": "empty", "excerpt_stages": _empty_stage_blocks(), } @@ -200,12 +295,13 @@ def build_excerpt_payload_from_text( return payload selected_indices = _select_leading_indices(sentences, max_sentences=max_sentences, max_chars=max_chars) + canonical_requested = [_canonical_name(c) for c in requested] return _build_excerpt_result( sentences, selected_indices, - requested_characters=requested, + requested_characters=canonical_requested, matched_characters=[], - missing_characters=requested, + missing_characters=canonical_requested, excerpt_strategy="leading_sentences", max_chars=max_chars, ) @@ -217,12 +313,14 @@ def load_prepared_novel_excerpt( max_sentences: int = 80, max_chars: int = 12_000, characters: list[str] | None = None, + alias_file: str | Path | None = None, ) -> str: return prepare_novel_excerpt( load_novel_text(novel_path), max_sentences=max_sentences, max_chars=max_chars, characters=characters, + alias_file=alias_file, ) @@ -232,6 +330,7 @@ def build_excerpt_payload( max_sentences: int = 80, max_chars: int = 12_000, characters: list[str] | None = None, + alias_file: str | Path | None = None, ) -> dict[str, object]: path = Path(novel_path) excerpt_payload = build_excerpt_payload_from_text( @@ -239,6 +338,7 @@ def build_excerpt_payload( max_sentences=max_sentences, max_chars=max_chars, characters=characters, + alias_file=alias_file, ) return { "source_path": str(path), @@ -254,14 +354,29 @@ def build_excerpt_payload( } +def _canonical_name(character_spec: str) -> str: + """Extract the canonical name (first '|'-separated part).""" + clean = str(character_spec or "").strip() + return clean.split("|", 1)[0].strip() + + +def _split_aliases(character_spec: str) -> list[str]: + """Split '孙悟空|齐天大圣|孙行者' into ['孙悟空', '齐天大圣', '孙行者'].""" + parts = [a.strip() for a in str(character_spec or "").split("|")] + return [a for a in parts if a] + + def _normalize_characters(characters: list[str] | None) -> list[str]: ordered: list[str] = [] seen: set[str] = set() for item in list(characters or []): name = str(item or "").strip() - if not name or name in seen: + if not name: + continue + canonical = _canonical_name(name) + if canonical in seen: continue - seen.add(name) + seen.add(canonical) ordered.append(name) return ordered @@ -272,10 +387,12 @@ def _normalize_match_text(text: str) -> str: def _sentence_mentions_character(sentence: str, character: str) -> bool: - normalized_character = _normalize_match_text(character) - if not normalized_character: - return False - return normalized_character in _normalize_match_text(sentence) + normalized_sentence = _normalize_match_text(sentence) + for alias in _split_aliases(character): + normalized_alias = _normalize_match_text(alias) + if normalized_alias and normalized_alias in normalized_sentence: + return True + return False def _leading_excerpt(sentences: list[str], *, max_sentences: int, max_chars: int) -> str: @@ -311,24 +428,25 @@ def _character_focused_excerpt( max_sentences: int, max_chars: int, ) -> dict[str, Any]: - character_hits: dict[str, list[int]] = {name: [] for name in characters} + canonical_to_spec = {_canonical_name(name): name for name in characters} + character_hits: dict[str, list[int]] = {canon: [] for canon in canonical_to_spec} all_hit_indices: list[int] = [] seen_hits: set[int] = set() for idx, sentence in enumerate(sentences): - for name in characters: - if _sentence_mentions_character(sentence, name): - character_hits[name].append(idx) + for canon, spec in canonical_to_spec.items(): + if _sentence_mentions_character(sentence, spec): + character_hits[canon].append(idx) if idx not in seen_hits: seen_hits.add(idx) all_hit_indices.append(idx) - matched = [name for name, hits in character_hits.items() if hits] - missing = [name for name in characters if not character_hits[name]] + matched = [canon for canon, hits in character_hits.items() if hits] + missing = [canon for canon in canonical_to_spec if not character_hits[canon]] if not matched: return { "excerpt": "", - "requested_characters": characters, + "requested_characters": list(canonical_to_spec.keys()), "matched_characters": [], "missing_characters": missing, "excerpt_strategy": "leading_sentences", @@ -371,6 +489,7 @@ def _character_focused_excerpt( selected_indices, character_hits=character_hits, matched_characters=matched, + character_specs=list(characters), max_sentences=max_sentences, max_chars=max_chars, ) @@ -378,7 +497,7 @@ def _character_focused_excerpt( return _build_excerpt_result( sentences, selected_indices, - requested_characters=characters, + requested_characters=list(canonical_to_spec.keys()), matched_characters=matched, missing_characters=missing, excerpt_strategy="character_windows_mixed" if augmented else "character_windows", @@ -408,6 +527,7 @@ def _augment_character_excerpt_indices( *, character_hits: dict[str, list[int]], matched_characters: list[str], + character_specs: list[str] | None = None, max_sentences: int, max_chars: int, ) -> list[int]: @@ -452,11 +572,12 @@ def add_candidates(indices: list[int], *, radius: int = 0) -> None: if enough(): return ordered - add_candidates(_dialogue_candidate_indices(sentences, matched_characters), radius=0) + spec_list = character_specs if character_specs else matched_characters + add_candidates(_dialogue_candidate_indices(sentences, spec_list), radius=0) if enough(): return ordered - add_candidates(_thought_or_evaluation_indices(sentences, matched_characters), radius=0) + add_candidates(_thought_or_evaluation_indices(sentences, spec_list), radius=0) return ordered