From ea8362958e5e397c0a9709a049e38aad409b5c7b Mon Sep 17 00:00:00 2001 From: Kaftow <166228791+Kaftow@users.noreply.github.com> Date: Tue, 21 Apr 2026 12:07:40 +0900 Subject: [PATCH 1/9] Move mapping files into resource folder --- {scripts => resources}/mappings/audience.json | 0 {scripts => resources}/mappings/content_formats.json | 0 {scripts => resources}/mappings/droppable.json | 0 {scripts => resources}/mappings/genres.json | 0 {scripts => resources}/mappings/literary_themes.json | 0 {scripts => resources}/mappings/literary_tropes.json | 0 {scripts => resources}/mappings/main_topics.json | 0 {scripts => resources}/mappings/people_overrides.json | 0 {scripts => resources}/mappings/places_overrides.json | 0 {scripts => resources}/mappings/subgenres.json | 0 10 files changed, 0 insertions(+), 0 deletions(-) rename {scripts => resources}/mappings/audience.json (100%) rename {scripts => resources}/mappings/content_formats.json (100%) rename {scripts => resources}/mappings/droppable.json (100%) rename {scripts => resources}/mappings/genres.json (100%) rename {scripts => resources}/mappings/literary_themes.json (100%) rename {scripts => resources}/mappings/literary_tropes.json (100%) rename {scripts => resources}/mappings/main_topics.json (100%) rename {scripts => resources}/mappings/people_overrides.json (100%) rename {scripts => resources}/mappings/places_overrides.json (100%) rename {scripts => resources}/mappings/subgenres.json (100%) diff --git a/scripts/mappings/audience.json b/resources/mappings/audience.json similarity index 100% rename from scripts/mappings/audience.json rename to resources/mappings/audience.json diff --git a/scripts/mappings/content_formats.json b/resources/mappings/content_formats.json similarity index 100% rename from scripts/mappings/content_formats.json rename to resources/mappings/content_formats.json diff --git a/scripts/mappings/droppable.json b/resources/mappings/droppable.json similarity index 100% rename from scripts/mappings/droppable.json rename to resources/mappings/droppable.json diff --git a/scripts/mappings/genres.json b/resources/mappings/genres.json similarity index 100% rename from scripts/mappings/genres.json rename to resources/mappings/genres.json diff --git a/scripts/mappings/literary_themes.json b/resources/mappings/literary_themes.json similarity index 100% rename from scripts/mappings/literary_themes.json rename to resources/mappings/literary_themes.json diff --git a/scripts/mappings/literary_tropes.json b/resources/mappings/literary_tropes.json similarity index 100% rename from scripts/mappings/literary_tropes.json rename to resources/mappings/literary_tropes.json diff --git a/scripts/mappings/main_topics.json b/resources/mappings/main_topics.json similarity index 100% rename from scripts/mappings/main_topics.json rename to resources/mappings/main_topics.json diff --git a/scripts/mappings/people_overrides.json b/resources/mappings/people_overrides.json similarity index 100% rename from scripts/mappings/people_overrides.json rename to resources/mappings/people_overrides.json diff --git a/scripts/mappings/places_overrides.json b/resources/mappings/places_overrides.json similarity index 100% rename from scripts/mappings/places_overrides.json rename to resources/mappings/places_overrides.json diff --git a/scripts/mappings/subgenres.json b/resources/mappings/subgenres.json similarity index 100% rename from scripts/mappings/subgenres.json rename to resources/mappings/subgenres.json From 49ad631dc5ac8370a5f0a16929ce3a9a9a88de51 Mon Sep 17 00:00:00 2001 From: Kaftow <166228791+Kaftow@users.noreply.github.com> Date: Tue, 21 Apr 2026 15:24:57 +0900 Subject: [PATCH 2/9] Refactor legacy migrate_subjects.py into several modules --- core/__init__.py | 5 + core/classifier_assembler.py | 41 ++++++ core/json_loader.py | 32 +++++ core/migrate_subject_classifier.py | 17 +++ core/pack_registry.py | 85 +++++++++++ core/run_state.py | 22 +++ core/subject_classifier.py | 51 +++++++ rule_engine/__init__.py | 5 + rule_engine/base.py | 15 ++ rule_engine/normalization.py | 24 ++++ rule_packs/__init__.py | 54 +++++++ rule_packs/audience.py | 22 +++ rule_packs/content_formats.py | 22 +++ rule_packs/genres.py | 22 +++ rule_packs/literary_form.py | 16 +++ rule_packs/literary_themes.py | 22 +++ rule_packs/literary_tropes.py | 22 +++ rule_packs/main_topics.py | 22 +++ rule_packs/moods.py | 16 +++ rule_packs/people.py | 23 +++ rule_packs/places.py | 23 +++ rule_packs/subgenres.py | 22 +++ rule_packs/subject_diagnostics.py | 41 ++++++ rule_packs/times.py | 21 +++ rule_packs/utils.py | 65 +++++++++ rules/__init__.py | 8 ++ rules/mapping_rule.py | 17 +++ rules/override_rule.py | 20 +++ rules/passthrough_rule.py | 11 ++ rules/prefix_rule.py | 21 +++ scripts/migrate_subjects.py | 223 ++++------------------------- scripts/run_legacy_subjects.sh | 22 +++ 32 files changed, 833 insertions(+), 199 deletions(-) create mode 100644 core/__init__.py create mode 100644 core/classifier_assembler.py create mode 100644 core/json_loader.py create mode 100644 core/migrate_subject_classifier.py create mode 100644 core/pack_registry.py create mode 100644 core/run_state.py create mode 100644 core/subject_classifier.py create mode 100644 rule_engine/__init__.py create mode 100644 rule_engine/base.py create mode 100644 rule_engine/normalization.py create mode 100644 rule_packs/__init__.py create mode 100644 rule_packs/audience.py create mode 100644 rule_packs/content_formats.py create mode 100644 rule_packs/genres.py create mode 100644 rule_packs/literary_form.py create mode 100644 rule_packs/literary_themes.py create mode 100644 rule_packs/literary_tropes.py create mode 100644 rule_packs/main_topics.py create mode 100644 rule_packs/moods.py create mode 100644 rule_packs/people.py create mode 100644 rule_packs/places.py create mode 100644 rule_packs/subgenres.py create mode 100644 rule_packs/subject_diagnostics.py create mode 100644 rule_packs/times.py create mode 100644 rule_packs/utils.py create mode 100644 rules/__init__.py create mode 100644 rules/mapping_rule.py create mode 100644 rules/override_rule.py create mode 100644 rules/passthrough_rule.py create mode 100644 rules/prefix_rule.py create mode 100755 scripts/run_legacy_subjects.sh diff --git a/core/__init__.py b/core/__init__.py new file mode 100644 index 0000000..ee7aaf5 --- /dev/null +++ b/core/__init__.py @@ -0,0 +1,5 @@ +"""Core orchestration and default migration assembly.""" + +from .subject_classifier import DEFAULT_OUTPUT_TYPES, SubjectClassifier + +__all__ = ["DEFAULT_OUTPUT_TYPES", "SubjectClassifier"] diff --git a/core/classifier_assembler.py b/core/classifier_assembler.py new file mode 100644 index 0000000..23a732e --- /dev/null +++ b/core/classifier_assembler.py @@ -0,0 +1,41 @@ +"""Assembly helpers for building migration classifiers.""" + +from __future__ import annotations + +from collections.abc import Iterable + +from core.json_loader import load_set +from core.pack_registry import ( + AVAILABLE_PACK_NAMES, + PACK_FACTORIES, + PACK_PRESETS, +) +from core.subject_classifier import SubjectClassifier + + +def resolve_pack_names(enabled_packs: Iterable[str] | None) -> list[str]: + """Expand presets into concrete stable pack names.""" + selected = list(enabled_packs or []) + expanded: list[str] = [] + for name in selected: + if name in PACK_PRESETS: + expanded.extend(PACK_PRESETS[name]) + continue + expanded.append(name) + return expanded + + +def build_subject_classifier( + enabled_packs: Iterable[str] | None = None, +) -> SubjectClassifier: + """Build the migration classifier from an explicit pack-name list.""" + selected = resolve_pack_names(enabled_packs) + missing = [name for name in selected if name not in PACK_FACTORIES] + if missing: + available = ", ".join(AVAILABLE_PACK_NAMES) + missing_display = ", ".join(sorted(missing)) + raise ValueError( + f"Unknown rule pack(s): {missing_display}. Available: {available}" + ) + + return SubjectClassifier(rule_packs=[PACK_FACTORIES[name]() for name in selected]) diff --git a/core/json_loader.py b/core/json_loader.py new file mode 100644 index 0000000..1553804 --- /dev/null +++ b/core/json_loader.py @@ -0,0 +1,32 @@ +"""JSON resource loaders for migration assembly.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from rule_engine.normalization import normalize + +REPO_ROOT = Path(__file__).resolve().parent.parent +MAPPINGS_DIR = REPO_ROOT / "resources" / "mappings" + + +def load_mapping(name: str) -> dict[str, str]: + """Load a JSON mapping file from resources/mappings/.""" + path = MAPPINGS_DIR / f"{name}.json" + if not path.exists(): + return {} + with open(path) as handle: + return json.load(handle) + + +def load_set(name: str) -> set[str]: + """Load a JSON list file as a normalized set.""" + path = MAPPINGS_DIR / f"{name}.json" + if not path.exists(): + return set() + with open(path) as handle: + data = json.load(handle) + if isinstance(data, list): + return {normalize(item) for item in data} + return {normalize(item) for item in data.keys()} diff --git a/core/migrate_subject_classifier.py b/core/migrate_subject_classifier.py new file mode 100644 index 0000000..38cc1a1 --- /dev/null +++ b/core/migrate_subject_classifier.py @@ -0,0 +1,17 @@ +"""Compatibility wrapper for migration classifier assembly.""" + +from __future__ import annotations + +from core.classifier_assembler import ( + build_subject_classifier, + resolve_pack_names, +) +from core.pack_registry import AVAILABLE_PACK_NAMES, PACK_FACTORIES, PACK_PRESETS + +__all__ = [ + "AVAILABLE_PACK_NAMES", + "PACK_FACTORIES", + "PACK_PRESETS", + "build_subject_classifier", + "resolve_pack_names", +] diff --git a/core/pack_registry.py b/core/pack_registry.py new file mode 100644 index 0000000..b1df0e7 --- /dev/null +++ b/core/pack_registry.py @@ -0,0 +1,85 @@ +"""Stable pack-name registry for migration assembly.""" + +from __future__ import annotations + +from typing import Callable + +from core.json_loader import load_mapping, load_set +from rule_packs import ( + AudiencePack, + ContentFormatsPack, + GenresPack, + LiteraryFormPack, + LiteraryThemesPack, + LiteraryTropesPack, + MainTopicsPack, + MoodsPack, + PeoplePack, + PlacesPack, + SubgenresPack, + SUBJECT_PACK_CLASSES, + SubjectDiagnosticsPack, + TimesPack, +) + +PackFactory = Callable[[], object] + +SUBJECT_PACK_BUILDERS = {pack_cls.name: pack_cls for pack_cls in SUBJECT_PACK_CLASSES} +PACK_PRESETS: dict[str, tuple[str, ...]] = { + "subject_mappings": ( + "literary_form", + "audience", + "genres", + "subgenres", + "content_formats", + "moods", + "literary_themes", + "literary_tropes", + "main_topics", + "subject_diagnostics", + "people", + "places", + "times", + ), +} + +PACK_FACTORIES: dict[str, PackFactory] = { + "literary_form": lambda: LiteraryFormPack(remove_matched_subjects=True), + "audience": lambda: AudiencePack( + mapping=load_mapping("audience"), + remove_matched_subjects=True, + ), + "genres": lambda: GenresPack( + mapping=load_mapping("genres"), + remove_matched_subjects=True, + ), + "subgenres": lambda: SubgenresPack( + mapping=load_mapping("subgenres"), + remove_matched_subjects=True, + ), + "content_formats": lambda: ContentFormatsPack( + mapping=load_mapping("content_formats"), + remove_matched_subjects=True, + ), + "moods": lambda: MoodsPack(remove_matched_subjects=True), + "literary_themes": lambda: LiteraryThemesPack( + mapping=load_mapping("literary_themes"), + remove_matched_subjects=True, + ), + "literary_tropes": lambda: LiteraryTropesPack( + mapping=load_mapping("literary_tropes"), + remove_matched_subjects=True, + ), + "main_topics": lambda: MainTopicsPack( + mapping=load_mapping("main_topics"), + remove_matched_subjects=True, + ), + "subject_diagnostics": lambda: SubjectDiagnosticsPack( + droppable=load_set("droppable") + ), + "people": lambda: PeoplePack(overrides=load_mapping("people_overrides")), + "places": lambda: PlacesPack(overrides=load_mapping("places_overrides")), + "times": TimesPack, +} + +AVAILABLE_PACK_NAMES = tuple(sorted({*PACK_FACTORIES, *PACK_PRESETS})) diff --git a/core/run_state.py b/core/run_state.py new file mode 100644 index 0000000..a78fae0 --- /dev/null +++ b/core/run_state.py @@ -0,0 +1,22 @@ +"""Shared runtime state for sequential subject classification.""" + +from __future__ import annotations + +from collections.abc import Mapping +from dataclasses import dataclass, field +from typing import Any + + +@dataclass +class RunState: + """Mutable state shared by packs during sequential execution.""" + + work: Mapping[str, Any] + result: dict[str, list[str]] + remaining_subjects: list[str] = field(default_factory=list) + + def add(self, output_type: str, value: str) -> None: + if output_type not in self.result: + self.result[output_type] = [] + if value not in self.result[output_type]: + self.result[output_type].append(value) diff --git a/core/subject_classifier.py b/core/subject_classifier.py new file mode 100644 index 0000000..f85aef2 --- /dev/null +++ b/core/subject_classifier.py @@ -0,0 +1,51 @@ +"""Reusable classification core for subject migration.""" + +from __future__ import annotations + +from collections.abc import Iterable, Mapping +from typing import Any + +from core.run_state import RunState + +DEFAULT_OUTPUT_TYPES = ( + "literary_form", + "audience", + "genres", + "subgenres", + "content_formats", + "moods", + "literary_themes", + "literary_tropes", + "main_topics", + "sub_topics", + "people", + "places", + "times", + "things", + "reading_level", + "classification_codes", + "unmapped", +) + + +class SubjectClassifier: + """Public orchestration layer for work-level subject classification.""" + + def __init__( + self, + rule_packs: Iterable[Any], + output_types: Iterable[str] | None = None, + ) -> None: + self.rule_packs = list(rule_packs) + self.output_types = tuple(output_types or DEFAULT_OUTPUT_TYPES) + + def classify_work(self, work: Mapping[str, Any]) -> dict[str, list[str]]: + """Run the enabled rule packs against a normalized work object.""" + state = RunState( + work=work, + result={tag_type: [] for tag_type in self.output_types}, + remaining_subjects=list(work.get("subjects", [])), + ) + for pack in self.rule_packs: + pack.apply(state) + return state.result diff --git a/rule_engine/__init__.py b/rule_engine/__init__.py new file mode 100644 index 0000000..2386cef --- /dev/null +++ b/rule_engine/__init__.py @@ -0,0 +1,5 @@ +"""Low-level rule engine primitives.""" + +from .base import RulePack + +__all__ = ["RulePack"] diff --git a/rule_engine/base.py b/rule_engine/base.py new file mode 100644 index 0000000..552f7ea --- /dev/null +++ b/rule_engine/base.py @@ -0,0 +1,15 @@ +"""Rule-pack interface for the migration core.""" + +from __future__ import annotations + +from core.run_state import RunState + + +class RulePack: + """A bounded unit of classification logic for one or more output types.""" + + name = "" + output_types: tuple[str, ...] = () + + def apply(self, state: RunState) -> None: + raise NotImplementedError diff --git a/rule_engine/normalization.py b/rule_engine/normalization.py new file mode 100644 index 0000000..7143e7e --- /dev/null +++ b/rule_engine/normalization.py @@ -0,0 +1,24 @@ +"""Normalization and classification helpers.""" + +from __future__ import annotations + +import re + +READING_LEVEL_RE = re.compile( + r"reading level.grade\s*\d+|grade\s*\d+|rl\s*\d+", re.IGNORECASE +) +CLASSIFICATION_RE = re.compile( + r"^[0-9]{3}(\.[0-9]+)?$|^[a-z]{1,3}\s*[0-9]+|^pr[0-9]", re.IGNORECASE +) + + +def normalize(value: str) -> str: + return value.lower().strip() + + +def is_reading_level(value: str) -> bool: + return bool(READING_LEVEL_RE.search(value)) + + +def is_classification_code(value: str) -> bool: + return bool(CLASSIFICATION_RE.match(value.strip())) diff --git a/rule_packs/__init__.py b/rule_packs/__init__.py new file mode 100644 index 0000000..4dd2638 --- /dev/null +++ b/rule_packs/__init__.py @@ -0,0 +1,54 @@ +"""Concrete rule-pack modules.""" + +from .audience import AudiencePack +from .content_formats import ContentFormatsPack +from .genres import GenresPack +from .literary_form import LiteraryFormPack +from .literary_themes import LiteraryThemesPack +from .literary_tropes import LiteraryTropesPack +from .main_topics import MainTopicsPack +from .moods import MoodsPack +from .people import PeoplePack +from .places import PlacesPack +from .subgenres import SubgenresPack +from .subject_diagnostics import SubjectDiagnosticsPack +from .times import TimesPack + +SUBJECT_PACK_CLASSES = ( + LiteraryFormPack, + AudiencePack, + GenresPack, + SubgenresPack, + ContentFormatsPack, + MoodsPack, + LiteraryThemesPack, + LiteraryTropesPack, + MainTopicsPack, +) + +FIELD_PACK_CLASSES = ( + PeoplePack, + PlacesPack, + TimesPack, +) + +ALL_PACK_CLASSES = SUBJECT_PACK_CLASSES + FIELD_PACK_CLASSES + +__all__ = [ + "ALL_PACK_CLASSES", + "AudiencePack", + "ContentFormatsPack", + "FIELD_PACK_CLASSES", + "GenresPack", + "LiteraryFormPack", + "LiteraryThemesPack", + "LiteraryTropesPack", + "MainTopicsPack", + "MoodsPack", + "PeoplePack", + "PlacesPack", + "SUBJECT_PACK_CLASSES", + "SubgenresPack", + "SubjectDiagnosticsPack", + "TimesPack", +] diff --git a/rule_packs/audience.py b/rule_packs/audience.py new file mode 100644 index 0000000..165c391 --- /dev/null +++ b/rule_packs/audience.py @@ -0,0 +1,22 @@ +"""Rule pack for audience tags.""" + +from __future__ import annotations + +from collections.abc import Mapping + +from rule_packs.utils import SubjectPack +from rules import MappingRule, PrefixRule + + +class AudiencePack(SubjectPack): + name = "audience" + output_types = ("audience",) + output_type = "audience" + + def __init__( + self, + mapping: Mapping[str, str] | None = None, + remove_matched_subjects: bool = True, + ) -> None: + self.rules = (PrefixRule("audience"), MappingRule(mapping)) + self.remove_matched_subjects = remove_matched_subjects diff --git a/rule_packs/content_formats.py b/rule_packs/content_formats.py new file mode 100644 index 0000000..c1260b4 --- /dev/null +++ b/rule_packs/content_formats.py @@ -0,0 +1,22 @@ +"""Rule pack for content_formats tags.""" + +from __future__ import annotations + +from collections.abc import Mapping + +from rule_packs.utils import SubjectPack +from rules import MappingRule, PrefixRule + + +class ContentFormatsPack(SubjectPack): + name = "content_formats" + output_types = ("content_formats",) + output_type = "content_formats" + + def __init__( + self, + mapping: Mapping[str, str] | None = None, + remove_matched_subjects: bool = True, + ) -> None: + self.rules = (PrefixRule("format"), MappingRule(mapping)) + self.remove_matched_subjects = remove_matched_subjects diff --git a/rule_packs/genres.py b/rule_packs/genres.py new file mode 100644 index 0000000..b110f12 --- /dev/null +++ b/rule_packs/genres.py @@ -0,0 +1,22 @@ +"""Rule pack for genre tags.""" + +from __future__ import annotations + +from collections.abc import Mapping + +from rule_packs.utils import SubjectPack +from rules import MappingRule, PrefixRule + + +class GenresPack(SubjectPack): + name = "genres" + output_types = ("genres",) + output_type = "genres" + + def __init__( + self, + mapping: Mapping[str, str] | None = None, + remove_matched_subjects: bool = True, + ) -> None: + self.rules = (PrefixRule("genre"), MappingRule(mapping)) + self.remove_matched_subjects = remove_matched_subjects diff --git a/rule_packs/literary_form.py b/rule_packs/literary_form.py new file mode 100644 index 0000000..1972b30 --- /dev/null +++ b/rule_packs/literary_form.py @@ -0,0 +1,16 @@ +"""Rule pack for literary_form.""" + +from __future__ import annotations + +from rule_packs.utils import SubjectPack +from rules import PrefixRule + + +class LiteraryFormPack(SubjectPack): + name = "literary_form" + output_types = ("literary_form",) + output_type = "literary_form" + + def __init__(self, remove_matched_subjects: bool = True) -> None: + self.rules = (PrefixRule("form"),) + self.remove_matched_subjects = remove_matched_subjects diff --git a/rule_packs/literary_themes.py b/rule_packs/literary_themes.py new file mode 100644 index 0000000..2357776 --- /dev/null +++ b/rule_packs/literary_themes.py @@ -0,0 +1,22 @@ +"""Rule pack for literary_themes tags.""" + +from __future__ import annotations + +from collections.abc import Mapping + +from rule_packs.utils import SubjectPack +from rules import MappingRule, PrefixRule + + +class LiteraryThemesPack(SubjectPack): + name = "literary_themes" + output_types = ("literary_themes",) + output_type = "literary_themes" + + def __init__( + self, + mapping: Mapping[str, str] | None = None, + remove_matched_subjects: bool = True, + ) -> None: + self.rules = (PrefixRule("theme"), MappingRule(mapping)) + self.remove_matched_subjects = remove_matched_subjects diff --git a/rule_packs/literary_tropes.py b/rule_packs/literary_tropes.py new file mode 100644 index 0000000..1c1db97 --- /dev/null +++ b/rule_packs/literary_tropes.py @@ -0,0 +1,22 @@ +"""Rule pack for literary_tropes tags.""" + +from __future__ import annotations + +from collections.abc import Mapping + +from rule_packs.utils import SubjectPack +from rules import MappingRule, PrefixRule + + +class LiteraryTropesPack(SubjectPack): + name = "literary_tropes" + output_types = ("literary_tropes",) + output_type = "literary_tropes" + + def __init__( + self, + mapping: Mapping[str, str] | None = None, + remove_matched_subjects: bool = True, + ) -> None: + self.rules = (PrefixRule("trope"), MappingRule(mapping)) + self.remove_matched_subjects = remove_matched_subjects diff --git a/rule_packs/main_topics.py b/rule_packs/main_topics.py new file mode 100644 index 0000000..5c4e359 --- /dev/null +++ b/rule_packs/main_topics.py @@ -0,0 +1,22 @@ +"""Rule pack for main_topics tags.""" + +from __future__ import annotations + +from collections.abc import Mapping + +from rule_packs.utils import SubjectPack +from rules import MappingRule, PrefixRule + + +class MainTopicsPack(SubjectPack): + name = "main_topics" + output_types = ("main_topics",) + output_type = "main_topics" + + def __init__( + self, + mapping: Mapping[str, str] | None = None, + remove_matched_subjects: bool = True, + ) -> None: + self.rules = (PrefixRule("topic"), MappingRule(mapping)) + self.remove_matched_subjects = remove_matched_subjects diff --git a/rule_packs/moods.py b/rule_packs/moods.py new file mode 100644 index 0000000..c436e0e --- /dev/null +++ b/rule_packs/moods.py @@ -0,0 +1,16 @@ +"""Rule pack for moods tags.""" + +from __future__ import annotations + +from rule_packs.utils import SubjectPack +from rules import PrefixRule + + +class MoodsPack(SubjectPack): + name = "moods" + output_types = ("moods",) + output_type = "moods" + + def __init__(self, remove_matched_subjects: bool = True) -> None: + self.rules = (PrefixRule("mood"),) + self.remove_matched_subjects = remove_matched_subjects diff --git a/rule_packs/people.py b/rule_packs/people.py new file mode 100644 index 0000000..e5660f1 --- /dev/null +++ b/rule_packs/people.py @@ -0,0 +1,23 @@ +"""Rule pack for subject_people.""" + +from __future__ import annotations + +from collections.abc import Mapping + +from core.run_state import RunState +from rule_engine.base import RulePack +from rules import OverrideRule + + +class PeoplePack(RulePack): + name = "people" + output_types = ("people",) + + def __init__(self, overrides: Mapping[str, str] | None = None) -> None: + self.rule = OverrideRule(overrides) + + def apply(self, state: RunState) -> None: + for raw in state.work.get("subject_people", []): + value = self.rule.apply(raw) + if value is not None: + state.add("people", value) diff --git a/rule_packs/places.py b/rule_packs/places.py new file mode 100644 index 0000000..0a13464 --- /dev/null +++ b/rule_packs/places.py @@ -0,0 +1,23 @@ +"""Rule pack for subject_places.""" + +from __future__ import annotations + +from collections.abc import Mapping + +from core.run_state import RunState +from rule_engine.base import RulePack +from rules import OverrideRule + + +class PlacesPack(RulePack): + name = "places" + output_types = ("places",) + + def __init__(self, overrides: Mapping[str, str] | None = None) -> None: + self.rule = OverrideRule(overrides) + + def apply(self, state: RunState) -> None: + for raw in state.work.get("subject_places", []): + value = self.rule.apply(raw) + if value is not None: + state.add("places", value) diff --git a/rule_packs/subgenres.py b/rule_packs/subgenres.py new file mode 100644 index 0000000..eb832c5 --- /dev/null +++ b/rule_packs/subgenres.py @@ -0,0 +1,22 @@ +"""Rule pack for subgenre tags.""" + +from __future__ import annotations + +from collections.abc import Mapping + +from rule_packs.utils import SubjectPack +from rules import MappingRule, PrefixRule + + +class SubgenresPack(SubjectPack): + name = "subgenres" + output_types = ("subgenres",) + output_type = "subgenres" + + def __init__( + self, + mapping: Mapping[str, str] | None = None, + remove_matched_subjects: bool = True, + ) -> None: + self.rules = (PrefixRule("subgenre"), MappingRule(mapping)) + self.remove_matched_subjects = remove_matched_subjects diff --git a/rule_packs/subject_diagnostics.py b/rule_packs/subject_diagnostics.py new file mode 100644 index 0000000..4e6a0f7 --- /dev/null +++ b/rule_packs/subject_diagnostics.py @@ -0,0 +1,41 @@ +"""Rule pack for dropped, reading-level, classification, and unmapped subjects.""" + +from __future__ import annotations + +from core.run_state import RunState +from rule_engine.base import RulePack +from rule_engine.normalization import ( + is_classification_code, + is_reading_level, + normalize, +) + + +class SubjectDiagnosticsPack(RulePack): + name = "subject_diagnostics" + output_types = ("reading_level", "classification_codes", "unmapped") + + def __init__(self, droppable: set[str] | None = None) -> None: + self.droppable = set(droppable or ()) + + def apply(self, state: RunState) -> None: + for raw in state.remaining_subjects: + key = normalize(raw) + if key in self.droppable: + continue + + if is_reading_level(raw): + value = raw.strip() + if value: + state.add("reading_level", value) + continue + + if is_classification_code(raw): + value = raw.strip() + if value: + state.add("classification_codes", value) + continue + + value = raw.strip() + if value: + state.add("unmapped", value) diff --git a/rule_packs/times.py b/rule_packs/times.py new file mode 100644 index 0000000..c50a61a --- /dev/null +++ b/rule_packs/times.py @@ -0,0 +1,21 @@ +"""Rule pack for subject_times.""" + +from __future__ import annotations + +from core.run_state import RunState +from rule_engine.base import RulePack +from rules import PassthroughRule + + +class TimesPack(RulePack): + name = "times" + output_types = ("times",) + + def __init__(self) -> None: + self.rule = PassthroughRule() + + def apply(self, state: RunState) -> None: + for raw in state.work.get("subject_times", []): + value = self.rule.apply(raw) + if value is not None: + state.add("times", value) diff --git a/rule_packs/utils.py b/rule_packs/utils.py new file mode 100644 index 0000000..c27827e --- /dev/null +++ b/rule_packs/utils.py @@ -0,0 +1,65 @@ +"""Shared helpers for subject-based packs.""" + +from __future__ import annotations + +from collections.abc import Iterable +from typing import Protocol + +from core.run_state import RunState +from rule_engine.base import RulePack + + +class SubjectValueRule(Protocol): + def match(self, raw: str) -> str | None: ... + + +def classify_subject_value(raw: str, rules: Iterable[SubjectValueRule]) -> str | None: + for rule in rules: + match = rule.match(raw) + if match is not None: + return match + return None + + +def classify_subject_values( + state: RunState, + output_type: str, + rules: Iterable[SubjectValueRule], +) -> None: + next_subjects: list[str] = [] + for raw in state.remaining_subjects: + match = classify_subject_value(raw, rules) + if match is None: + next_subjects.append(raw) + continue + state.add(output_type, match) + state.remaining_subjects = next_subjects + + +def apply_subject_pack( + state: RunState, + output_type: str, + rules: Iterable[SubjectValueRule], + remove_matched_subjects: bool, +) -> None: + if remove_matched_subjects: + classify_subject_values(state, output_type, rules) + return + for raw in state.remaining_subjects: + match = classify_subject_value(raw, rules) + if match is not None: + state.add(output_type, match) + + +class SubjectPack(RulePack): + """Small helper for packs that operate on the shared subject sequence.""" + + output_type = "" + + def apply(self, state: RunState) -> None: + apply_subject_pack( + state, + output_type=self.output_type, + rules=self.rules, + remove_matched_subjects=self.remove_matched_subjects, + ) diff --git a/rules/__init__.py b/rules/__init__.py new file mode 100644 index 0000000..9cbc515 --- /dev/null +++ b/rules/__init__.py @@ -0,0 +1,8 @@ +"""Composable rule units for pack implementations.""" + +from .mapping_rule import MappingRule +from .override_rule import OverrideRule +from .passthrough_rule import PassthroughRule +from .prefix_rule import PrefixRule + +__all__ = ["MappingRule", "OverrideRule", "PassthroughRule", "PrefixRule"] diff --git a/rules/mapping_rule.py b/rules/mapping_rule.py new file mode 100644 index 0000000..2cb85d3 --- /dev/null +++ b/rules/mapping_rule.py @@ -0,0 +1,17 @@ +"""Direct mapping lookups for normalized values.""" + +from __future__ import annotations + +from collections.abc import Mapping + +from rule_engine.normalization import normalize + + +class MappingRule: + """Match normalized input values against a provided mapping.""" + + def __init__(self, mapping: Mapping[str, str] | None = None) -> None: + self.mapping = dict(mapping or {}) + + def match(self, raw: str) -> str | None: + return self.mapping.get(normalize(raw)) diff --git a/rules/override_rule.py b/rules/override_rule.py new file mode 100644 index 0000000..0fe74c1 --- /dev/null +++ b/rules/override_rule.py @@ -0,0 +1,20 @@ +"""Override-based normalization for field values.""" + +from __future__ import annotations + +from collections.abc import Mapping + +from rule_engine.normalization import normalize + + +class OverrideRule: + """Normalize a field value using overrides with raw fallback.""" + + def __init__(self, overrides: Mapping[str, str] | None = None) -> None: + self.overrides = dict(overrides or {}) + + def apply(self, raw: str) -> str | None: + cleaned = raw.strip() + if not cleaned: + return None + return self.overrides.get(normalize(raw), cleaned) diff --git a/rules/passthrough_rule.py b/rules/passthrough_rule.py new file mode 100644 index 0000000..5dbb88b --- /dev/null +++ b/rules/passthrough_rule.py @@ -0,0 +1,11 @@ +"""Passthrough normalization for field values.""" + +from __future__ import annotations + + +class PassthroughRule: + """Return cleaned field values without additional transformation.""" + + def apply(self, raw: str) -> str | None: + cleaned = raw.strip() + return cleaned or None diff --git a/rules/prefix_rule.py b/rules/prefix_rule.py new file mode 100644 index 0000000..11b49f9 --- /dev/null +++ b/rules/prefix_rule.py @@ -0,0 +1,21 @@ +"""Prefix-based matching for subject values.""" + +from __future__ import annotations + + +class PrefixRule: + """Match values like ``theme:love`` and return the normalized payload.""" + + def __init__(self, prefix: str) -> None: + self.prefix = prefix + + def match(self, raw: str) -> str | None: + if not self.prefix or ":" not in raw: + return None + prefix, _, value = raw.partition(":") + if prefix.strip().lower() != self.prefix: + return None + cleaned = value.strip() + if not cleaned: + return None + return cleaned.title() diff --git a/scripts/migrate_subjects.py b/scripts/migrate_subjects.py index 4ec609b..57e8018 100644 --- a/scripts/migrate_subjects.py +++ b/scripts/migrate_subjects.py @@ -15,219 +15,32 @@ import argparse import json import os -import re import sys from pathlib import Path -import requests +REPO_ROOT = Path(__file__).resolve().parent.parent +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from core.classifier_assembler import build_subject_classifier +from core.pack_registry import AVAILABLE_PACK_NAMES # --------------------------------------------------------------------------- # Paths # --------------------------------------------------------------------------- -REPO_ROOT = Path(__file__).parent.parent -MAPPINGS_DIR = Path(__file__).parent / "mappings" - OL_WORK_URL = "https://openlibrary.org/works/{work_id}.json" -# --------------------------------------------------------------------------- -# Load mappings -# --------------------------------------------------------------------------- - -def load_mapping(name: str) -> dict[str, str]: - """Load a JSON mapping file from scripts/mappings/.""" - path = MAPPINGS_DIR / f"{name}.json" - if not path.exists(): - return {} - with open(path) as f: - return json.load(f) - - -def load_set(name: str) -> set[str]: - """Load a JSON list file as a set (e.g. droppable.json).""" - path = MAPPINGS_DIR / f"{name}.json" - if not path.exists(): - return set() - with open(path) as f: - data = json.load(f) - if isinstance(data, list): - return {s.lower().strip() for s in data} - return set(data.keys()) - - -# --------------------------------------------------------------------------- -# Normalization helpers -# --------------------------------------------------------------------------- - -READING_LEVEL_RE = re.compile( - r"reading level.grade\s*\d+|grade\s*\d+|rl\s*\d+", re.IGNORECASE -) -CLASSIFICATION_RE = re.compile( - r"^[0-9]{3}(\.[0-9]+)?$|^[a-z]{1,3}\s*[0-9]+|^pr[0-9]", re.IGNORECASE -) - - -def normalize(s: str) -> str: - """Lowercase and strip a subject string for mapping lookup.""" - return s.lower().strip() - - -def is_reading_level(s: str) -> bool: - return bool(READING_LEVEL_RE.search(s)) - - -def is_classification_code(s: str) -> bool: - return bool(CLASSIFICATION_RE.match(s.strip())) - - -# --------------------------------------------------------------------------- -# Core classifier -# --------------------------------------------------------------------------- - -class SubjectClassifier: - def __init__(self): - self.genres_map = load_mapping("genres") - self.subgenres_map = load_mapping("subgenres") - self.formats_map = load_mapping("content_formats") - self.themes_map = load_mapping("literary_themes") - self.tropes_map = load_mapping("literary_tropes") - self.topics_map = load_mapping("main_topics") - self.audience_map = load_mapping("audience") - self.droppable = load_set("droppable") - self.people_overrides = load_mapping("people_overrides") - self.places_overrides = load_mapping("places_overrides") - - def classify_subject(self, raw: str) -> tuple[str, str | None]: - """ - Classify a single subject string. - - Returns (type, canonical_value) where type is one of: - literary_form, genres, subgenres, content_formats, literary_themes, - literary_tropes, main_topics, audience, reading_level, - classification_code, drop, unmapped - """ - key = normalize(raw) - - # Audience strings (before hard drops, since some overlap) - if key in self.audience_map: - return ("audience", self.audience_map[key]) - - # Hard drops - if key in self.droppable: - return ("drop", None) - - # Reading levels - if is_reading_level(raw): - return ("reading_level", raw.strip()) - - # Classification codes (Dewey, LC call numbers) - if is_classification_code(raw): - return ("classification_code", raw.strip()) - - # Explicit prefix-typed tags (e.g. "form:novel", "genre:tragedy") - if ":" in raw: - prefix, _, value = raw.partition(":") - prefix = prefix.strip().lower() - value = value.strip() - type_map = { - "form": "literary_form", - "audience": "audience", - "genre": "genres", - "subgenre": "subgenres", - "format": "content_formats", - "theme": "literary_themes", - "trope": "literary_tropes", - "topic": "main_topics", - "mood": "moods", - } - if prefix in type_map: - return (type_map[prefix], value.title()) - - # Mapping lookups (in priority order) - if key in self.genres_map: - return ("genres", self.genres_map[key]) - if key in self.subgenres_map: - return ("subgenres", self.subgenres_map[key]) - if key in self.formats_map: - return ("content_formats", self.formats_map[key]) - if key in self.themes_map: - return ("literary_themes", self.themes_map[key]) - if key in self.tropes_map: - return ("literary_tropes", self.tropes_map[key]) - if key in self.topics_map: - return ("main_topics", self.topics_map[key]) - - return ("unmapped", raw.strip()) - - def classify_work(self, work: dict) -> dict: - """ - Given a work JSON dict (from OL API), produce a structured tag output. - """ - result: dict[str, list] = { - "literary_form": [], - "audience": [], - "genres": [], - "subgenres": [], - "content_formats": [], - "moods": [], - "literary_themes": [], - "literary_tropes": [], - "main_topics": [], - "sub_topics": [], - "people": [], - "places": [], - "times": [], - "things": [], - "reading_level": [], - "classification_codes": [], - "unmapped": [], - } - - # Classify flat subjects - for raw in work.get("subjects", []): - tag_type, value = self.classify_subject(raw) - if tag_type == "drop" or value is None: - continue - if tag_type == "reading_level": - result["reading_level"].append(value) - elif tag_type == "classification_code": - result["classification_codes"].append(value) - elif tag_type in result: - if value not in result[tag_type]: - result[tag_type].append(value) - else: - result["unmapped"].append(raw) - - # subject_people → canonical names - for raw in work.get("subject_people", []): - key = normalize(raw) - canonical = self.people_overrides.get(key, raw.strip()) - if canonical not in result["people"]: - result["people"].append(canonical) - - # subject_places → canonical places - for raw in work.get("subject_places", []): - key = normalize(raw) - canonical = self.places_overrides.get(key, raw.strip()) - if canonical not in result["places"]: - result["places"].append(canonical) - - # subject_times → pass through (times are free-form) - for raw in work.get("subject_times", []): - cleaned = raw.strip() - if cleaned and cleaned not in result["times"]: - result["times"].append(cleaned) - - return result - - # --------------------------------------------------------------------------- # Fetching # --------------------------------------------------------------------------- + def fetch_work(work_id: str) -> dict: """Fetch a work JSON from Open Library.""" + import requests + work_id = work_id.replace("/works/", "").strip() if not work_id.endswith(".json"): url = OL_WORK_URL.format(work_id=work_id) @@ -247,6 +60,7 @@ def load_work_file(path: str) -> dict: # Output # --------------------------------------------------------------------------- + def print_result(work_id: str, result: dict): print(f"\n=== {work_id} ===") for key, values in result.items(): @@ -268,6 +82,7 @@ def write_result(work_id: str, result: dict, output_dir: str): # CLI # --------------------------------------------------------------------------- + def main(): parser = argparse.ArgumentParser( description="Migrate OL legacy subjects to canonical typed tags." @@ -277,11 +92,21 @@ def main(): group.add_argument("--file", help="Path to a local work JSON file") group.add_argument("--batch", help="Path to newline-delimited OL Work IDs file") - parser.add_argument("--output", default="output", help="Output directory for batch mode") - parser.add_argument("--dry-run", action="store_true", help="Print results, don't write files") + parser.add_argument( + "--output", default="output", help="Output directory for batch mode" + ) + parser.add_argument( + "--dry-run", action="store_true", help="Print results, don't write files" + ) + parser.add_argument( + "--pack", + action="append", + choices=AVAILABLE_PACK_NAMES, + help="Enable only the named rule pack. Repeat to combine multiple packs.", + ) args = parser.parse_args() - classifier = SubjectClassifier() + classifier = build_subject_classifier(args.pack) if args.work: print(f"Fetching {args.work}...") diff --git a/scripts/run_legacy_subjects.sh b/scripts/run_legacy_subjects.sh new file mode 100755 index 0000000..829baa4 --- /dev/null +++ b/scripts/run_legacy_subjects.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +python3 "${REPO_ROOT}/scripts/migrate_subjects.py" \ + --pack literary_form \ + --pack audience \ + --pack genres \ + --pack subgenres \ + --pack content_formats \ + --pack moods \ + --pack literary_themes \ + --pack literary_tropes \ + --pack main_topics \ + --pack subject_diagnostics \ + --pack people \ + --pack places \ + --pack times \ + "$@" From e656611eb32fb77f6029310e337f5ea4558b8bd3 Mon Sep 17 00:00:00 2001 From: Kaftow <166228791+Kaftow@users.noreply.github.com> Date: Tue, 21 Apr 2026 15:25:40 +0900 Subject: [PATCH 3/9] Update README.md --- scripts/README.md | 63 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 5 deletions(-) diff --git a/scripts/README.md b/scripts/README.md index 97c6ba7..5ac204c 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -14,10 +14,11 @@ Open Library works currently have a flat `subjects` list (plus `subject_people`, ### `migrate_subjects.py` -The main migration tool. Given a work's OL JSON, it: +The current runner/compatibility entry point. Given a work's OL JSON, it: 1. Loads the legacy `subjects`, `subject_people`, `subject_places`, and `subject_times` lists -2. Applies rule-based and keyword matching to classify each string into the correct canonical type +2. Builds a `SubjectClassifier` from one or more enabled rule packs +3. Applies rule-based and keyword matching to classify each string into the correct canonical type 3. Outputs a structured tag object ready for import into the new schema **Usage:** @@ -28,13 +29,29 @@ python scripts/migrate_subjects.py --work OL82563W # From a local JSON file python scripts/migrate_subjects.py --file work.json +# Legacy-compatible fixed-order wrapper +./scripts/run_legacy_subjects.sh --file work.json + # Batch from a newline-delimited list of OL IDs python scripts/migrate_subjects.py --batch ol_ids.txt --output output/ # Dry run (print proposed mappings without writing) python scripts/migrate_subjects.py --work OL82563W --dry-run + +# Run the old full sequence explicitly through the wrapper +./scripts/run_legacy_subjects.sh --file work.json --dry-run + +# Run only a subset of rule packs +python scripts/migrate_subjects.py --file work.json --pack genres --pack content_formats --pack subject_diagnostics --dry-run + +# Run a single tag-type module +python scripts/migrate_subjects.py --file work.json --pack content_formats --dry-run ``` +`migrate_subjects.py` no longer enables a default full preset when `--pack` is omitted. If you want the old full sequence, use `run_legacy_subjects.sh` or pass the pack list explicitly. + +`run_legacy_subjects.sh` is just a thin wrapper around `migrate_subjects.py` with the pack order written out explicitly, so it is easy to inspect and change. Any extra CLI args are forwarded as-is. + **Output format:** ```json { @@ -60,12 +77,49 @@ The `unmapped` field collects strings that couldn't be classified — these are --- +### Architecture + +The reusable classification core now lives outside the script entry point: + +```text +core/ + json_loader.py # JSON resource loading for default assembly + subject_classifier.py # public work-level orchestration core + pack_registry.py # stable pack names -> factories / presets + classifier_assembler.py # pack resolution + classifier assembly + migrate_subject_classifier.py # compatibility shim for older imports +rule_engine/ + base.py # RulePack interface + normalization.py # shared text normalization helpers +rules/ + prefix_rule.py # subject prefix matching + mapping_rule.py # normalized direct mapping + override_rule.py # override-based field normalization + passthrough_rule.py # cleaned passthrough fields +rule_packs/ + genres.py # one module per tag type + content_formats.py + audience.py + literary_themes.py + literary_tropes.py + main_topics.py + people.py + places.py + times.py +config/ + packs/ # future static pack configs +``` + +`scripts/migrate_subjects.py` remains the operational entry point, but classification logic is now encapsulated in the shared core so future runners can reuse it. + +The classification core itself is kept narrow: `SubjectClassifier` consumes a normalized `work` object plus already-constructed packs, and returns a result. JSON resource loading now lives in the default assembly layer rather than inside individual packs. + ### Adding Mapping Rules -Mappings live in `scripts/mappings/`. Each file covers one tag type: +Mappings live in `resources/mappings/`. Each file covers one tag type: ``` -scripts/ +resources/ mappings/ genres.json # legacy string → canonical genre subgenres.json # legacy string → canonical subgenre @@ -89,7 +143,6 @@ Each mapping file is a JSON object where keys are legacy strings (lowercase, str } ``` -To add a new mapping: edit the appropriate file and open a PR. No code changes needed for new string mappings. --- From 025dd517ba43e5bf0bb867056df0c2e72aa7df56 Mon Sep 17 00:00:00 2001 From: Kaftow <166228791+Kaftow@users.noreply.github.com> Date: Wed, 22 Apr 2026 00:22:57 +0900 Subject: [PATCH 4/9] Move pack default assembly into pack-owned factories --- core/pack_registry.py | 50 ++++++++----------------------- rule_packs/audience.py | 8 +++++ rule_packs/content_formats.py | 8 +++++ rule_packs/genres.py | 8 +++++ rule_packs/literary_form.py | 4 +++ rule_packs/literary_themes.py | 8 +++++ rule_packs/literary_tropes.py | 8 +++++ rule_packs/main_topics.py | 8 +++++ rule_packs/moods.py | 4 +++ rule_packs/people.py | 5 ++++ rule_packs/places.py | 5 ++++ rule_packs/subgenres.py | 8 +++++ rule_packs/subject_diagnostics.py | 5 ++++ rule_packs/times.py | 4 +++ 14 files changed, 96 insertions(+), 37 deletions(-) diff --git a/core/pack_registry.py b/core/pack_registry.py index b1df0e7..3130d31 100644 --- a/core/pack_registry.py +++ b/core/pack_registry.py @@ -4,7 +4,6 @@ from typing import Callable -from core.json_loader import load_mapping, load_set from rule_packs import ( AudiencePack, ContentFormatsPack, @@ -44,42 +43,19 @@ } PACK_FACTORIES: dict[str, PackFactory] = { - "literary_form": lambda: LiteraryFormPack(remove_matched_subjects=True), - "audience": lambda: AudiencePack( - mapping=load_mapping("audience"), - remove_matched_subjects=True, - ), - "genres": lambda: GenresPack( - mapping=load_mapping("genres"), - remove_matched_subjects=True, - ), - "subgenres": lambda: SubgenresPack( - mapping=load_mapping("subgenres"), - remove_matched_subjects=True, - ), - "content_formats": lambda: ContentFormatsPack( - mapping=load_mapping("content_formats"), - remove_matched_subjects=True, - ), - "moods": lambda: MoodsPack(remove_matched_subjects=True), - "literary_themes": lambda: LiteraryThemesPack( - mapping=load_mapping("literary_themes"), - remove_matched_subjects=True, - ), - "literary_tropes": lambda: LiteraryTropesPack( - mapping=load_mapping("literary_tropes"), - remove_matched_subjects=True, - ), - "main_topics": lambda: MainTopicsPack( - mapping=load_mapping("main_topics"), - remove_matched_subjects=True, - ), - "subject_diagnostics": lambda: SubjectDiagnosticsPack( - droppable=load_set("droppable") - ), - "people": lambda: PeoplePack(overrides=load_mapping("people_overrides")), - "places": lambda: PlacesPack(overrides=load_mapping("places_overrides")), - "times": TimesPack, + "literary_form": LiteraryFormPack.default, + "audience": AudiencePack.default, + "genres": GenresPack.default, + "subgenres": SubgenresPack.default, + "content_formats": ContentFormatsPack.default, + "moods": MoodsPack.default, + "literary_themes": LiteraryThemesPack.default, + "literary_tropes": LiteraryTropesPack.default, + "main_topics": MainTopicsPack.default, + "subject_diagnostics": SubjectDiagnosticsPack.default, + "people": PeoplePack.default, + "places": PlacesPack.default, + "times": TimesPack.default, } AVAILABLE_PACK_NAMES = tuple(sorted({*PACK_FACTORIES, *PACK_PRESETS})) diff --git a/rule_packs/audience.py b/rule_packs/audience.py index 165c391..8f59146 100644 --- a/rule_packs/audience.py +++ b/rule_packs/audience.py @@ -4,6 +4,7 @@ from collections.abc import Mapping +from core.json_loader import load_mapping from rule_packs.utils import SubjectPack from rules import MappingRule, PrefixRule @@ -20,3 +21,10 @@ def __init__( ) -> None: self.rules = (PrefixRule("audience"), MappingRule(mapping)) self.remove_matched_subjects = remove_matched_subjects + + @classmethod + def default(cls) -> "AudiencePack": + return cls( + mapping=load_mapping("audience"), + remove_matched_subjects=True, + ) diff --git a/rule_packs/content_formats.py b/rule_packs/content_formats.py index c1260b4..c96e197 100644 --- a/rule_packs/content_formats.py +++ b/rule_packs/content_formats.py @@ -4,6 +4,7 @@ from collections.abc import Mapping +from core.json_loader import load_mapping from rule_packs.utils import SubjectPack from rules import MappingRule, PrefixRule @@ -20,3 +21,10 @@ def __init__( ) -> None: self.rules = (PrefixRule("format"), MappingRule(mapping)) self.remove_matched_subjects = remove_matched_subjects + + @classmethod + def default(cls) -> "ContentFormatsPack": + return cls( + mapping=load_mapping("content_formats"), + remove_matched_subjects=True, + ) diff --git a/rule_packs/genres.py b/rule_packs/genres.py index b110f12..1812744 100644 --- a/rule_packs/genres.py +++ b/rule_packs/genres.py @@ -4,6 +4,7 @@ from collections.abc import Mapping +from core.json_loader import load_mapping from rule_packs.utils import SubjectPack from rules import MappingRule, PrefixRule @@ -20,3 +21,10 @@ def __init__( ) -> None: self.rules = (PrefixRule("genre"), MappingRule(mapping)) self.remove_matched_subjects = remove_matched_subjects + + @classmethod + def default(cls) -> "GenresPack": + return cls( + mapping=load_mapping("genres"), + remove_matched_subjects=True, + ) diff --git a/rule_packs/literary_form.py b/rule_packs/literary_form.py index 1972b30..8c02391 100644 --- a/rule_packs/literary_form.py +++ b/rule_packs/literary_form.py @@ -14,3 +14,7 @@ class LiteraryFormPack(SubjectPack): def __init__(self, remove_matched_subjects: bool = True) -> None: self.rules = (PrefixRule("form"),) self.remove_matched_subjects = remove_matched_subjects + + @classmethod + def default(cls) -> "LiteraryFormPack": + return cls(remove_matched_subjects=True) diff --git a/rule_packs/literary_themes.py b/rule_packs/literary_themes.py index 2357776..7ccb6bb 100644 --- a/rule_packs/literary_themes.py +++ b/rule_packs/literary_themes.py @@ -4,6 +4,7 @@ from collections.abc import Mapping +from core.json_loader import load_mapping from rule_packs.utils import SubjectPack from rules import MappingRule, PrefixRule @@ -20,3 +21,10 @@ def __init__( ) -> None: self.rules = (PrefixRule("theme"), MappingRule(mapping)) self.remove_matched_subjects = remove_matched_subjects + + @classmethod + def default(cls) -> "LiteraryThemesPack": + return cls( + mapping=load_mapping("literary_themes"), + remove_matched_subjects=True, + ) diff --git a/rule_packs/literary_tropes.py b/rule_packs/literary_tropes.py index 1c1db97..9a18f81 100644 --- a/rule_packs/literary_tropes.py +++ b/rule_packs/literary_tropes.py @@ -4,6 +4,7 @@ from collections.abc import Mapping +from core.json_loader import load_mapping from rule_packs.utils import SubjectPack from rules import MappingRule, PrefixRule @@ -20,3 +21,10 @@ def __init__( ) -> None: self.rules = (PrefixRule("trope"), MappingRule(mapping)) self.remove_matched_subjects = remove_matched_subjects + + @classmethod + def default(cls) -> "LiteraryTropesPack": + return cls( + mapping=load_mapping("literary_tropes"), + remove_matched_subjects=True, + ) diff --git a/rule_packs/main_topics.py b/rule_packs/main_topics.py index 5c4e359..68d6320 100644 --- a/rule_packs/main_topics.py +++ b/rule_packs/main_topics.py @@ -4,6 +4,7 @@ from collections.abc import Mapping +from core.json_loader import load_mapping from rule_packs.utils import SubjectPack from rules import MappingRule, PrefixRule @@ -20,3 +21,10 @@ def __init__( ) -> None: self.rules = (PrefixRule("topic"), MappingRule(mapping)) self.remove_matched_subjects = remove_matched_subjects + + @classmethod + def default(cls) -> "MainTopicsPack": + return cls( + mapping=load_mapping("main_topics"), + remove_matched_subjects=True, + ) diff --git a/rule_packs/moods.py b/rule_packs/moods.py index c436e0e..d6a2731 100644 --- a/rule_packs/moods.py +++ b/rule_packs/moods.py @@ -14,3 +14,7 @@ class MoodsPack(SubjectPack): def __init__(self, remove_matched_subjects: bool = True) -> None: self.rules = (PrefixRule("mood"),) self.remove_matched_subjects = remove_matched_subjects + + @classmethod + def default(cls) -> "MoodsPack": + return cls(remove_matched_subjects=True) diff --git a/rule_packs/people.py b/rule_packs/people.py index e5660f1..2ad387e 100644 --- a/rule_packs/people.py +++ b/rule_packs/people.py @@ -4,6 +4,7 @@ from collections.abc import Mapping +from core.json_loader import load_mapping from core.run_state import RunState from rule_engine.base import RulePack from rules import OverrideRule @@ -21,3 +22,7 @@ def apply(self, state: RunState) -> None: value = self.rule.apply(raw) if value is not None: state.add("people", value) + + @classmethod + def default(cls) -> "PeoplePack": + return cls(overrides=load_mapping("people_overrides")) diff --git a/rule_packs/places.py b/rule_packs/places.py index 0a13464..1757eba 100644 --- a/rule_packs/places.py +++ b/rule_packs/places.py @@ -4,6 +4,7 @@ from collections.abc import Mapping +from core.json_loader import load_mapping from core.run_state import RunState from rule_engine.base import RulePack from rules import OverrideRule @@ -21,3 +22,7 @@ def apply(self, state: RunState) -> None: value = self.rule.apply(raw) if value is not None: state.add("places", value) + + @classmethod + def default(cls) -> "PlacesPack": + return cls(overrides=load_mapping("places_overrides")) diff --git a/rule_packs/subgenres.py b/rule_packs/subgenres.py index eb832c5..97fb428 100644 --- a/rule_packs/subgenres.py +++ b/rule_packs/subgenres.py @@ -4,6 +4,7 @@ from collections.abc import Mapping +from core.json_loader import load_mapping from rule_packs.utils import SubjectPack from rules import MappingRule, PrefixRule @@ -20,3 +21,10 @@ def __init__( ) -> None: self.rules = (PrefixRule("subgenre"), MappingRule(mapping)) self.remove_matched_subjects = remove_matched_subjects + + @classmethod + def default(cls) -> "SubgenresPack": + return cls( + mapping=load_mapping("subgenres"), + remove_matched_subjects=True, + ) diff --git a/rule_packs/subject_diagnostics.py b/rule_packs/subject_diagnostics.py index 4e6a0f7..adfa89c 100644 --- a/rule_packs/subject_diagnostics.py +++ b/rule_packs/subject_diagnostics.py @@ -2,6 +2,7 @@ from __future__ import annotations +from core.json_loader import load_set from core.run_state import RunState from rule_engine.base import RulePack from rule_engine.normalization import ( @@ -39,3 +40,7 @@ def apply(self, state: RunState) -> None: value = raw.strip() if value: state.add("unmapped", value) + + @classmethod + def default(cls) -> "SubjectDiagnosticsPack": + return cls(droppable=load_set("droppable")) diff --git a/rule_packs/times.py b/rule_packs/times.py index c50a61a..6808ca8 100644 --- a/rule_packs/times.py +++ b/rule_packs/times.py @@ -19,3 +19,7 @@ def apply(self, state: RunState) -> None: value = self.rule.apply(raw) if value is not None: state.add("times", value) + + @classmethod + def default(cls) -> "TimesPack": + return cls() From 6002bd6f164708f34408a3284a90fc8d582b4039 Mon Sep 17 00:00:00 2001 From: Kaftow <166228791+Kaftow@users.noreply.github.com> Date: Wed, 22 Apr 2026 00:39:29 +0900 Subject: [PATCH 5/9] Print remaining and removed subjects in migrate_subjects.py --- core/run_state.py | 22 ++++++++++++++++ core/subject_classifier.py | 18 +++++++++++-- rule_packs/utils.py | 13 ++++++++++ scripts/migrate_subjects.py | 52 +++++++++++++++++++++++++------------ 4 files changed, 86 insertions(+), 19 deletions(-) diff --git a/core/run_state.py b/core/run_state.py index a78fae0..a2f524e 100644 --- a/core/run_state.py +++ b/core/run_state.py @@ -13,10 +13,32 @@ class RunState: work: Mapping[str, Any] result: dict[str, list[str]] + original_subjects: list[str] = field(default_factory=list) remaining_subjects: list[str] = field(default_factory=list) + removed_subjects: list[str] = field(default_factory=list) + subject_matches: list[dict[str, str]] = field(default_factory=list) def add(self, output_type: str, value: str) -> None: if output_type not in self.result: self.result[output_type] = [] if value not in self.result[output_type]: self.result[output_type].append(value) + + def record_subject_match( + self, + raw: str, + output_type: str, + value: str, + action: str, + ) -> None: + self.subject_matches.append( + { + "subject": raw, + "output_type": output_type, + "value": value, + "action": action, + } + ) + + def record_removed_subject(self, raw: str) -> None: + self.removed_subjects.append(raw) diff --git a/core/subject_classifier.py b/core/subject_classifier.py index f85aef2..c1ee98e 100644 --- a/core/subject_classifier.py +++ b/core/subject_classifier.py @@ -40,12 +40,26 @@ def __init__( self.output_types = tuple(output_types or DEFAULT_OUTPUT_TYPES) def classify_work(self, work: Mapping[str, Any]) -> dict[str, list[str]]: + """Return only the proposed tags for compatibility callers.""" + return self.classify_work_report(work)["proposed_tags"] + + def classify_work_report(self, work: Mapping[str, Any]) -> dict[str, Any]: """Run the enabled rule packs against a normalized work object.""" + original_subjects = list(work.get("subjects", [])) state = RunState( work=work, result={tag_type: [] for tag_type in self.output_types}, - remaining_subjects=list(work.get("subjects", [])), + original_subjects=original_subjects, + remaining_subjects=list(original_subjects), ) for pack in self.rule_packs: pack.apply(state) - return state.result + return { + "proposed_tags": state.result, + "subject_proposal": { + "original": state.original_subjects, + "removed": state.removed_subjects, + "remaining": state.remaining_subjects, + }, + "subject_matches": state.subject_matches, + } diff --git a/rule_packs/utils.py b/rule_packs/utils.py index c27827e..221f5ca 100644 --- a/rule_packs/utils.py +++ b/rule_packs/utils.py @@ -33,6 +33,13 @@ def classify_subject_values( next_subjects.append(raw) continue state.add(output_type, match) + state.record_subject_match( + raw=raw, + output_type=output_type, + value=match, + action="move", + ) + state.record_removed_subject(raw) state.remaining_subjects = next_subjects @@ -49,6 +56,12 @@ def apply_subject_pack( match = classify_subject_value(raw, rules) if match is not None: state.add(output_type, match) + state.record_subject_match( + raw=raw, + output_type=output_type, + value=match, + action="extract_only", + ) class SubjectPack(RulePack): diff --git a/scripts/migrate_subjects.py b/scripts/migrate_subjects.py index 57e8018..fe64208 100644 --- a/scripts/migrate_subjects.py +++ b/scripts/migrate_subjects.py @@ -61,20 +61,38 @@ def load_work_file(path: str) -> dict: # --------------------------------------------------------------------------- -def print_result(work_id: str, result: dict): +def print_report(work_id: str, report: dict): print(f"\n=== {work_id} ===") - for key, values in result.items(): + print(" proposed_tags:") + for key, values in report["proposed_tags"].items(): if values: - print(f" {key}:") + print(f" {key}:") for v in values: - print(f" - {v}") - - -def write_result(work_id: str, result: dict, output_dir: str): + print(f" - {v}") + + subject_proposal = report["subject_proposal"] + print(" subject_proposal:") + for key in ("removed", "remaining"): + values = subject_proposal[key] + print(f" {key}:") + for value in values: + print(f" - {value}") + + if report["subject_matches"]: + print(" subject_matches:") + for match in report["subject_matches"]: + print( + " - " + f"{match['subject']} -> {match['output_type']}:{match['value']} " + f"({match['action']})" + ) + + +def write_report(work_id: str, report: dict, output_dir: str): os.makedirs(output_dir, exist_ok=True) out_path = Path(output_dir) / f"{work_id}.json" with open(out_path, "w") as f: - json.dump({"work_id": work_id, **result}, f, indent=2) + json.dump({"work_id": work_id, **report}, f, indent=2) print(f"Written: {out_path}") @@ -111,20 +129,20 @@ def main(): if args.work: print(f"Fetching {args.work}...") work = fetch_work(args.work) - result = classifier.classify_work(work) + report = classifier.classify_work_report(work) if args.dry_run: - print_result(args.work, result) + print_report(args.work, report) else: - write_result(args.work, result, args.output) + write_report(args.work, report, args.output) elif args.file: work = load_work_file(args.file) work_id = work.get("key", Path(args.file).stem).split("/")[-1] - result = classifier.classify_work(work) + report = classifier.classify_work_report(work) if args.dry_run: - print_result(work_id, result) + print_report(work_id, report) else: - write_result(work_id, result, args.output) + write_report(work_id, report, args.output) elif args.batch: with open(args.batch) as f: @@ -134,11 +152,11 @@ def main(): try: print(f"Processing {work_id}...") work = fetch_work(work_id) - result = classifier.classify_work(work) + report = classifier.classify_work_report(work) if args.dry_run: - print_result(work_id, result) + print_report(work_id, report) else: - write_result(work_id, result, args.output) + write_report(work_id, report, args.output) except Exception as e: print(f"ERROR processing {work_id}: {e}", file=sys.stderr) From 1830c7cfaa405a11bf63ca6d24afa9507d2825dd Mon Sep 17 00:00:00 2001 From: Kaftow <166228791+Kaftow@users.noreply.github.com> Date: Wed, 22 Apr 2026 00:56:53 +0900 Subject: [PATCH 6/9] Support rule-level move and extract-only subject actions --- core/run_state.py | 4 +++ rule_packs/subject_diagnostics.py | 2 ++ rule_packs/utils.py | 56 ++++++++++++++----------------- rules/__init__.py | 9 ++++- rules/mapping_rule.py | 15 +++++++-- rules/match_result.py | 13 +++++++ rules/prefix_rule.py | 9 +++-- 7 files changed, 71 insertions(+), 37 deletions(-) create mode 100644 rules/match_result.py diff --git a/core/run_state.py b/core/run_state.py index a2f524e..658d5b4 100644 --- a/core/run_state.py +++ b/core/run_state.py @@ -16,6 +16,7 @@ class RunState: original_subjects: list[str] = field(default_factory=list) remaining_subjects: list[str] = field(default_factory=list) removed_subjects: list[str] = field(default_factory=list) + retained_matched_subjects: set[str] = field(default_factory=set) subject_matches: list[dict[str, str]] = field(default_factory=list) def add(self, output_type: str, value: str) -> None: @@ -42,3 +43,6 @@ def record_subject_match( def record_removed_subject(self, raw: str) -> None: self.removed_subjects.append(raw) + + def record_retained_subject(self, raw: str) -> None: + self.retained_matched_subjects.add(raw.lower().strip()) diff --git a/rule_packs/subject_diagnostics.py b/rule_packs/subject_diagnostics.py index adfa89c..a7849cd 100644 --- a/rule_packs/subject_diagnostics.py +++ b/rule_packs/subject_diagnostics.py @@ -24,6 +24,8 @@ def apply(self, state: RunState) -> None: key = normalize(raw) if key in self.droppable: continue + if key in state.retained_matched_subjects: + continue if is_reading_level(raw): value = raw.strip() diff --git a/rule_packs/utils.py b/rule_packs/utils.py index 221f5ca..00943b1 100644 --- a/rule_packs/utils.py +++ b/rule_packs/utils.py @@ -7,63 +7,59 @@ from core.run_state import RunState from rule_engine.base import RulePack +from rules import RuleMatch class SubjectValueRule(Protocol): - def match(self, raw: str) -> str | None: ... + def match(self, raw: str) -> RuleMatch | str | None: ... -def classify_subject_value(raw: str, rules: Iterable[SubjectValueRule]) -> str | None: +def _coerce_match(match: RuleMatch | str, default_action: str) -> RuleMatch: + if isinstance(match, RuleMatch): + return match + return RuleMatch(value=match, action=default_action) + + +def classify_subject_value( + raw: str, + rules: Iterable[SubjectValueRule], + default_action: str, +) -> RuleMatch | None: for rule in rules: match = rule.match(raw) if match is not None: - return match + return _coerce_match(match, default_action) return None -def classify_subject_values( +def apply_subject_pack( state: RunState, output_type: str, rules: Iterable[SubjectValueRule], + remove_matched_subjects: bool, ) -> None: + default_action = "move" if remove_matched_subjects else "extract_only" next_subjects: list[str] = [] for raw in state.remaining_subjects: - match = classify_subject_value(raw, rules) + match = classify_subject_value(raw, rules, default_action=default_action) if match is None: next_subjects.append(raw) continue - state.add(output_type, match) + state.add(output_type, match.value) state.record_subject_match( raw=raw, output_type=output_type, - value=match, - action="move", + value=match.value, + action=match.action, ) - state.record_removed_subject(raw) + if match.action == "move": + state.record_removed_subject(raw) + continue + state.record_retained_subject(raw) + next_subjects.append(raw) state.remaining_subjects = next_subjects -def apply_subject_pack( - state: RunState, - output_type: str, - rules: Iterable[SubjectValueRule], - remove_matched_subjects: bool, -) -> None: - if remove_matched_subjects: - classify_subject_values(state, output_type, rules) - return - for raw in state.remaining_subjects: - match = classify_subject_value(raw, rules) - if match is not None: - state.add(output_type, match) - state.record_subject_match( - raw=raw, - output_type=output_type, - value=match, - action="extract_only", - ) - - class SubjectPack(RulePack): """Small helper for packs that operate on the shared subject sequence.""" diff --git a/rules/__init__.py b/rules/__init__.py index 9cbc515..ed93b0e 100644 --- a/rules/__init__.py +++ b/rules/__init__.py @@ -1,8 +1,15 @@ """Composable rule units for pack implementations.""" +from .match_result import RuleMatch from .mapping_rule import MappingRule from .override_rule import OverrideRule from .passthrough_rule import PassthroughRule from .prefix_rule import PrefixRule -__all__ = ["MappingRule", "OverrideRule", "PassthroughRule", "PrefixRule"] +__all__ = [ + "MappingRule", + "OverrideRule", + "PassthroughRule", + "PrefixRule", + "RuleMatch", +] diff --git a/rules/mapping_rule.py b/rules/mapping_rule.py index 2cb85d3..2fd1bc7 100644 --- a/rules/mapping_rule.py +++ b/rules/mapping_rule.py @@ -5,13 +5,22 @@ from collections.abc import Mapping from rule_engine.normalization import normalize +from rules.match_result import RuleMatch class MappingRule: """Match normalized input values against a provided mapping.""" - def __init__(self, mapping: Mapping[str, str] | None = None) -> None: + def __init__( + self, + mapping: Mapping[str, str] | None = None, + default_action: str = "move", + ) -> None: self.mapping = dict(mapping or {}) + self.default_action = default_action - def match(self, raw: str) -> str | None: - return self.mapping.get(normalize(raw)) + def match(self, raw: str) -> RuleMatch | None: + value = self.mapping.get(normalize(raw)) + if value is None: + return None + return RuleMatch(value=value, action=self.default_action) diff --git a/rules/match_result.py b/rules/match_result.py new file mode 100644 index 0000000..5a3a16f --- /dev/null +++ b/rules/match_result.py @@ -0,0 +1,13 @@ +"""Structured subject-match results with per-rule actions.""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class RuleMatch: + """A normalized match value plus the subject-handling action to apply.""" + + value: str + action: str diff --git a/rules/prefix_rule.py b/rules/prefix_rule.py index 11b49f9..8ec6b9a 100644 --- a/rules/prefix_rule.py +++ b/rules/prefix_rule.py @@ -2,14 +2,17 @@ from __future__ import annotations +from rules.match_result import RuleMatch + class PrefixRule: """Match values like ``theme:love`` and return the normalized payload.""" - def __init__(self, prefix: str) -> None: + def __init__(self, prefix: str, action: str = "move") -> None: self.prefix = prefix + self.action = action - def match(self, raw: str) -> str | None: + def match(self, raw: str) -> RuleMatch | None: if not self.prefix or ":" not in raw: return None prefix, _, value = raw.partition(":") @@ -18,4 +21,4 @@ def match(self, raw: str) -> str | None: cleaned = value.strip() if not cleaned: return None - return cleaned.title() + return RuleMatch(value=cleaned.title(), action=self.action) From 98fd8135fbacd66948c6d52d39b153e00a74499f Mon Sep 17 00:00:00 2001 From: Kaftow <166228791+Kaftow@users.noreply.github.com> Date: Wed, 22 Apr 2026 00:57:52 +0900 Subject: [PATCH 7/9] Finish rule pack for content_formats --- rule_packs/content_formats.py | 40 +++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/rule_packs/content_formats.py b/rule_packs/content_formats.py index c96e197..e4d271d 100644 --- a/rule_packs/content_formats.py +++ b/rule_packs/content_formats.py @@ -8,6 +8,19 @@ from rule_packs.utils import SubjectPack from rules import MappingRule, PrefixRule +MOVE = "move" +EXTRACT_ONLY = "extract_only" + +# First-pass direct-match policies based on current dry-run evidence. +MOVE_TAGS = frozenset( + { + "Memoir", + "Anthology", + "Letters", + "Dictionary", + } +) + class ContentFormatsPack(SubjectPack): name = "content_formats" @@ -16,15 +29,30 @@ class ContentFormatsPack(SubjectPack): def __init__( self, - mapping: Mapping[str, str] | None = None, - remove_matched_subjects: bool = True, + move_mapping: Mapping[str, str] | None = None, + extract_only_mapping: Mapping[str, str] | None = None, ) -> None: - self.rules = (PrefixRule("format"), MappingRule(mapping)) - self.remove_matched_subjects = remove_matched_subjects + self.rules = ( + PrefixRule("format", action=EXTRACT_ONLY), + MappingRule(move_mapping, default_action=MOVE), + MappingRule(extract_only_mapping, default_action=EXTRACT_ONLY), + ) + self.remove_matched_subjects = False @classmethod def default(cls) -> "ContentFormatsPack": + mapping = load_mapping("content_formats") + move_mapping = { + legacy: canonical + for legacy, canonical in mapping.items() + if canonical in MOVE_TAGS + } + extract_only_mapping = { + legacy: canonical + for legacy, canonical in mapping.items() + if canonical not in MOVE_TAGS + } return cls( - mapping=load_mapping("content_formats"), - remove_matched_subjects=True, + move_mapping=move_mapping, + extract_only_mapping=extract_only_mapping, ) From 335c71f4deb00bd79ee882c836cadf1628490d27 Mon Sep 17 00:00:00 2001 From: Kaftow <166228791+Kaftow@users.noreply.github.com> Date: Wed, 22 Apr 2026 01:27:49 +0900 Subject: [PATCH 8/9] Delete unnecessary abstract layer --- core/__init__.py | 5 - core/classifier_assembler.py | 41 ------- core/migrate_subject_classifier.py | 17 --- core/pack_registry.py | 61 ---------- resources/mappings/audience.json | 23 ---- resources/mappings/genres.json | 75 ------------ resources/mappings/literary_themes.json | 66 ----------- resources/mappings/literary_tropes.json | 27 ----- resources/mappings/main_topics.json | 61 ---------- resources/mappings/people_overrides.json | 10 -- resources/mappings/places_overrides.json | 13 -- resources/mappings/subgenres.json | 60 ---------- rule_packs/__init__.py | 40 +------ rule_packs/audience.py | 30 ----- rule_packs/genres.py | 30 ----- rule_packs/literary_form.py | 20 ---- rule_packs/literary_themes.py | 30 ----- rule_packs/literary_tropes.py | 30 ----- rule_packs/main_topics.py | 30 ----- rule_packs/moods.py | 20 ---- rule_packs/people.py | 28 ----- rule_packs/places.py | 28 ----- rule_packs/subgenres.py | 30 ----- rule_packs/times.py | 25 ---- rules/__init__.py | 4 - rules/override_rule.py | 20 ---- rules/passthrough_rule.py | 11 -- scripts/README.md | 145 +++++++++++------------ scripts/migrate_subjects.py | 39 +++++- scripts/run_legacy_subjects.sh | 22 ---- 30 files changed, 109 insertions(+), 932 deletions(-) delete mode 100644 core/__init__.py delete mode 100644 core/classifier_assembler.py delete mode 100644 core/migrate_subject_classifier.py delete mode 100644 core/pack_registry.py delete mode 100644 resources/mappings/audience.json delete mode 100644 resources/mappings/genres.json delete mode 100644 resources/mappings/literary_themes.json delete mode 100644 resources/mappings/literary_tropes.json delete mode 100644 resources/mappings/main_topics.json delete mode 100644 resources/mappings/people_overrides.json delete mode 100644 resources/mappings/places_overrides.json delete mode 100644 resources/mappings/subgenres.json delete mode 100644 rule_packs/audience.py delete mode 100644 rule_packs/genres.py delete mode 100644 rule_packs/literary_form.py delete mode 100644 rule_packs/literary_themes.py delete mode 100644 rule_packs/literary_tropes.py delete mode 100644 rule_packs/main_topics.py delete mode 100644 rule_packs/moods.py delete mode 100644 rule_packs/people.py delete mode 100644 rule_packs/places.py delete mode 100644 rule_packs/subgenres.py delete mode 100644 rule_packs/times.py delete mode 100644 rules/override_rule.py delete mode 100644 rules/passthrough_rule.py delete mode 100755 scripts/run_legacy_subjects.sh diff --git a/core/__init__.py b/core/__init__.py deleted file mode 100644 index ee7aaf5..0000000 --- a/core/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Core orchestration and default migration assembly.""" - -from .subject_classifier import DEFAULT_OUTPUT_TYPES, SubjectClassifier - -__all__ = ["DEFAULT_OUTPUT_TYPES", "SubjectClassifier"] diff --git a/core/classifier_assembler.py b/core/classifier_assembler.py deleted file mode 100644 index 23a732e..0000000 --- a/core/classifier_assembler.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Assembly helpers for building migration classifiers.""" - -from __future__ import annotations - -from collections.abc import Iterable - -from core.json_loader import load_set -from core.pack_registry import ( - AVAILABLE_PACK_NAMES, - PACK_FACTORIES, - PACK_PRESETS, -) -from core.subject_classifier import SubjectClassifier - - -def resolve_pack_names(enabled_packs: Iterable[str] | None) -> list[str]: - """Expand presets into concrete stable pack names.""" - selected = list(enabled_packs or []) - expanded: list[str] = [] - for name in selected: - if name in PACK_PRESETS: - expanded.extend(PACK_PRESETS[name]) - continue - expanded.append(name) - return expanded - - -def build_subject_classifier( - enabled_packs: Iterable[str] | None = None, -) -> SubjectClassifier: - """Build the migration classifier from an explicit pack-name list.""" - selected = resolve_pack_names(enabled_packs) - missing = [name for name in selected if name not in PACK_FACTORIES] - if missing: - available = ", ".join(AVAILABLE_PACK_NAMES) - missing_display = ", ".join(sorted(missing)) - raise ValueError( - f"Unknown rule pack(s): {missing_display}. Available: {available}" - ) - - return SubjectClassifier(rule_packs=[PACK_FACTORIES[name]() for name in selected]) diff --git a/core/migrate_subject_classifier.py b/core/migrate_subject_classifier.py deleted file mode 100644 index 38cc1a1..0000000 --- a/core/migrate_subject_classifier.py +++ /dev/null @@ -1,17 +0,0 @@ -"""Compatibility wrapper for migration classifier assembly.""" - -from __future__ import annotations - -from core.classifier_assembler import ( - build_subject_classifier, - resolve_pack_names, -) -from core.pack_registry import AVAILABLE_PACK_NAMES, PACK_FACTORIES, PACK_PRESETS - -__all__ = [ - "AVAILABLE_PACK_NAMES", - "PACK_FACTORIES", - "PACK_PRESETS", - "build_subject_classifier", - "resolve_pack_names", -] diff --git a/core/pack_registry.py b/core/pack_registry.py deleted file mode 100644 index 3130d31..0000000 --- a/core/pack_registry.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Stable pack-name registry for migration assembly.""" - -from __future__ import annotations - -from typing import Callable - -from rule_packs import ( - AudiencePack, - ContentFormatsPack, - GenresPack, - LiteraryFormPack, - LiteraryThemesPack, - LiteraryTropesPack, - MainTopicsPack, - MoodsPack, - PeoplePack, - PlacesPack, - SubgenresPack, - SUBJECT_PACK_CLASSES, - SubjectDiagnosticsPack, - TimesPack, -) - -PackFactory = Callable[[], object] - -SUBJECT_PACK_BUILDERS = {pack_cls.name: pack_cls for pack_cls in SUBJECT_PACK_CLASSES} -PACK_PRESETS: dict[str, tuple[str, ...]] = { - "subject_mappings": ( - "literary_form", - "audience", - "genres", - "subgenres", - "content_formats", - "moods", - "literary_themes", - "literary_tropes", - "main_topics", - "subject_diagnostics", - "people", - "places", - "times", - ), -} - -PACK_FACTORIES: dict[str, PackFactory] = { - "literary_form": LiteraryFormPack.default, - "audience": AudiencePack.default, - "genres": GenresPack.default, - "subgenres": SubgenresPack.default, - "content_formats": ContentFormatsPack.default, - "moods": MoodsPack.default, - "literary_themes": LiteraryThemesPack.default, - "literary_tropes": LiteraryTropesPack.default, - "main_topics": MainTopicsPack.default, - "subject_diagnostics": SubjectDiagnosticsPack.default, - "people": PeoplePack.default, - "places": PlacesPack.default, - "times": TimesPack.default, -} - -AVAILABLE_PACK_NAMES = tuple(sorted({*PACK_FACTORIES, *PACK_PRESETS})) diff --git a/resources/mappings/audience.json b/resources/mappings/audience.json deleted file mode 100644 index b80e84a..0000000 --- a/resources/mappings/audience.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "juvenile fiction": "Juvenile", - "juvenile literature": "Juvenile", - "juvenile nonfiction": "Juvenile", - "children's fiction": "Children", - "children's literature": "Children", - "children's nonfiction": "Children", - "children's stories": "Children", - "picture books": "Children", - "board books": "Preschool", - "baby books": "Preschool", - "young adult fiction": "Young Adult", - "young adult literature": "Young Adult", - "young adult nonfiction": "Young Adult", - "teen fiction": "Young Adult", - "teenage fiction": "Young Adult", - "ya fiction": "Young Adult", - "readers (adult)": "Adult", - "adult fiction": "Adult", - "academic": "Academic", - "scholarly": "Academic", - "textbooks": "Academic" -} diff --git a/resources/mappings/genres.json b/resources/mappings/genres.json deleted file mode 100644 index 7220cbb..0000000 --- a/resources/mappings/genres.json +++ /dev/null @@ -1,75 +0,0 @@ -{ - "absurdist fiction": "Absurd", - "absurdism": "Absurd", - "action": "Action", - "action and adventure": "Adventure", - "adventure": "Adventure", - "adventure fiction": "Adventure", - "adventure stories": "Adventure", - "black comedy": "Comedy", - "british comedy": "Comedy", - "comedy": "Comedy", - "comic fiction": "Comedy", - "comedies": "Comedy", - "crime": "Crime", - "crime fiction": "Crime", - "crime stories": "Crime", - "criminal fiction": "Crime", - "detective and mystery stories": "Mystery", - "detective fiction": "Mystery", - "detective stories": "Mystery", - "drama": "Drama", - "dramatic fiction": "Drama", - "erotica": "Erotica", - "erotic fiction": "Erotica", - "fantasy": "Fantasy", - "fantasy fiction": "Fantasy", - "fantasy stories": "Fantasy", - "historical": "Historical", - "historical fiction": "Historical", - "historical novel": "Historical", - "fiction, historical": "Historical", - "horror": "Horror", - "horror fiction": "Horror", - "horror stories": "Horror", - "humor": "Humor", - "humorous fiction": "Humor", - "humorous stories": "Humor", - "lgbtq fiction": "LGBTQ+", - "gay fiction": "LGBTQ+", - "lesbian fiction": "LGBTQ+", - "queer fiction": "LGBTQ+", - "classic fiction": "Literary", - "classic literature": "Literary", - "classics": "Literary", - "literary fiction": "Literary", - "literature": "Literary", - "mystery fiction": "Mystery", - "mystery": "Mystery", - "mystery and suspense": "Mystery", - "mysteries": "Mystery", - "whodunit": "Mystery", - "mythology": "Mythology", - "mythological fiction": "Mythology", - "romance": "Romance", - "romance fiction": "Romance", - "romantic fiction": "Romance", - "love stories": "Romance", - "love story": "Romance", - "man-woman relationships, fiction": "Romance", - "satire": "Satire", - "satirical fiction": "Satire", - "science fiction": "Sci-Fi", - "sci-fi": "Sci-Fi", - "sf": "Sci-Fi", - "speculative fiction": "Sci-Fi", - "suspense fiction": "Thriller", - "thriller": "Thriller", - "thrillers": "Thriller", - "thriller fiction": "Thriller", - "tragedy": "Tragedy", - "tragic fiction": "Tragedy", - "western": "Western", - "westerns": "Western", - "western stories": "Western" -} diff --git a/resources/mappings/literary_themes.json b/resources/mappings/literary_themes.json deleted file mode 100644 index 7b8120f..0000000 --- a/resources/mappings/literary_themes.json +++ /dev/null @@ -1,66 +0,0 @@ -{ - "betrayal": "Betrayal", - "class conflict": "Class", - "class struggle": "Class", - "class warfare": "Class", - "class wars": "Class", - "social class": "Class", - "social conflict": "Class", - "coming of age": "Coming of Age", - "loss of innocence": "Coming of Age", - "death": "Death", - "death and dying": "Death", - "mortality": "Death", - "desire": "Desire", - "longing": "Desire", - "obsession": "Obsession", - "duty": "Duty", - "loyalty": "Duty", - "fate": "Fate", - "determinism": "Fate", - "freedom": "Freedom", - "liberation": "Freedom", - "autonomy": "Freedom", - "gender": "Gender", - "gender roles": "Gender", - "grief": "Grief", - "mourning": "Grief", - "guilt": "Guilt", - "remorse": "Guilt", - "identity": "Identity", - "self-discovery": "Identity", - "innocence": "Innocence", - "justice": "Justice", - "injustice": "Justice", - "love": "Love", - "romantic love": "Love", - "memory": "Memory", - "nostalgia": "Memory", - "mortality": "Mortality", - "nature": "Nature", - "environment": "Nature", - "power": "Power", - "power dynamics": "Power", - "abuse of power": "Power", - "state power": "Power", - "race": "Race", - "racism": "Race", - "racial identity": "Race", - "redemption": "Redemption", - "atonement": "Redemption", - "rejection": "Rejection", - "rejection (psychology)": "Rejection", - "rejet (psychologie)": "Rejection", - "revenge": "Revenge", - "vengeance": "Revenge", - "vengeance -- fiction": "Revenge", - "revenge -- fiction": "Revenge", - "sacrifice": "Sacrifice", - "survival": "Survival", - "truth": "Truth", - "deception": "Truth", - "honesty": "Truth", - "violence": "Violence", - "war": "War", - "warfare": "War" -} diff --git a/resources/mappings/literary_tropes.json b/resources/mappings/literary_tropes.json deleted file mode 100644 index 566f9b4..0000000 --- a/resources/mappings/literary_tropes.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "foundlings": "Foundlings", - "foundlings in fiction": "Foundlings", - "foundlings -- fiction": "Foundlings", - "enfants trouvés": "Foundlings", - "orphans": "Orphan Protagonist", - "orphans in fiction": "Orphan Protagonist", - "orphans -- fiction": "Orphan Protagonist", - "triangles (interpersonal relations)": "Love Triangles", - "triangles (interpersonal relationships)": "Love Triangles", - "triangles (interpersonal relations)--fiction": "Love Triangles", - "triangle (relations humaines)": "Love Triangles", - "love triangle": "Love Triangles", - "love triangles": "Love Triangles", - "inheritance and succession": "Inheritance and Succession", - "unreliable narrator": "Unreliable Narrator", - "enemies to lovers": "Enemies to Lovers", - "found family": "Found Family", - "chosen one": "Chosen One", - "antihero": "Antihero", - "slow burn": "Slow Burn", - "redemption arc": "Redemption Arc", - "forbidden love": "Forbidden Love", - "fake identity": "Fake Identity", - "frame narrative": "Frame Narrative", - "epistolary": "Epistolary Structure" -} diff --git a/resources/mappings/main_topics.json b/resources/mappings/main_topics.json deleted file mode 100644 index 17d6dd6..0000000 --- a/resources/mappings/main_topics.json +++ /dev/null @@ -1,61 +0,0 @@ -{ - "interpersonal relations": "Interpersonal relations", - "interpersonal relationships": "Interpersonal relations", - "interpersonal relations, fiction": "Interpersonal relations", - "family life": "Family life", - "families": "Family life", - "family": "Family life", - "fiction, family life, general": "Family life", - "social conditions": "Social conditions", - "social life and customs": "Social life and customs", - "manners and customs": "Manners and customs", - "mœurs et coutumes": "Manners and customs", - "inheritance": "Inheritance", - "inheritance and succession": "Inheritance", - "debt": "Debt", - "slavery": "Slavery", - "slavery in fiction": "Slavery", - "education": "Education", - "class": "Class", - "social class": "Class", - "war": "War", - "battles": "War", - "religion": "Religion", - "church": "Religion", - "magic": "Magic", - "witchcraft": "Witchcraft", - "medicine": "Medicine", - "technology": "Technology", - "politics": "Politics", - "government": "Politics", - "law": "Law", - "justice": "Justice", - "trade": "Trade", - "economics": "Economics", - "labor": "Labor", - "work": "Labor", - "immigration": "Immigration", - "colonialism": "Colonialism", - "empire": "Imperialism", - "imperialism": "Imperialism", - "race": "Race", - "gender": "Gender", - "feminism": "Feminism", - "sexuality": "Sexuality", - "language": "Language", - "art": "Art", - "music": "Music", - "science": "Science", - "nature": "Nature", - "environment": "Environment", - "travel": "Travel", - "exploration": "Exploration", - "philosophy": "Philosophy", - "ethics": "Ethics", - "morality": "Ethics", - "psychology": "Psychology", - "mental health": "Mental health", - "poverty": "Poverty", - "wealth": "Wealth", - "power": "Power" -} diff --git a/resources/mappings/people_overrides.json b/resources/mappings/people_overrides.json deleted file mode 100644 index c8576ab..0000000 --- a/resources/mappings/people_overrides.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "heathcliff (fictitious character)": "Heathcliff", - "heathcliff (fictitious character : brontë)": "Heathcliff", - "heathcliff (fictitious character : bronte)": "Heathcliff", - "catherine earnshawm (fictitious character)": "Catherine Earnshaw", - "harry potter (fictitious character)": "Harry Potter", - "hermione granger (fictitious character)": "Hermione Granger", - "ron weasley (fictitious character)": "Ron Weasley", - "beuve de hanstone (legendary character)": "Beuve de Hanstone" -} diff --git a/resources/mappings/places_overrides.json b/resources/mappings/places_overrides.json deleted file mode 100644 index 145e981..0000000 --- a/resources/mappings/places_overrides.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "yorkshire (england)": "Yorkshire", - "yorkshire (england) -- fiction": "Yorkshire", - "england, fiction": "England", - "england -- fiction": "England", - "london (england)": "London", - "new york (n.y.)": "New York City", - "new york (state)": "New York State", - "united states": "United States", - "great britain": "Great Britain", - "france": "France", - "paris (france)": "Paris" -} diff --git a/resources/mappings/subgenres.json b/resources/mappings/subgenres.json deleted file mode 100644 index 3c271e0..0000000 --- a/resources/mappings/subgenres.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "apocalyptic fiction": "Apocalyptic", - "apocalyptic": "Apocalyptic", - "biopunk": "Biopunk", - "climate fiction": "Cli-fi", - "cli-fi": "Cli-fi", - "bildungsroman": "Coming of Age", - "coming-of-age": "Coming of Age", - "coming of age": "Coming of Age", - "coming of age fiction": "Coming of Age", - "cyberpunk": "Cyberpunk", - "detective fiction": "Detective", - "detective stories": "Detective", - "dystopia": "Dystopian", - "dystopian": "Dystopian", - "dystopian fiction": "Dystopian", - "epistolary fiction": "Epistolary", - "epistolary": "Epistolary", - "epistolary novel": "Epistolary", - "epic": "Epic", - "epic fantasy": "Epic", - "espionage": "Espionage", - "spy fiction": "Espionage", - "spy stories": "Espionage", - "family saga": "Family Saga", - "family sagas": "Family Saga", - "saga": "Family Saga", - "futurism": "Futurism", - "futuristic fiction": "Futurism", - "gonzo": "Gonzo", - "gonzo journalism": "Gonzo", - "gothic": "Gothic", - "gothic fiction": "Gothic", - "gothic horror": "Gothic", - "gothic romance": "Gothic", - "english gothic fiction": "Gothic", - "british gothic": "Gothic", - "southern gothic": "Gothic", - "historical fiction": "Historical", - "fiction, historical": "Historical", - "locked room mystery": "Locked Room", - "locked-room": "Locked Room", - "melodrama": "Melodrama", - "picaresque": "Picaresque", - "picaresque novel": "Picaresque", - "post-apocalyptic": "Post-Apocalyptic", - "post-apocalyptic fiction": "Post-Apocalyptic", - "post apocalyptic": "Post-Apocalyptic", - "psychological": "Psychological", - "psychological fiction": "Psychological", - "psychological thriller": "Psychological", - "psychological horror": "Psychological", - "fiction, psychological": "Psychological", - "space opera": "Space Opera", - "steampunk": "Steampunk", - "true crime": "True Crime", - "utopian": "Utopian", - "utopian fiction": "Utopian", - "utopia": "Utopian" -} diff --git a/rule_packs/__init__.py b/rule_packs/__init__.py index 4dd2638..436daac 100644 --- a/rule_packs/__init__.py +++ b/rule_packs/__init__.py @@ -1,54 +1,18 @@ """Concrete rule-pack modules.""" -from .audience import AudiencePack from .content_formats import ContentFormatsPack -from .genres import GenresPack -from .literary_form import LiteraryFormPack -from .literary_themes import LiteraryThemesPack -from .literary_tropes import LiteraryTropesPack -from .main_topics import MainTopicsPack -from .moods import MoodsPack -from .people import PeoplePack -from .places import PlacesPack -from .subgenres import SubgenresPack from .subject_diagnostics import SubjectDiagnosticsPack -from .times import TimesPack -SUBJECT_PACK_CLASSES = ( - LiteraryFormPack, - AudiencePack, - GenresPack, - SubgenresPack, - ContentFormatsPack, - MoodsPack, - LiteraryThemesPack, - LiteraryTropesPack, - MainTopicsPack, -) +SUBJECT_PACK_CLASSES = (ContentFormatsPack,) -FIELD_PACK_CLASSES = ( - PeoplePack, - PlacesPack, - TimesPack, -) +FIELD_PACK_CLASSES = () ALL_PACK_CLASSES = SUBJECT_PACK_CLASSES + FIELD_PACK_CLASSES __all__ = [ "ALL_PACK_CLASSES", - "AudiencePack", "ContentFormatsPack", "FIELD_PACK_CLASSES", - "GenresPack", - "LiteraryFormPack", - "LiteraryThemesPack", - "LiteraryTropesPack", - "MainTopicsPack", - "MoodsPack", - "PeoplePack", - "PlacesPack", "SUBJECT_PACK_CLASSES", - "SubgenresPack", "SubjectDiagnosticsPack", - "TimesPack", ] diff --git a/rule_packs/audience.py b/rule_packs/audience.py deleted file mode 100644 index 8f59146..0000000 --- a/rule_packs/audience.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Rule pack for audience tags.""" - -from __future__ import annotations - -from collections.abc import Mapping - -from core.json_loader import load_mapping -from rule_packs.utils import SubjectPack -from rules import MappingRule, PrefixRule - - -class AudiencePack(SubjectPack): - name = "audience" - output_types = ("audience",) - output_type = "audience" - - def __init__( - self, - mapping: Mapping[str, str] | None = None, - remove_matched_subjects: bool = True, - ) -> None: - self.rules = (PrefixRule("audience"), MappingRule(mapping)) - self.remove_matched_subjects = remove_matched_subjects - - @classmethod - def default(cls) -> "AudiencePack": - return cls( - mapping=load_mapping("audience"), - remove_matched_subjects=True, - ) diff --git a/rule_packs/genres.py b/rule_packs/genres.py deleted file mode 100644 index 1812744..0000000 --- a/rule_packs/genres.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Rule pack for genre tags.""" - -from __future__ import annotations - -from collections.abc import Mapping - -from core.json_loader import load_mapping -from rule_packs.utils import SubjectPack -from rules import MappingRule, PrefixRule - - -class GenresPack(SubjectPack): - name = "genres" - output_types = ("genres",) - output_type = "genres" - - def __init__( - self, - mapping: Mapping[str, str] | None = None, - remove_matched_subjects: bool = True, - ) -> None: - self.rules = (PrefixRule("genre"), MappingRule(mapping)) - self.remove_matched_subjects = remove_matched_subjects - - @classmethod - def default(cls) -> "GenresPack": - return cls( - mapping=load_mapping("genres"), - remove_matched_subjects=True, - ) diff --git a/rule_packs/literary_form.py b/rule_packs/literary_form.py deleted file mode 100644 index 8c02391..0000000 --- a/rule_packs/literary_form.py +++ /dev/null @@ -1,20 +0,0 @@ -"""Rule pack for literary_form.""" - -from __future__ import annotations - -from rule_packs.utils import SubjectPack -from rules import PrefixRule - - -class LiteraryFormPack(SubjectPack): - name = "literary_form" - output_types = ("literary_form",) - output_type = "literary_form" - - def __init__(self, remove_matched_subjects: bool = True) -> None: - self.rules = (PrefixRule("form"),) - self.remove_matched_subjects = remove_matched_subjects - - @classmethod - def default(cls) -> "LiteraryFormPack": - return cls(remove_matched_subjects=True) diff --git a/rule_packs/literary_themes.py b/rule_packs/literary_themes.py deleted file mode 100644 index 7ccb6bb..0000000 --- a/rule_packs/literary_themes.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Rule pack for literary_themes tags.""" - -from __future__ import annotations - -from collections.abc import Mapping - -from core.json_loader import load_mapping -from rule_packs.utils import SubjectPack -from rules import MappingRule, PrefixRule - - -class LiteraryThemesPack(SubjectPack): - name = "literary_themes" - output_types = ("literary_themes",) - output_type = "literary_themes" - - def __init__( - self, - mapping: Mapping[str, str] | None = None, - remove_matched_subjects: bool = True, - ) -> None: - self.rules = (PrefixRule("theme"), MappingRule(mapping)) - self.remove_matched_subjects = remove_matched_subjects - - @classmethod - def default(cls) -> "LiteraryThemesPack": - return cls( - mapping=load_mapping("literary_themes"), - remove_matched_subjects=True, - ) diff --git a/rule_packs/literary_tropes.py b/rule_packs/literary_tropes.py deleted file mode 100644 index 9a18f81..0000000 --- a/rule_packs/literary_tropes.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Rule pack for literary_tropes tags.""" - -from __future__ import annotations - -from collections.abc import Mapping - -from core.json_loader import load_mapping -from rule_packs.utils import SubjectPack -from rules import MappingRule, PrefixRule - - -class LiteraryTropesPack(SubjectPack): - name = "literary_tropes" - output_types = ("literary_tropes",) - output_type = "literary_tropes" - - def __init__( - self, - mapping: Mapping[str, str] | None = None, - remove_matched_subjects: bool = True, - ) -> None: - self.rules = (PrefixRule("trope"), MappingRule(mapping)) - self.remove_matched_subjects = remove_matched_subjects - - @classmethod - def default(cls) -> "LiteraryTropesPack": - return cls( - mapping=load_mapping("literary_tropes"), - remove_matched_subjects=True, - ) diff --git a/rule_packs/main_topics.py b/rule_packs/main_topics.py deleted file mode 100644 index 68d6320..0000000 --- a/rule_packs/main_topics.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Rule pack for main_topics tags.""" - -from __future__ import annotations - -from collections.abc import Mapping - -from core.json_loader import load_mapping -from rule_packs.utils import SubjectPack -from rules import MappingRule, PrefixRule - - -class MainTopicsPack(SubjectPack): - name = "main_topics" - output_types = ("main_topics",) - output_type = "main_topics" - - def __init__( - self, - mapping: Mapping[str, str] | None = None, - remove_matched_subjects: bool = True, - ) -> None: - self.rules = (PrefixRule("topic"), MappingRule(mapping)) - self.remove_matched_subjects = remove_matched_subjects - - @classmethod - def default(cls) -> "MainTopicsPack": - return cls( - mapping=load_mapping("main_topics"), - remove_matched_subjects=True, - ) diff --git a/rule_packs/moods.py b/rule_packs/moods.py deleted file mode 100644 index d6a2731..0000000 --- a/rule_packs/moods.py +++ /dev/null @@ -1,20 +0,0 @@ -"""Rule pack for moods tags.""" - -from __future__ import annotations - -from rule_packs.utils import SubjectPack -from rules import PrefixRule - - -class MoodsPack(SubjectPack): - name = "moods" - output_types = ("moods",) - output_type = "moods" - - def __init__(self, remove_matched_subjects: bool = True) -> None: - self.rules = (PrefixRule("mood"),) - self.remove_matched_subjects = remove_matched_subjects - - @classmethod - def default(cls) -> "MoodsPack": - return cls(remove_matched_subjects=True) diff --git a/rule_packs/people.py b/rule_packs/people.py deleted file mode 100644 index 2ad387e..0000000 --- a/rule_packs/people.py +++ /dev/null @@ -1,28 +0,0 @@ -"""Rule pack for subject_people.""" - -from __future__ import annotations - -from collections.abc import Mapping - -from core.json_loader import load_mapping -from core.run_state import RunState -from rule_engine.base import RulePack -from rules import OverrideRule - - -class PeoplePack(RulePack): - name = "people" - output_types = ("people",) - - def __init__(self, overrides: Mapping[str, str] | None = None) -> None: - self.rule = OverrideRule(overrides) - - def apply(self, state: RunState) -> None: - for raw in state.work.get("subject_people", []): - value = self.rule.apply(raw) - if value is not None: - state.add("people", value) - - @classmethod - def default(cls) -> "PeoplePack": - return cls(overrides=load_mapping("people_overrides")) diff --git a/rule_packs/places.py b/rule_packs/places.py deleted file mode 100644 index 1757eba..0000000 --- a/rule_packs/places.py +++ /dev/null @@ -1,28 +0,0 @@ -"""Rule pack for subject_places.""" - -from __future__ import annotations - -from collections.abc import Mapping - -from core.json_loader import load_mapping -from core.run_state import RunState -from rule_engine.base import RulePack -from rules import OverrideRule - - -class PlacesPack(RulePack): - name = "places" - output_types = ("places",) - - def __init__(self, overrides: Mapping[str, str] | None = None) -> None: - self.rule = OverrideRule(overrides) - - def apply(self, state: RunState) -> None: - for raw in state.work.get("subject_places", []): - value = self.rule.apply(raw) - if value is not None: - state.add("places", value) - - @classmethod - def default(cls) -> "PlacesPack": - return cls(overrides=load_mapping("places_overrides")) diff --git a/rule_packs/subgenres.py b/rule_packs/subgenres.py deleted file mode 100644 index 97fb428..0000000 --- a/rule_packs/subgenres.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Rule pack for subgenre tags.""" - -from __future__ import annotations - -from collections.abc import Mapping - -from core.json_loader import load_mapping -from rule_packs.utils import SubjectPack -from rules import MappingRule, PrefixRule - - -class SubgenresPack(SubjectPack): - name = "subgenres" - output_types = ("subgenres",) - output_type = "subgenres" - - def __init__( - self, - mapping: Mapping[str, str] | None = None, - remove_matched_subjects: bool = True, - ) -> None: - self.rules = (PrefixRule("subgenre"), MappingRule(mapping)) - self.remove_matched_subjects = remove_matched_subjects - - @classmethod - def default(cls) -> "SubgenresPack": - return cls( - mapping=load_mapping("subgenres"), - remove_matched_subjects=True, - ) diff --git a/rule_packs/times.py b/rule_packs/times.py deleted file mode 100644 index 6808ca8..0000000 --- a/rule_packs/times.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Rule pack for subject_times.""" - -from __future__ import annotations - -from core.run_state import RunState -from rule_engine.base import RulePack -from rules import PassthroughRule - - -class TimesPack(RulePack): - name = "times" - output_types = ("times",) - - def __init__(self) -> None: - self.rule = PassthroughRule() - - def apply(self, state: RunState) -> None: - for raw in state.work.get("subject_times", []): - value = self.rule.apply(raw) - if value is not None: - state.add("times", value) - - @classmethod - def default(cls) -> "TimesPack": - return cls() diff --git a/rules/__init__.py b/rules/__init__.py index ed93b0e..e597b50 100644 --- a/rules/__init__.py +++ b/rules/__init__.py @@ -2,14 +2,10 @@ from .match_result import RuleMatch from .mapping_rule import MappingRule -from .override_rule import OverrideRule -from .passthrough_rule import PassthroughRule from .prefix_rule import PrefixRule __all__ = [ "MappingRule", - "OverrideRule", - "PassthroughRule", "PrefixRule", "RuleMatch", ] diff --git a/rules/override_rule.py b/rules/override_rule.py deleted file mode 100644 index 0fe74c1..0000000 --- a/rules/override_rule.py +++ /dev/null @@ -1,20 +0,0 @@ -"""Override-based normalization for field values.""" - -from __future__ import annotations - -from collections.abc import Mapping - -from rule_engine.normalization import normalize - - -class OverrideRule: - """Normalize a field value using overrides with raw fallback.""" - - def __init__(self, overrides: Mapping[str, str] | None = None) -> None: - self.overrides = dict(overrides or {}) - - def apply(self, raw: str) -> str | None: - cleaned = raw.strip() - if not cleaned: - return None - return self.overrides.get(normalize(raw), cleaned) diff --git a/rules/passthrough_rule.py b/rules/passthrough_rule.py deleted file mode 100644 index 5dbb88b..0000000 --- a/rules/passthrough_rule.py +++ /dev/null @@ -1,11 +0,0 @@ -"""Passthrough normalization for field values.""" - -from __future__ import annotations - - -class PassthroughRule: - """Return cleaned field values without additional transformation.""" - - def apply(self, raw: str) -> str | None: - cleaned = raw.strip() - return cleaned or None diff --git a/scripts/README.md b/scripts/README.md index 5ac204c..98f726e 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,12 +1,22 @@ # scripts -Tools for migrating Open Library's legacy subject strings to canonical typed tags. +Tools for running the current subject-migration dry runs. --- ## Overview -Open Library works currently have a flat `subjects` list (plus `subject_people`, `subject_places`, `subject_times`) containing a mix of genres, themes, tropes, catalog codes, reading levels, and noise. These scripts help convert that legacy data into structured, typed canonical tags. +The current migration scope is intentionally narrow: + +- `content_formats` is the only actively developed type-specific migration pack +- `subject_diagnostics` is kept as a minimal QA/support pack + +The goal of the script is to run subject-driven migration proposals against Open Library work JSON and show: + +- which structured tags would be proposed +- which legacy subjects would be removed +- which legacy subjects would remain +- which subjects matched with `move` vs `extract_only` --- @@ -14,12 +24,11 @@ Open Library works currently have a flat `subjects` list (plus `subject_people`, ### `migrate_subjects.py` -The current runner/compatibility entry point. Given a work's OL JSON, it: +The current runner. Given a work's OL JSON, it: 1. Loads the legacy `subjects`, `subject_people`, `subject_places`, and `subject_times` lists -2. Builds a `SubjectClassifier` from one or more enabled rule packs -3. Applies rule-based and keyword matching to classify each string into the correct canonical type -3. Outputs a structured tag object ready for import into the new schema +2. Applies the currently enabled subject packs +3. Outputs a proposal-style run report for review **Usage:** ```bash @@ -29,122 +38,110 @@ python scripts/migrate_subjects.py --work OL82563W # From a local JSON file python scripts/migrate_subjects.py --file work.json -# Legacy-compatible fixed-order wrapper -./scripts/run_legacy_subjects.sh --file work.json - # Batch from a newline-delimited list of OL IDs python scripts/migrate_subjects.py --batch ol_ids.txt --output output/ # Dry run (print proposed mappings without writing) python scripts/migrate_subjects.py --work OL82563W --dry-run -# Run the old full sequence explicitly through the wrapper -./scripts/run_legacy_subjects.sh --file work.json --dry-run - -# Run only a subset of rule packs -python scripts/migrate_subjects.py --file work.json --pack genres --pack content_formats --pack subject_diagnostics --dry-run - -# Run a single tag-type module +# Run only content_formats python scripts/migrate_subjects.py --file work.json --pack content_formats --dry-run + +# Run content_formats plus diagnostics +python scripts/migrate_subjects.py --file work.json --pack subject_mappings --dry-run ``` -`migrate_subjects.py` no longer enables a default full preset when `--pack` is omitted. If you want the old full sequence, use `run_legacy_subjects.sh` or pass the pack list explicitly. +Available packs: -`run_legacy_subjects.sh` is just a thin wrapper around `migrate_subjects.py` with the pack order written out explicitly, so it is easy to inspect and change. Any extra CLI args are forwarded as-is. +- `content_formats` +- `subject_diagnostics` +- `subject_mappings` (preset for both) **Output format:** ```json { "work_id": "OL82563W", - "literary_form": ["Fiction"], - "genres": ["Tragedy", "Gothic", "Romance"], - "subgenres": ["Psychological", "Historical"], - "content_formats": ["Novel"], - "moods": [], - "literary_themes": ["Love", "Revenge", "Death"], - "literary_tropes": ["Foundlings", "Love Triangles"], - "main_topics": ["Interpersonal relations", "Family life", "Class"], - "sub_topics": ["Country life", "Rural families", "Landscape"], - "people": ["Heathcliff", "Catherine Earnshaw"], - "places": ["Yorkshire", "England"], - "times": [], - "things": [], - "unmapped": ["Pr4172 .w7 2009c", "823/.8", "Zhang pian xiao shuo"] + "proposed_tags": { + "content_formats": ["Memoir", "Biography"], + "reading_level": ["Grade 4"], + "unmapped": ["abc"] + }, + "subject_proposal": { + "original": ["Memoirs", "Biography", "abc", "Grade 4"], + "removed": ["Memoirs"], + "remaining": ["Biography", "abc", "Grade 4"] + }, + "subject_matches": [ + { + "subject": "Memoirs", + "output_type": "content_formats", + "value": "Memoir", + "action": "move" + }, + { + "subject": "Biography", + "output_type": "content_formats", + "value": "Biography", + "action": "extract_only" + } + ] } ``` -The `unmapped` field collects strings that couldn't be classified — these are candidates for manual review or the `other` / droppable bucket. +This report is meant for dry-run review and QA, not as a final persisted work format. --- ### Architecture -The reusable classification core now lives outside the script entry point: +The current implementation is intentionally small and only supports the present migration scope: ```text core/ - json_loader.py # JSON resource loading for default assembly - subject_classifier.py # public work-level orchestration core - pack_registry.py # stable pack names -> factories / presets - classifier_assembler.py # pack resolution + classifier assembly - migrate_subject_classifier.py # compatibility shim for older imports + json_loader.py # JSON resource loading + run_state.py # shared run/proposal state + subject_classifier.py # work-level orchestration + report output rule_engine/ - base.py # RulePack interface - normalization.py # shared text normalization helpers + base.py # RulePack interface + normalization.py # shared text normalization helpers rules/ - prefix_rule.py # subject prefix matching - mapping_rule.py # normalized direct mapping - override_rule.py # override-based field normalization - passthrough_rule.py # cleaned passthrough fields + match_result.py # structured value + action matches + mapping_rule.py # normalized mapping matches + prefix_rule.py # prefix-based matches rule_packs/ - genres.py # one module per tag type - content_formats.py - audience.py - literary_themes.py - literary_tropes.py - main_topics.py - people.py - places.py - times.py -config/ - packs/ # future static pack configs + content_formats.py # current migration logic under active development + subject_diagnostics.py # minimal QA/support pack + utils.py # shared subject-pack execution helper ``` -`scripts/migrate_subjects.py` remains the operational entry point, but classification logic is now encapsulated in the shared core so future runners can reuse it. - -The classification core itself is kept narrow: `SubjectClassifier` consumes a normalized `work` object plus already-constructed packs, and returns a result. JSON resource loading now lives in the default assembly layer rather than inside individual packs. +`scripts/migrate_subjects.py` remains the operational entry point and keeps the pack selection local to the script. ### Adding Mapping Rules -Mappings live in `resources/mappings/`. Each file covers one tag type: +Mappings live in `resources/mappings/`. ``` resources/ mappings/ - genres.json # legacy string → canonical genre - subgenres.json # legacy string → canonical subgenre content_formats.json # legacy string → canonical format - literary_themes.json # legacy string → canonical theme - literary_tropes.json # legacy string → canonical trope droppable.json # strings to discard (reading levels, codes, etc.) - people_overrides.json # OL people string → canonical name - places_overrides.json # OL place string → canonical place ``` -Each mapping file is a JSON object where keys are legacy strings (lowercase, stripped) and values are the canonical tag: +`content_formats.json` is a JSON object where keys are legacy subject strings and values are canonical content format tags: ```json { - "historical fiction": "Historical", - "fiction, historical": "Historical", - "psychological fiction": "Psychological", - "gothic fiction": "Gothic", - "english gothic fiction": "Gothic" + "memoirs": "Memoir", + "biography": "Biography", + "letters": "Letters", + "novels": "Novel" } ``` +`ContentFormatsPack` then splits those mappings into: ---- +- `move` cases for currently clean first-pass formats +- `extract_only` cases for overlapping or not-yet-approved removals ## Development @@ -160,5 +157,3 @@ Requirements: `requests`, `tqdm` (for batch progress) ## Data Sources - OL Work JSON: `https://openlibrary.org/works/{OL_ID}.json` -- OL Search API: `https://openlibrary.org/search.json` -- Tag objects: `https://openlibrary.org/tags/{TAG_ID}.json` diff --git a/scripts/migrate_subjects.py b/scripts/migrate_subjects.py index fe64208..b858b3a 100644 --- a/scripts/migrate_subjects.py +++ b/scripts/migrate_subjects.py @@ -17,13 +17,15 @@ import os import sys from pathlib import Path +from typing import Callable REPO_ROOT = Path(__file__).resolve().parent.parent if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) -from core.classifier_assembler import build_subject_classifier -from core.pack_registry import AVAILABLE_PACK_NAMES +from core.subject_classifier import SubjectClassifier +from rule_packs.content_formats import ContentFormatsPack +from rule_packs.subject_diagnostics import SubjectDiagnosticsPack # --------------------------------------------------------------------------- # Paths @@ -31,6 +33,39 @@ OL_WORK_URL = "https://openlibrary.org/works/{work_id}.json" +PackFactory = Callable[[], object] +PACK_PRESETS: dict[str, tuple[str, ...]] = { + "subject_mappings": ("content_formats", "subject_diagnostics"), +} +PACK_FACTORIES: dict[str, PackFactory] = { + "content_formats": ContentFormatsPack.default, + "subject_diagnostics": SubjectDiagnosticsPack.default, +} +AVAILABLE_PACK_NAMES = tuple(sorted({*PACK_FACTORIES, *PACK_PRESETS})) + + +def resolve_pack_names(enabled_packs: list[str] | None) -> list[str]: + selected = list(enabled_packs or []) + expanded: list[str] = [] + for name in selected: + if name in PACK_PRESETS: + expanded.extend(PACK_PRESETS[name]) + continue + expanded.append(name) + return expanded + + +def build_subject_classifier(enabled_packs: list[str] | None = None) -> SubjectClassifier: + selected = resolve_pack_names(enabled_packs) + missing = [name for name in selected if name not in PACK_FACTORIES] + if missing: + available = ", ".join(AVAILABLE_PACK_NAMES) + missing_display = ", ".join(sorted(missing)) + raise ValueError( + f"Unknown rule pack(s): {missing_display}. Available: {available}" + ) + return SubjectClassifier(rule_packs=[PACK_FACTORIES[name]() for name in selected]) + # --------------------------------------------------------------------------- # Fetching diff --git a/scripts/run_legacy_subjects.sh b/scripts/run_legacy_subjects.sh deleted file mode 100755 index 829baa4..0000000 --- a/scripts/run_legacy_subjects.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" - -python3 "${REPO_ROOT}/scripts/migrate_subjects.py" \ - --pack literary_form \ - --pack audience \ - --pack genres \ - --pack subgenres \ - --pack content_formats \ - --pack moods \ - --pack literary_themes \ - --pack literary_tropes \ - --pack main_topics \ - --pack subject_diagnostics \ - --pack people \ - --pack places \ - --pack times \ - "$@" From d5b7c2922fe6376f3d2e108986275779e2845534 Mon Sep 17 00:00:00 2001 From: Kaftow <166228791+Kaftow@users.noreply.github.com> Date: Wed, 22 Apr 2026 02:07:23 +0900 Subject: [PATCH 9/9] Minimize rule_pack package --- demo_content_formats.json | 20 ++++++++++ rule_packs/content_formats.py | 5 +-- rule_packs/{utils.py => subject_migration.py} | 38 +++++++------------ 3 files changed, 35 insertions(+), 28 deletions(-) create mode 100644 demo_content_formats.json rename rule_packs/{utils.py => subject_migration.py} (52%) diff --git a/demo_content_formats.json b/demo_content_formats.json new file mode 100644 index 0000000..0155f48 --- /dev/null +++ b/demo_content_formats.json @@ -0,0 +1,20 @@ +{ + "key": "/works/OLDEMO1W", + "subjects": [ + "Memoirs", + "Anthology", + "Letters", + "Dictionary", + "Biography", + "Autobiography", + "Manga", + "Encyclopedia", + "Novel", + "format:Diary", + "abc", + "Grade 4" + ], + "subject_people": [], + "subject_places": [], + "subject_times": [] +} diff --git a/rule_packs/content_formats.py b/rule_packs/content_formats.py index e4d271d..6131bbb 100644 --- a/rule_packs/content_formats.py +++ b/rule_packs/content_formats.py @@ -5,7 +5,7 @@ from collections.abc import Mapping from core.json_loader import load_mapping -from rule_packs.utils import SubjectPack +from rule_packs.subject_migration import SubjectMigrationPack from rules import MappingRule, PrefixRule MOVE = "move" @@ -22,7 +22,7 @@ ) -class ContentFormatsPack(SubjectPack): +class ContentFormatsPack(SubjectMigrationPack): name = "content_formats" output_types = ("content_formats",) output_type = "content_formats" @@ -37,7 +37,6 @@ def __init__( MappingRule(move_mapping, default_action=MOVE), MappingRule(extract_only_mapping, default_action=EXTRACT_ONLY), ) - self.remove_matched_subjects = False @classmethod def default(cls) -> "ContentFormatsPack": diff --git a/rule_packs/utils.py b/rule_packs/subject_migration.py similarity index 52% rename from rule_packs/utils.py rename to rule_packs/subject_migration.py index 00943b1..7b4aa69 100644 --- a/rule_packs/utils.py +++ b/rule_packs/subject_migration.py @@ -1,4 +1,4 @@ -"""Shared helpers for subject-based packs.""" +"""Shared helpers for subject-driven migration packs.""" from __future__ import annotations @@ -10,38 +10,26 @@ from rules import RuleMatch -class SubjectValueRule(Protocol): - def match(self, raw: str) -> RuleMatch | str | None: ... +class SubjectMatchRule(Protocol): + def match(self, raw: str) -> RuleMatch | None: ... -def _coerce_match(match: RuleMatch | str, default_action: str) -> RuleMatch: - if isinstance(match, RuleMatch): - return match - return RuleMatch(value=match, action=default_action) - - -def classify_subject_value( - raw: str, - rules: Iterable[SubjectValueRule], - default_action: str, -) -> RuleMatch | None: +def first_match(raw: str, rules: Iterable[SubjectMatchRule]) -> RuleMatch | None: for rule in rules: match = rule.match(raw) if match is not None: - return _coerce_match(match, default_action) + return match return None -def apply_subject_pack( +def apply_subject_migration( state: RunState, output_type: str, - rules: Iterable[SubjectValueRule], - remove_matched_subjects: bool, + rules: Iterable[SubjectMatchRule], ) -> None: - default_action = "move" if remove_matched_subjects else "extract_only" next_subjects: list[str] = [] for raw in state.remaining_subjects: - match = classify_subject_value(raw, rules, default_action=default_action) + match = first_match(raw, rules) if match is None: next_subjects.append(raw) continue @@ -60,15 +48,15 @@ def apply_subject_pack( state.remaining_subjects = next_subjects -class SubjectPack(RulePack): - """Small helper for packs that operate on the shared subject sequence.""" +class SubjectMigrationPack(RulePack): + """Base class for packs that migrate legacy subjects into structured tags.""" - output_type = "" + output_type: str = "" + rules: Iterable[SubjectMatchRule] def apply(self, state: RunState) -> None: - apply_subject_pack( + apply_subject_migration( state, output_type=self.output_type, rules=self.rules, - remove_matched_subjects=self.remove_matched_subjects, )