From f15d4df8e656ce280413d1285521d73661e667fb Mon Sep 17 00:00:00 2001
From: Beckett Frey
Date: Tue, 14 Apr 2026 17:07:38 -0500
Subject: [PATCH] Fix problem with dedicated internal function

---
 src/voxkit/storage/datasets.py | 41 ++++++++++++++++++++
 tests/storage/test_datasets.py | 68 ++++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+)

diff --git a/src/voxkit/storage/datasets.py b/src/voxkit/storage/datasets.py
index 88a619b..5ec0cad 100644
--- a/src/voxkit/storage/datasets.py
+++ b/src/voxkit/storage/datasets.py
@@ -419,6 +419,45 @@ def export_dataset(dataset_id: str, output_root: Path) -> Tuple[bool, str]:
         return False, f"Failed to export dataset: {str(e)}"
 
 
+def _rewrite_imported_alignments(new_dataset_path: Path) -> None:
+    """Rewrite alignment metadata paths after importing a dataset to a new location.
+
+    When a dataset is imported, its directory is copied to a new location under a new
+    dataset id. Any ``local`` alignment has a ``tg_path`` that lives inside the dataset
+    directory and still references the source location. For each such alignment, rewrite
+    ``tg_path`` to ``{new_dataset_path}/alignments/{alignment_id}/textgrids``. Non-local
+    alignments (``local == False``) store TextGrids at the dataset's ``original_path``,
+    which is unchanged by import, so they are left alone. Metadata files that cannot be
+    read, parsed, or rewritten are reported on stdout and skipped (best effort).
+    """
+    alignments_dir = new_dataset_path / ALIGNMENTS_ROOT
+    if not alignments_dir.is_dir():
+        return
+
+    for alignment_dir in alignments_dir.iterdir():
+        if not alignment_dir.is_dir():
+            continue
+        metadata_file = alignment_dir / "voxkit_alignment.json"
+        if not metadata_file.exists():
+            continue
+        try:
+            with open(metadata_file, "r", encoding="utf-8") as f:
+                alignment_metadata = json.load(f)
+        except (OSError, ValueError) as e:  # ValueError covers JSON and Unicode decode errors
+            print(f"Skipping alignment metadata rewrite for '{metadata_file}': {e}")
+            continue
+
+        if not isinstance(alignment_metadata, dict) or not alignment_metadata.get("local"):
+            continue
+
+        alignment_metadata["tg_path"] = str(alignment_dir / "textgrids")
+        try:
+            with open(metadata_file, "w", encoding="utf-8") as f:
+                json.dump(alignment_metadata, f, indent=4)
+        except OSError as e:
+            print(f"Failed to rewrite alignment metadata '{metadata_file}': {e}")
+
+
 def import_dataset(dataset_path: Path) -> Tuple[bool, str]:
     """Import an existing dataset into VoxKit storage.
 
@@ -489,6 +528,8 @@ def import_dataset(dataset_path: Path) -> Tuple[bool, str]:
         with open(metadata_path, "w") as f:
             json.dump(dataset_metadata, f, indent=2)
 
+        _rewrite_imported_alignments(dataset_dest)
+
         return True, "Dataset imported successfully."
 
     except Exception as e:
diff --git a/tests/storage/test_datasets.py b/tests/storage/test_datasets.py
index 1f5c621..850690b 100644
--- a/tests/storage/test_datasets.py
+++ b/tests/storage/test_datasets.py
@@ -463,6 +463,74 @@ def test_import_dataset_success(self, monkeypatch):
         assert imp_success is True
         assert "imported successfully" in imp_message
 
+    def test_import_dataset_rewrites_alignment_paths(self, monkeypatch):
+        import json
+
+        from voxkit.storage import datasets
+        from voxkit.storage.datasets import (
+            _get_datasets_root,
+            create_dataset,
+            export_dataset,
+            import_dataset,
+        )
+
+        monkeypatch.setattr(datasets, "get_storage_root", mock_get_storage_root)
+
+        success, message = create_dataset(
+            name="dataset_align_import",
+            description="Testing alignment path rewrite on import",
+            original_path=valid_dataset_path,
+            cached=True,
+            anonymize=False,
+            transcribed=True,
+        )
+        assert success is True
+        assert isinstance(message, dict)
+        source_id = message["id"]
+
+        source_root = _get_datasets_root() / source_id
+        alignment_id = "test_alignment_001"
+        alignment_dir = source_root / "alignments" / alignment_id
+        tg_dir = alignment_dir / "textgrids"
+        tg_dir.mkdir(parents=True, exist_ok=True)
+
+        alignment_metadata = {
+            "id": alignment_id,
+            "engine_id": "mfa",
+            "model_metadata": {},
+            "local": True,
+            "alignment_date": "2026-04-14T00:00:00",
+            "status": "completed",
+            "tg_path": str(tg_dir),
+        }
+        alignment_json = alignment_dir / "voxkit_alignment.json"
+        with open(alignment_json, "w") as f:
+            json.dump(alignment_metadata, f)
+
+        export_path = mock_get_storage_root()
+        exp_success, _ = export_dataset(source_id, export_path)
+        assert exp_success is True
+
+        exported_dir = export_path / Path(message["name"] + "_" + str(source_id))
+        imp_success, _ = import_dataset(exported_dir)
+        assert imp_success is True
+
+        # Find the newly imported dataset (id differs from source_id).
+        imported_ids = [
+            p.name for p in _get_datasets_root().iterdir() if p.is_dir() and p.name != source_id
+        ]
+        assert len(imported_ids) == 1
+        new_id = imported_ids[0]
+        new_root = _get_datasets_root() / new_id
+
+        new_alignment_json = new_root / "alignments" / alignment_id / "voxkit_alignment.json"
+        assert new_alignment_json.exists()
+        rewritten = json.loads(new_alignment_json.read_text(encoding="utf-8"))
+
+        expected_tg_path = str(new_root / "alignments" / alignment_id / "textgrids")
+        assert rewritten["tg_path"] == expected_tg_path
+        assert str(source_root) not in rewritten["tg_path"]
+
     def test_import_dataset_nonexistent(self, monkeypatch):
         from voxkit.storage import datasets
         from voxkit.storage.datasets import import_dataset