Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions src/voxkit/storage/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,45 @@ def export_dataset(dataset_id: str, output_root: Path) -> Tuple[bool, str]:
return False, f"Failed to export dataset: {str(e)}"


def _rewrite_imported_alignments(new_dataset_path: Path) -> None:
    """Rewrite alignment metadata paths after importing a dataset to a new location.

    When a dataset is imported, its directory is copied to a new location under a
    new dataset id. Any ``local`` alignment has a ``tg_path`` that lives inside
    the dataset directory and still references the source location. For each such
    alignment, rewrite ``tg_path`` to ``<new_dataset>/alignments/<alignment_id>/
    textgrids``. Non-local alignments (``local == False``) store TextGrids at the
    dataset's ``original_path``, which is unchanged by import, so they are left
    alone.

    The rewrite is best-effort: an alignment whose metadata file cannot be read,
    decoded, parsed, or written is reported via ``print`` and skipped, so one
    damaged alignment never aborts the rewrite of the remaining ones.

    Args:
        new_dataset_path: Root directory of the freshly imported dataset.
    """
    alignments_dir = new_dataset_path / ALIGNMENTS_ROOT
    if not alignments_dir.is_dir():
        return

    for alignment_dir in alignments_dir.iterdir():
        if not alignment_dir.is_dir():
            continue
        metadata_file = alignment_dir / "voxkit_alignment.json"
        if not metadata_file.exists():
            continue
        try:
            with open(metadata_file, "r") as f:
                alignment_metadata = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError as well as the
            # UnicodeDecodeError raised for files with undecodable bytes.
            print(f"Skipping alignment metadata rewrite for '{metadata_file}': {e}")
            continue

        if not isinstance(alignment_metadata, dict):
            # Valid JSON that is not an object (list, string, number, null)
            # has no "local"/"tg_path" keys; calling .get() on it would raise.
            print(
                f"Skipping alignment metadata rewrite for '{metadata_file}': "
                "expected a JSON object"
            )
            continue

        if not alignment_metadata.get("local"):
            continue

        alignment_metadata["tg_path"] = str(alignment_dir / "textgrids")
        try:
            with open(metadata_file, "w") as f:
                json.dump(alignment_metadata, f, indent=4)
        except OSError as e:
            print(f"Failed to rewrite alignment metadata '{metadata_file}': {e}")


def import_dataset(dataset_path: Path) -> Tuple[bool, str]:
"""Import an existing dataset into VoxKit storage.

Expand Down Expand Up @@ -489,6 +528,8 @@ def import_dataset(dataset_path: Path) -> Tuple[bool, str]:
with open(metadata_path, "w") as f:
json.dump(dataset_metadata, f, indent=2)

_rewrite_imported_alignments(dataset_dest)

return True, "Dataset imported successfully."

except Exception as e:
Expand Down
68 changes: 68 additions & 0 deletions tests/storage/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,74 @@ def test_import_dataset_success(self, monkeypatch):
assert imp_success is True
assert "imported successfully" in imp_message

def test_import_dataset_rewrites_alignment_paths(self, monkeypatch):
    """Importing a dataset must repoint a local alignment's ``tg_path``.

    Creates a dataset, attaches a hand-written ``local`` alignment whose
    ``tg_path`` points inside the source dataset directory, exports the
    dataset, re-imports the export, and checks that the imported copy's
    alignment metadata references the new dataset directory instead of the
    source one.
    """
    import json

    from voxkit.storage import datasets
    from voxkit.storage.datasets import (
        _get_datasets_root,
        create_dataset,
        export_dataset,
        import_dataset,
    )

    monkeypatch.setattr(datasets, "get_storage_root", mock_get_storage_root)

    success, message = create_dataset(
        name="dataset_align_import",
        description="Testing alignment path rewrite on import",
        original_path=valid_dataset_path,
        cached=True,
        anonymize=False,
        transcribed=True,
    )
    assert success is True
    assert isinstance(message, dict)
    source_id = message["id"]

    # Attach a local alignment to the source dataset by hand.
    source_root = _get_datasets_root() / source_id
    alignment_id = "test_alignment_001"
    alignment_dir = source_root / "alignments" / alignment_id
    tg_dir = alignment_dir / "textgrids"
    tg_dir.mkdir(parents=True, exist_ok=True)

    alignment_metadata = {
        "id": alignment_id,
        "engine_id": "mfa",
        "model_metadata": {},
        "local": True,
        "alignment_date": "2026-04-14T00:00:00",
        "status": "completed",
        "tg_path": str(tg_dir),
    }
    alignment_json = alignment_dir / "voxkit_alignment.json"
    with open(alignment_json, "w") as f:
        json.dump(alignment_metadata, f)

    export_path = mock_get_storage_root()
    # Check the export result so a failed export is reported here rather
    # than surfacing later as a misleading import failure.
    exp_success, exp_message = export_dataset(source_id, export_path)
    assert exp_success is True, exp_message

    exported_dir = export_path / Path(message["name"] + "_" + str(source_id))
    imp_success, _ = import_dataset(exported_dir)
    assert imp_success is True

    # Find the newly imported dataset (id differs from source_id).
    imported_ids = [
        p.name for p in _get_datasets_root().iterdir() if p.is_dir() and p.name != source_id
    ]
    assert len(imported_ids) == 1
    new_id = imported_ids[0]
    new_root = _get_datasets_root() / new_id

    new_alignment_json = new_root / "alignments" / alignment_id / "voxkit_alignment.json"
    assert new_alignment_json.exists()
    with open(new_alignment_json, "r") as f:
        rewritten = json.load(f)

    expected_tg_path = str(new_root / "alignments" / alignment_id / "textgrids")
    assert rewritten["tg_path"] == expected_tg_path
    assert str(source_root) not in rewritten["tg_path"]

def test_import_dataset_nonexistent(self, monkeypatch):
from voxkit.storage import datasets
from voxkit.storage.datasets import import_dataset
Expand Down
Loading