diff --git a/src/voxkit/storage/datasets.py b/src/voxkit/storage/datasets.py index 88a619b..57d91b8 100644 --- a/src/voxkit/storage/datasets.py +++ b/src/voxkit/storage/datasets.py @@ -510,6 +510,7 @@ def validate_dataset(dataset_path: Path) -> Tuple[bool, str]: - Each speaker directory contains audio files (.wav, .flac, .mp3, .ogg, .m4a) - Each speaker directory contains label files (.lab, .txt) - Number of audio files matches number of label files per speaker + - Every audio stem has a label file with the same stem, and vice versa Expected structure: @@ -562,15 +563,9 @@ def validate_dataset(dataset_path: Path) -> Tuple[bool, str]: audio_files = [ f for f in os.listdir(speaker_path) - if f.endswith(".wav") - or f.endswith(".flac") - or f.endswith(".mp3") - or f.endswith(".ogg") - or f.endswith(".m4a") - ] - label_files = [ - f for f in os.listdir(speaker_path) if f.endswith(".lab") or f.endswith(".txt") + if f.endswith((".wav", ".flac", ".mp3", ".ogg", ".m4a")) ] + label_files = [f for f in os.listdir(speaker_path) if f.endswith((".lab", ".txt"))] if not audio_files: return False, f"No audio files found in speaker directory '{speaker_path}'." @@ -585,4 +580,14 @@ def validate_dataset(dataset_path: Path) -> Tuple[bool, str]: f"directory '{speaker_path}'.", ) + audio_stems = {Path(f).stem for f in audio_files} + label_stems = {Path(f).stem for f in label_files} + unmatched = audio_stems.symmetric_difference(label_stems) + if unmatched: + return ( + False, + f"Unpaired audio/label files in speaker directory '{speaker_path}': " + f"{', '.join(sorted(unmatched))}.", + ) + return True, "Dataset is valid." 
diff --git a/tests/storage/test_datasets.py b/tests/storage/test_datasets.py index 1f5c621..00416af 100644 --- a/tests/storage/test_datasets.py +++ b/tests/storage/test_datasets.py @@ -813,3 +813,19 @@ def test_validate_dataset_mismatched_counts(self, monkeypatch): assert is_valid is False assert "Mismatch" in msg + + def test_validate_dataset_unpaired_stems(self, monkeypatch): + from voxkit.storage.datasets import validate_dataset + + # One audio and one label file, so the count check passes, but the stems + # differ (recording_A.wav vs recording_B.lab): only stem pairing can reject it + unpaired_path = mock_get_storage_root() / "fake_datasets" / "unpaired_stems" + speaker_path = unpaired_path / "speaker_1" + speaker_path.mkdir(parents=True, exist_ok=True) + (speaker_path / "recording_A.wav").touch() + (speaker_path / "recording_B.lab").touch() + + is_valid, msg = validate_dataset(unpaired_path) + + assert is_valid is False + assert "Unpaired" in msg