Skip to content

Dataset

Dataset

Class representing the structure of a dataset.

Parameters:

Name Type Description Default
name str

Dataset name.

required
audio_dir str

Audio directory.

required
ref_dir str

Reference directory.

required
Source code in asrbench\dataset.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
class Dataset:
    """Class representing the structure of a dataset.

    On construction both directories are validated and every entry found in
    the audio directory is paired with the contents of the reference file
    that shares its stem (``<stem>.txt``) inside the reference directory.

    Arguments:
        name: Dataset name.
        audio_dir: Audio directory.
        ref_dir: Reference directory.

    Raises:
        ValueError: If one of the directories is not a directory, or the
            audio directory is empty.
        FileNotFoundError: If an audio entry has no matching reference file.
    """

    def __init__(
            self,
            name: str,
            audio_dir: str,
            ref_dir: str,
    ):
        self.__name: str = name
        self.__audio_dir: Path = Path(audio_dir)
        self.__ref_dir: Path = Path(ref_dir)
        self.__pairs: List[TranscribePair] = []
        # Pairs are loaded eagerly, so an invalid dataset fails at creation.
        self.get_data()

    @property
    def name(self) -> str:
        """Dataset identifier."""
        return self.__name

    @property
    def pairs(self) -> List[TranscribePair]:
        """Dataset data pairs."""
        return self.__pairs

    def get_data(self) -> None:
        """Set up the dataset TranscribePair entries.

        Validates the directories, lists the audio directory and appends one
        TranscribePair per entry to ``pairs``.
        """
        self.check_directories()

        for audio_file in self.get_audio_files():
            self.pairs.append(
                TranscribePair(
                    audio_path=str(audio_file),
                    reference=self.get_ref_by_audio(audio_file),
                )
            )

    def check_directories(self) -> None:
        """Check if the Dataset directories are valid."""
        self._check_dir(self.__audio_dir)
        self._check_dir(self.__ref_dir)

    def _check_dir(self, dir_: Path) -> None:
        """Check that the directory provided is valid.

        Parameters:
            dir_: directory to be checked.

        Raises:
            ValueError: If ``dir_`` is not an existing directory.
        """
        if not dir_.is_dir():
            raise ValueError(
                f"Directory {dir_} of "
                f"Dataset {self.name} is not valid."
            )

    def get_audio_files(self) -> List[Path]:
        """Return every entry found directly inside the audio directory.

        Raises:
            ValueError: If the audio directory is empty.
        """
        audio_files: List[Path] = list(self.__audio_dir.glob("*"))

        if not audio_files:
            raise ValueError(
                f"Audio directory {self.__audio_dir} of "
                f"dataset {self.name} is empty."
            )

        return audio_files

    def get_ref_by_audio(self, audio: Path) -> str:
        """Fetches the contents of the reference file from the path of
        the audio file.

        The reference file is looked up in the reference directory under the
        audio file's name with its suffix replaced by ``.txt``.

        Parameters:
            audio: Path for audio file.

        Returns:
            The full text of the matching reference file.

        Raises:
            FileNotFoundError: If the reference file does not exist.
        """
        ref_file: Path = self.__ref_dir.joinpath(
            audio.with_suffix(".txt").name,
        )

        if not ref_file.exists():
            raise FileNotFoundError(
                f"Reference file for {audio.name} not exists.",
            )

        # read_text() opens, reads and closes the file, so no handle is left
        # open for the garbage collector to clean up.
        return ref_file.read_text()

    @classmethod
    def from_config(cls, name: str, config: Dict[str, str]) -> "Dataset":
        """Set up Dataset from config Dict in configfile.

        The ``audio_dir`` and ``reference_dir`` entries are read from
        ``config`` through ``_get_param``.

        Parameters:
            name: dataset identifier.
            config: dictionary containing the dataset configuration.
        """
        # Build through cls so that subclasses get an instance of themselves.
        return cls(
            name=name,
            audio_dir=_get_param(config, "audio_dir", name),
            ref_dir=_get_param(config, "reference_dir", name),
        )

    def __repr__(self) -> str:
        return f"<Dataset dir={self.__audio_dir} with {len(self.pairs)} pairs.>"

name: str property

Dataset identifier.

pairs: List[TranscribePair] property

Dataset data pairs.

check_directories()

Check if the Dataset directories are valid.

Source code in asrbench\dataset.py
56
57
58
59
def check_directories(self) -> None:
    """Validate the audio directory and then the reference directory.

    Each directory is handed to ``_check_dir`` in that order.
    """
    for directory in (self.__audio_dir, self.__ref_dir):
        self._check_dir(directory)

from_config(name, config) classmethod

Create a Dataset from its configuration dictionary in the config file.

Parameters:

Name Type Description Default
name str

dataset identifier.

required
config Dict[str, str]

dictionary containing the dataset configuration.

required
Source code in asrbench\dataset.py
104
105
106
107
108
109
110
111
112
113
114
115
116
@classmethod
def from_config(cls, name: str, config: Dict[str, str]):
    """Set up Dataset from config Dict in configfile.

    The ``audio_dir`` and ``reference_dir`` entries are read from ``config``
    through ``_get_param``.

    Parameters:
        name: dataset identifier.
        config: dictionary containing the dataset configuration.
    """
    # Build through cls so that subclasses get an instance of themselves.
    return cls(
        name=name,
        audio_dir=_get_param(config, "audio_dir", name),
        ref_dir=_get_param(config, "reference_dir", name),
    )

get_audio_files()

Returns every entry found in the audio directory. Raises a ValueError if the directory is empty.

Source code in asrbench\dataset.py
73
74
75
76
77
78
79
80
81
82
83
84
def get_audio_files(self) -> List[Path]:
    """Return every entry matched by ``*`` in the audio directory.

    Raises:
        ValueError: If nothing is found in the audio directory.
    """
    found: List[Path] = [*self.__audio_dir.glob("*")]

    if found:
        return found

    raise ValueError(
        f"Audio directory {self.__audio_dir} of "
        f"dataset {self.name} is empty."
    )

get_data()

Set up the dataset's TranscribePair entries.

Source code in asrbench\dataset.py
43
44
45
46
47
48
49
50
51
52
53
54
def get_data(self) -> None:
    """Set up the dataset TranscribePair entries.

    Validates the directories, lists the audio directory and appends one
    TranscribePair per entry to ``pairs``, using the entry's path as
    ``audio_path`` and the matching reference text as ``reference``.
    """
    self.check_directories()

    for audio_file in self.get_audio_files():
        self.pairs.append(
            TranscribePair(
                # str() is the idiomatic spelling of audio_file.__str__().
                audio_path=str(audio_file),
                reference=self.get_ref_by_audio(audio_file),
            )
        )

get_ref_by_audio(audio)

Fetches the contents of the reference file from the path of the audio file.

Parameters:

Name Type Description Default
audio Path

Path for audio file.

required
Source code in asrbench\dataset.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
def get_ref_by_audio(self, audio: Path) -> str:
    """Fetches the contents of the reference file from the path of
    the audio file.

    The reference file is looked up in the reference directory under the
    audio file's name with its suffix replaced by ``.txt``.

    Parameters:
        audio: Path for audio file.

    Returns:
        The full text of the matching reference file.

    Raises:
        FileNotFoundError: If the reference file does not exist.
    """
    ref_file: Path = self.__ref_dir.joinpath(
        audio.with_suffix(".txt").name,
    )

    if not ref_file.exists():
        raise FileNotFoundError(
            f"Reference file for {audio.name} not exists.",
        )

    # read_text() opens, reads and closes the file, so no handle is left
    # open for the garbage collector to clean up.
    return ref_file.read_text()