-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaudio_model.py
More file actions
171 lines (143 loc) · 6.12 KB
/
audio_model.py
File metadata and controls
171 lines (143 loc) · 6.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import pytorch_lightning as pl
import torch
import torch.nn as nn
from torchmetrics import Accuracy
class AudioTransformerClassifier(pl.LightningModule):
    """
    A PyTorch Lightning Module for audio classification using a Transformer-based architecture.

    The model projects input audio features to the model dimension, adds a
    learnable positional embedding, runs the result through a stack of
    Transformer encoder layers, pools the sequence (time) dimension, and
    classifies the pooled representation with a final linear layer.

    Supported pooling strategies: "mean", "sum", "max".
    """

    # Closed set of sequence-pooling strategies accepted by this model.
    VALID_POOLING_MODES = ("mean", "sum", "max")

    def __init__(
        self,
        input_length: int,
        num_input_features: int,
        d_model: int = 128,
        num_classes: int = 10,
        num_encoder_layers: int = 4,
        nhead: int = 4,
        dim_feedforward: int = 256,
        dropout: float = 0.1,
        pooling_mode: str = "mean",
        lr: float = 1e-3,
    ):
        """
        Args:
            input_length: Maximum sequence length (number of time steps)
                supported by the learnable positional embedding.
            num_input_features: Number of input features per time step.
            d_model: Transformer model (embedding) dimension.
            num_classes: Number of output classes.
            num_encoder_layers: Number of stacked Transformer encoder layers.
            nhead: Number of attention heads (PyTorch requires it to divide d_model).
            dim_feedforward: Hidden size of the encoder feed-forward sublayer.
            dropout: Dropout probability used inside the encoder layers.
            pooling_mode: Sequence pooling strategy: "mean", "sum", or "max".
            lr: Learning rate for the Adam optimizer.

        Raises:
            ValueError: If ``pooling_mode`` is not one of the supported modes.
        """
        super().__init__()
        self.save_hyperparameters()

        # Fail fast on a bad pooling mode instead of erroring mid-forward.
        if pooling_mode not in self.VALID_POOLING_MODES:
            raise ValueError(f"Invalid pooling_mode: {pooling_mode}")
        self.pooling_mode = pooling_mode

        self.input_proj = nn.Linear(
            num_input_features, d_model
        )  # (batch, seq_len, num_features) → (batch, seq_len, d_model)

        # Learnable positional embedding: shape (1, input_length, d_model).
        # Also defines the maximum sequence length the model can consume.
        self.pos_embedding = nn.Parameter(torch.randn(1, input_length, d_model))

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,  # enables input shape (batch, seq_len, d_model)
        )
        self.encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_encoder_layers
        )

        self.classifier = nn.Linear(d_model, num_classes)
        self.criterion = nn.CrossEntropyLoss()

        # Separate metric objects per stage so running state never mixes.
        self.train_acc = Accuracy(task="multiclass", num_classes=num_classes)
        self.val_acc = Accuracy(task="multiclass", num_classes=num_classes)
        self.test_acc = Accuracy(task="multiclass", num_classes=num_classes)

        self.lr = lr

    def pooling_function(self, x: torch.Tensor) -> torch.Tensor:
        """
        Applies a pooling operation to the input tensor along dimension 1.

        Supported pooling modes:
            - "mean": Computes the mean across dimension 1.
            - "sum": Computes the sum across dimension 1.
            - "max": Computes the maximum value across dimension 1.

        Args:
            x (torch.Tensor): Input tensor to be pooled.

        Returns:
            torch.Tensor: The pooled tensor according to the selected pooling mode.

        Raises:
            ValueError: If an invalid pooling mode is specified.
        """
        if self.pooling_mode == "mean":
            return x.mean(dim=1)
        elif self.pooling_mode == "sum":
            return x.sum(dim=1)
        elif self.pooling_mode == "max":
            return x.max(dim=1).values
        else:
            raise ValueError(f"Invalid pooling_mode: {self.pooling_mode}")

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Performs a forward pass through the audio classification model.

        Args:
            x (torch.Tensor): Input tensor of shape (batch, seq_len, num_features),
                representing a batch of audio feature sequences. ``seq_len`` must
                not exceed the ``input_length`` the model was constructed with.

        Returns:
            torch.Tensor: Output logits tensor of shape (batch, num_classes),
                representing the predicted class scores for each input in the batch.

        Raises:
            ValueError: If the input sequence is longer than the positional
                embedding supports.

        The forward pass includes:
            - Projecting input features to model dimension.
            - Adding positional embeddings.
            - Passing through the encoder.
            - Applying a pooling function to aggregate sequence information.
            - Classifying the pooled representation.
        """
        # Guard: a too-long sequence would otherwise fail with an opaque
        # broadcasting RuntimeError at the addition below.
        max_len = self.pos_embedding.size(1)
        if x.size(1) > max_len:
            raise ValueError(
                f"Input sequence length {x.size(1)} exceeds the maximum "
                f"supported length {max_len}"
            )

        # x shape: (batch, seq_len, num_features)
        x = self.input_proj(x)  # → (batch, seq_len, d_model)
        x = (
            x + self.pos_embedding[:, : x.size(1), :]
        )  # add positional embedding for given sequence
        x = self.encoder(x)  # → (batch, seq_len, d_model)
        x = self.pooling_function(x)  # x → (batch, d_model)
        logits = self.classifier(x)  # → (batch, num_classes)
        return logits

    def training_step(self, batch, batch_idx):
        """
        Performs a single training step.

        Args:
            batch (tuple): A tuple containing input data (x) and target labels (y).
            batch_idx (int): Index of the current batch.

        Returns:
            torch.Tensor: The computed loss for the current batch.

        Logs:
            train_loss (float): Training loss for the batch.
            train_acc (float): Training accuracy for the batch.
        """
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        acc = self.train_acc(logits, y)
        self.log("train_loss", loss, prog_bar=True)
        self.log("train_acc", acc, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """
        Performs a single validation step.

        Args:
            batch (tuple): A tuple containing input data (x) and target labels (y).
            batch_idx (int): Index of the current batch.

        Logs:
            val_loss (float): Validation loss for the batch.
            val_acc (float): Validation accuracy for the batch.
        """
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        acc = self.val_acc(logits, y)
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """
        Performs a single test step.

        Args:
            batch (tuple): A tuple containing input data (x) and target labels (y).
            batch_idx (int): Index of the current batch.

        Logs:
            test_loss (float): Test loss for the batch.
            test_acc (float): Test accuracy for the batch.
        """
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        acc = self.test_acc(logits, y)
        self.log("test_loss", loss)
        self.log("test_acc", acc)

    def configure_optimizers(self):
        """
        Configures and returns the optimizer for training the model.

        Returns:
            torch.optim.Adam: An Adam optimizer initialized with the model's
                parameters and learning rate.
        """
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    def total_params(self) -> int:
        """
        Calculates the total number of parameters in the model.

        Returns:
            int: The total count of parameters.
        """
        return sum(p.numel() for p in self.parameters())