model_lstm.py
import torch
import torch.nn as nn


def BoaLSTM(d_model=256, num_layers=4, vocab_size=256, device="cuda"):
    """Construct a BoaBytePredictor with an LSTM backbone."""

    class BoaBytePredictorLSTM(nn.Module):
        """LSTM model adapted to predict the next byte in a sequence."""

        def __init__(self, d_model=256, num_layers=4, vocab_size=256):
            super().__init__()
            # Embedding for vocab_size possible bytes
            self.embedding = nn.Embedding(vocab_size, d_model)
            self.lstm = nn.LSTM(input_size=d_model, hidden_size=d_model,
                                num_layers=num_layers, batch_first=True)
            self.head = nn.Sequential(
                nn.Linear(d_model, d_model),
                nn.ReLU(),
                # Output logits for each of the vocab_size possible next bytes
                nn.Linear(d_model, vocab_size),
            )
            self.d_model = d_model
            self.num_layers = num_layers

        def forward(self, x):
            h = self.embedding(x)  # [B, L, D]
            output, _ = self.lstm(h)
            return self.head(output)

        @torch.inference_mode()
        def init_stream(self, max_len: int, batch_size: int = 1, device=None, dtype=None):
            # max_len is unused here: an LSTM carries its full history in the
            # (h, c) states, so the cache does not grow with sequence length.
            h_0 = torch.zeros(self.num_layers, batch_size, self.d_model, device=device, dtype=dtype)
            c_0 = torch.zeros(self.num_layers, batch_size, self.d_model, device=device, dtype=dtype)
            return [(h_0, c_0)]

        @torch.inference_mode()
        def step(self, byte_t: torch.LongTensor, caches) -> torch.Tensor:
            # byte_t: [B] -> logits: [B, vocab_size]
            x = self.embedding(byte_t).unsqueeze(1)  # [B, 1, D]
            prev_states = caches[0]
            lstm_out, new_states = self.lstm(x, prev_states)
            caches[0] = new_states
            logits = self.head(lstm_out)
            return logits.squeeze(1)

    model = BoaBytePredictorLSTM(d_model, num_layers, vocab_size)
    return model.to(device)
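

# --- Usage sketch (illustrative, not part of the original file) ---
# A minimal example of driving the streaming interface: warm the (h, c)
# cache on a seed, then generate bytes greedily one step at a time. The
# seed bytes and step count below are hypothetical; in practice the model
# would first be trained with cross-entropy on next-byte targets.
if __name__ == "__main__":
    model = BoaLSTM(device="cpu")
    model.eval()

    seed = torch.tensor([[72, 101, 108, 108, 111]])  # "Hello" as bytes, [B=1, L=5]
    full_logits = model(seed)                        # batch path: [1, 5, 256]

    # Streaming path: feed the seed byte by byte to populate the cache.
    caches = model.init_stream(max_len=64, batch_size=1, device="cpu")
    for t in range(seed.shape[1]):
        logits = model.step(seed[:, t], caches)      # [1, 256]

    # Greedy generation: feed each predicted byte back in as the next input.
    byte_t = logits.argmax(dim=-1)                   # [1]
    for _ in range(8):
        logits = model.step(byte_t, caches)
        byte_t = logits.argmax(dim=-1)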