Bidirectional Long Short-Term Memory (Bi-LSTM) is an extension of the LSTM that takes both past and future context into account when processing a sequence. Whereas a standard unidirectional LSTM only models the information flow from past to future, a Bi-LSTM introduces two independent LSTM layers that read the sequence in the forward and the backward direction respectively, and can therefore capture more complete sequence information. The input sequence is processed by the forward LSTM and, in reverse order, by the backward LSTM; at each time step the hidden states from the two directions are concatenated, and together these concatenated states form the representation of the whole sequence.
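
To make the "concatenation of the two directions" concrete, here is a minimal sketch using a single-layer bidirectional nn.LSTM; all sizes and tensor names in it are illustrative and are not taken from the full example that follows. It checks two facts about PyTorch's bidirectional LSTM: the first d_hid features of the output at each step come from the forward pass, the last d_hid from the backward pass, and the final states of the two directions sit at opposite ends of the sequence.

import torch
import torch.nn as nn

# illustrative sizes, independent of the full example below
d_in, d_hid, seq_len, bsz = 3, 5, 4, 2
bilstm = nn.LSTM(d_in, d_hid, num_layers=1, bidirectional=True)
inp = torch.randn(seq_len, bsz, d_in)

# out: (seq_len, bsz, 2 * d_hid) -- forward and backward hidden states concatenated per step
# h  : (2, bsz, d_hid)           -- final hidden state of the forward (h[0]) and backward (h[1]) pass
out, (h, c) = bilstm(inp)

# the forward direction finishes at the last time step ...
assert torch.allclose(out[-1, :, :d_hid], h[0])
# ... while the backward direction finishes at the first time step
assert torch.allclose(out[0, :, d_hid:], h[1])

This is also why the decoder in the model below maps from d_hid * 2 features back to the vocabulary size.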

The full implementation below embeds the input tokens, encodes them with a multi-layer bidirectional LSTM, decodes the concatenated per-step hidden states into vocabulary logits with a linear layer, and then runs one training step on a toy batch.

import torch
import torch.nn as nn
from torch.optim import Adam

class BiLSTM(nn.Module):
    def __init__(self, d_emb, d_hid, vocab_size, n_layer, dropout=0.5):
        super(BiLSTM, self).__init__()
        self.emb = nn.Embedding(vocab_size, d_emb)
        # bidirectional=True adds a second LSTM that reads the sequence back to front
        self.enc = nn.LSTM(d_emb, d_hid, n_layer, dropout=dropout, bidirectional=True)
        # the two directions are concatenated, so the decoder takes d_hid * 2 features
        self.dec = nn.Linear(d_hid * 2, vocab_size)

    def forward(self, x, h0, c0):
        # x: (seq_len, bsz) token ids; the LSTM uses the default batch_first=False layout
        x = self.emb(x)  # (seq_len, bsz, d_emb)
        seq_len, bsz, d_emb = x.shape
        # hs: (seq_len    , bsz, d_hid * 2)  per-step outputs, both directions concatenated
        # h : (2 * n_layer, bsz, d_hid    )  final hidden state of each direction of each layer
        hs, (h, c) = self.enc(x, (h0, c0))
        logits = self.dec(hs)                    # (seq_len, bsz, vocab_size)
        logits = logits.view(seq_len * bsz, -1)  # flatten for the loss
        return logits, h


if __name__ == "__main__":
    # config
    d_emb, d_hid, vocab_size, n_layer = 8, 4, 16, 5
    bsz, lr = 2, 1e-4
    # data: each column is a toy sequence; y is x shifted by one step (next-token targets)
    x = torch.tensor([[1, 7],
                      [2, 8],
                      [3, 9]], dtype=torch.long)
    y = torch.tensor([[2, 8],
                      [3, 9],
                      [4, 10]], dtype=torch.long)
    # model
    model = BiLSTM(d_emb, d_hid, vocab_size, n_layer)
    optimizer = Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    # training: run a single optimization step on the toy batch
    # initial hidden/cell states: one per direction per layer -> (n_layer * 2, bsz, d_hid)
    h0 = torch.zeros(n_layer * 2, bsz, d_hid)
    c0 = torch.zeros(n_layer * 2, bsz, d_hid)
    optimizer.zero_grad()
    logits, h = model(x, h0, c0)
    # flatten the targets to (seq_len * bsz,) to match the flattened logits
    loss = loss_fn(logits, y.reshape(-1))
    loss.backward()
    optimizer.step()
    print(loss.item())
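
Running the script performs a single optimization step and prints the scalar loss. With randomly initialized weights the logits are close to uniform over the 16-token vocabulary, so the printed value should land roughly around ln(16) ≈ 2.77.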