Bi-LSTM
Bidirectional Long Short-Term Memory (Bi-LSTM) is an extension of the LSTM that takes both past and future context into account when processing a sequence. Whereas a conventional unidirectional LSTM only models the flow of information from past to future, a Bi-LSTM uses two independent LSTM layers that read the sequence in the forward and the backward direction respectively, and can therefore capture more complete sequence information. In a Bi-LSTM the input sequence is fed to the forward LSTM layer and to the backward LSTM layer (which processes it in reverse order); the hidden states from the two directions are then concatenated to form the representation of the whole sequence.
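In PyTorch this concatenation is done by nn.LSTM(..., bidirectional=True) itself: the output's last dimension is twice the hidden size and can be split back into a forward half and a backward half. A minimal sketch of that behaviour (the sizes here are arbitrary, chosen only for illustration):

import torch
import torch.nn as nn

rnn = nn.LSTM(input_size=8, hidden_size=4, num_layers=1, bidirectional=True)
x = torch.randn(3, 2, 8)                     # (seq_len, bsz, input_size)
out, (h, c) = rnn(x)                         # initial states default to zeros

print(out.shape)                             # torch.Size([3, 2, 8]) -> 2 * hidden_size
fwd, bwd = out.view(3, 2, 2, 4).unbind(2)    # direction 0 = forward, 1 = backward
print(fwd.shape, bwd.shape)                  # each (3, 2, 4)

The full model below follows the same pattern, wrapped in a module with an embedding layer in front and a linear decoder on top.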
import torch
import torch.nn as nn
from torch.optim import Adam


class BiLSTM(nn.Module):
    def __init__(self, d_emb, d_hid, vocab_size, n_layer, dropout=0.5):
        super(BiLSTM, self).__init__()
        self.emb = nn.Embedding(vocab_size, d_emb)
        # bidirectional=True adds a second LSTM that reads the sequence back to front
        self.enc = nn.LSTM(d_emb, d_hid, n_layer, dropout=dropout, bidirectional=True)
        # the two directions are concatenated, so the encoder output is d_hid * 2 wide
        self.dec = nn.Linear(d_hid * 2, vocab_size)
    def forward(self, x, h0, c0):
        x = self.emb(x)                          # (seq_len, bsz) -> (seq_len, bsz, d_emb)
        seq_len, bsz, d_emb = x.shape
        # hs: (seq_len,     bsz, d_hid * 2)
        # h : (2 * n_layer, bsz, d_hid)
        hs, (h, c) = self.enc(x, (h0, c0))
        logits = self.dec(hs)                    # (seq_len, bsz, vocab_size)
        logits = logits.view(seq_len * bsz, -1)  # flatten for CrossEntropyLoss
        return logits, h
if __name__ == "__main__":
    # config
    d_emb, d_hid, vocab_size, n_layer = 8, 4, 16, 5
    bsz, lr = 2, 1e-4

    # data: token ids of shape (seq_len, bsz); y is x shifted by one step
    x = torch.tensor([[1, 7],
                      [2, 8],
                      [3, 9]], dtype=torch.long)
    y = torch.tensor([[2, 8],
                      [3, 9],
                      [4, 10]], dtype=torch.long)

    # model
    model = BiLSTM(d_emb, d_hid, vocab_size, n_layer)
    optimizer = Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    # training: a single optimization step
    h0 = torch.zeros(n_layer * 2, bsz, d_hid)  # one initial state per layer and direction
    c0 = torch.zeros(n_layer * 2, bsz, d_hid)
    optimizer.zero_grad()
    logits, h = model(x, h0, c0)
    loss = loss_fn(logits, y.reshape(-1))
    loss.backward()
    optimizer.step()
    print(loss.item())
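The script above uses the flattened logits for next-token training. If instead a single fixed-size vector per sequence is wanted (for example for classification), one common choice, sketched here as a continuation of the script above, is to concatenate the last layer's forward and backward final hidden states taken from h:

    # h: (n_layer * 2, bsz, d_hid); per the PyTorch docs, layers and directions
    # can be separated with view(n_layer, num_directions, bsz, d_hid)
    h_dirs = h.view(n_layer, 2, bsz, d_hid)
    seq_repr = torch.cat([h_dirs[-1, 0], h_dirs[-1, 1]], dim=-1)  # (bsz, d_hid * 2)
    print(seq_repr.shape)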