PyTorch Learning Notes: torchtext and PyTorch, Example 4
After finishing the torchtext basics, I found this tutorial, *Understanding and Implementing seq2seq Models with PyTorch and torchtext*. The project consists of six sub-projects:

1. ~~Seq2Seq with neural networks~~
2. ~~Learning phrase representations with an RNN encoder-decoder for statistical machine translation~~
3. ~~NMT by jointly learning to align and translate~~
4. ~~Packed padded sequences, masking and inference~~
5. ~~Convolutional Seq2Seq~~
6. ~~Transformer~~
After finishing the Transformer I took two days off. Over these two days I want to compare and summarize the models. I already wrote a summary after finishing the first three; today the focus is on how the six models evolve and how they are implemented. Implementation is the key part: it took 15 days, and yet the only implementation I can fully follow is the basic Seq2Seq...
All six models are Seq2Seq models: each has an Encoder and a Decoder. What differs is the core of each model, with something new added between layers or between the Encoder and Decoder at every step: LSTM -> multi-layer GRU -> Attention -> PadMaskAttention -> CNN -> Transformer.
Note the Decoder part in the figure. OK, let's implement it here.
```python
import random

import torch
import torch.nn as nn
import torch.nn.functional as F


class Encoder(nn.Module):
    def __init__(self, input_dim, hid_dim, emb_dim, dropout):
        super(Encoder, self).__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [src sent len, batch size]
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        # only the final hidden state is needed: it becomes the context vector
        return hidden
```
```python
class Decoder(nn.Module):
    def __init__(self, output_dim, hid_dim, emb_dim, dropout):
        super(Decoder, self).__init__()

        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.dropout = dropout

        self.embedding = nn.Embedding(output_dim, emb_dim)
        # y_t and the context vector z are concatenated before entering the GRU,
        # so the GRU input dimension is emb_dim + hid_dim
        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)
        # the linear layer takes the concatenation of y_t, s_t and z; the hidden state
        # and the context vector both have hid_dim dimensions, so its input dimension
        # is emb_dim + hid_dim * 2
        self.out = nn.Linear(emb_dim + hid_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, context):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        emb_context = torch.cat((embedded, context), dim=2)
        output, hidden = self.rnn(emb_context, hidden)
        output = torch.cat((embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)), dim=1)
        prediction = self.out(output)
        return prediction, hidden
```
```python
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        context = self.encoder(src)
        hidden = context
        input = trg[0, :]
        for t in range(1, max_len):
            output, hidden = self.decoder(input, hidden, context)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            input = (trg[t] if teacher_force else top1)
        return outputs
```
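To make the wiring concrete, here is a minimal usage sketch of the three classes above; the vocabulary sizes and hyperparameters are invented for illustration, not values from the tutorial.

```python
# hypothetical hyperparameters, for illustration only
INPUT_DIM, OUTPUT_DIM = 7855, 5893
EMB_DIM, HID_DIM, DROPOUT = 256, 512, 0.5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

enc = Encoder(INPUT_DIM, HID_DIM, EMB_DIM, DROPOUT)
dec = Decoder(OUTPUT_DIM, HID_DIM, EMB_DIM, DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)

# src = [src sent len, batch size], trg = [trg sent len, batch size]
src = torch.randint(0, INPUT_DIM, (23, 64)).to(device)
trg = torch.randint(0, OUTPUT_DIM, (27, 64)).to(device)

outputs = model(src, trg)   # [trg sent len, batch size, OUTPUT_DIM]
```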
But the problem with doing it this way is that the context vector the Decoder receives carries the information of everything the Encoder read; when the Decoder needs the information of one specific time step, it is not available, only the global information of the whole sentence is. That is where Attention comes in.

Given that need, the hidden states of all the Encoder time steps are output and then combined as a weighted sum.

The weights are a set of values tied to the current Decoder time step (meaning that, for each Decoder time step, the weighting coefficients over all of the Encoder hidden states are different). These weights are the attention vector, written as a.

In other words, the attention vector is used to focus on particular Encoder time steps: the hidden states output at every Encoder time step are weighted by the attention vector and summed, giving $w_t$, a context vector that is fed into the RNN and the linear prediction layer (note that at the first Decoder time step, the hidden state fed to the RNN layer is not $w$ but $h$, the hidden state output by the last Encoder time step).
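Restated as a formula (my notation, where $h_i$ is the Encoder hidden state at step $i$ and $a_{t,i}$ is the attention weight the Decoder assigns to it at step $t$):

$$w_t = \sum_{i=1}^{T} a_{t,i}\, h_i$$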
Also, the Encoder here uses a bidirectional RNN. With a bidirectional RNN there are two RNNs per layer:
- a forward RNN that processes the sentence from left to right (green in the figure)
- a backward RNN that processes the sentence from right to left (yellow in the figure)

All that needs to be done here is to set bidirectional=True and feed in the embedded sentence.

Since the Decoder is not bidirectional, it only needs a single context vector $z$ as its initial hidden state $s_0$, while the Encoder provides two, forward and backward ($z^\rightarrow = h_T^\rightarrow$ and $z^\leftarrow = h_T^\leftarrow$). This is resolved by concatenating the two context vectors, passing them through a linear layer $g$, and applying the $\tanh$ activation: $$z = \tanh(g(h_T^\rightarrow, h_T^\leftarrow)) = \tanh(g(z^\rightarrow, z^\leftarrow)) = s_0$$

Since we want the model to look back over the whole source sentence, we return outputs, the stacked forward and backward hidden states for every token in the source sentence. We also return hidden, which serves as the initial hidden state in the decoder.

OK, let's implement it and compare; you will find the Decoder is largely similar.
```python
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        # hidden[-2,:,:] and hidden[-1,:,:] are the final forward and backward
        # hidden states; concatenate them, project with fc and activate with tanh
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
        return outputs, hidden
```
```python
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super(Attention, self).__init__()
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Parameter(torch.rand(dec_hid_dim))

    def forward(self, hidden, encoder_outputs):
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        energy = energy.permute(0, 2, 1)
        v = self.v.repeat(batch_size, 1).unsqueeze(1)
        attention = torch.bmm(v, energy).squeeze(1)
        return F.softmax(attention, dim=1)
```
```python
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super(Decoder, self).__init__()
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.dropout = dropout
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden, encoder_outputs)
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        assert (output == hidden).all()

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)

        output = self.out(torch.cat((output, weighted, embedded), dim=1))

        #output = [bsz, output dim]
        return output, hidden.squeeze(0)
```
```python
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)

        output = trg[0, :]
        for t in range(1, max_len):
            output, hidden = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (trg[t] if teacher_force else top1)
        return outputs
```
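The same kind of sketch for the attention model, showing how the Attention module is handed to the Decoder; again, the dimensions are invented for illustration only.

```python
# hypothetical dimensions, for illustration only
INPUT_DIM, OUTPUT_DIM = 7855, 5893
ENC_EMB_DIM, DEC_EMB_DIM = 256, 256
ENC_HID_DIM, DEC_HID_DIM, DROPOUT = 512, 512, 0.5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DROPOUT, attn)
model = Seq2Seq(enc, dec, device).to(device)

src = torch.randint(0, INPUT_DIM, (23, 64)).to(device)
trg = torch.randint(0, OUTPUT_DIM, (27, 64)).to(device)
outputs = model(src, trg)   # [27, 64, OUTPUT_DIM]
```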
Now let's look at how PadMaskAttention differs from plain Attention.
Why pack the padded sequences? Packing lets the GRU run only over each sentence's real tokens, so the hidden states are not affected by the padding; a standalone illustration is sketched below.
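As a standalone illustration of what packing does (independent of the tutorial code; the sizes are made up), here is a batch of two sequences with real lengths 3 and 2:

```python
import torch
import torch.nn as nn

rnn = nn.GRU(input_size=4, hidden_size=8)

# a padded batch: [max len, batch size, features], real lengths 3 and 2
padded = torch.randn(3, 2, 4)
lengths = torch.tensor([3, 2])

packed = nn.utils.rnn.pack_padded_sequence(padded, lengths)
packed_out, hidden = rnn(packed)   # the GRU never runs over the padding step
outputs, out_lengths = nn.utils.rnn.pad_packed_sequence(packed_out)
# outputs: [3, 2, 8], zero-filled where the shorter sequence was padded
```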
The change to the Encoder is in its forward method, which now also receives the source sentence lengths.

The attention module computes the attention values over the source sentence, and to keep it from attending to padding, a mask is built that is 1 for real tokens and 0 for <pad> tokens.

Example: ["hello", "how", "are", "you", "?", <pad>, <pad>] -> [1, 1, 1, 1, 1, 0, 0].
The mask is applied after the attention is calculated but before it is normalized by the softmax function. It is applied with masked_fill, which fills every element of the tensor where the first argument (mask == 0) is true with the value given by the second argument (-1e10). In other words, it takes the un-normalized attention values and replaces the values over the padding elements with -1e10. Since these numbers are negligible compared with the other values, they become zero after passing through the softmax layer, which ensures that the padding tokens in the source sentence receive no attention.
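A tiny sketch of that masking step in isolation (the score values are invented):

```python
import torch
import torch.nn.functional as F

attention = torch.tensor([[1.2, 0.3, -0.5, 0.8, 0.1]])   # un-normalized scores
mask = torch.tensor([[1, 1, 1, 0, 0]])                    # 0 marks <pad> positions

attention = attention.masked_fill(mask == 0, -1e10)
weights = F.softmax(attention, dim=1)
# the weights at the two padded positions come out as (effectively) zero
```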
The Decoder only needs a small change: it has to accept the mask over the source sentence and pass it on to the attention module. Since we want to inspect the attention values during inference, we also return the attention tensor.
```python
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super(Encoder, self).__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_len):
        embedded = self.dropout(self.embedding(src))
        # the added packing step: the GRU only runs over each sentence's real length,
        # so the returned hidden state ignores the padding tokens
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len)
        packed_outputs, hidden = self.rnn(packed_embedded)
        # unpack back into a padded tensor; the second return value holds the lengths
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs)

        # hidden[-2,:,:] and hidden[-1,:,:] are the final forward and backward
        # hidden states; concatenate, project and activate with tanh
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
        return outputs, hidden
```
```python
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super(Attention, self).__init__()
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Parameter(torch.rand(dec_hid_dim))

    def forward(self, hidden, encoder_outputs, mask):
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        # permute swaps the tensor dimensions
        energy = energy.permute(0, 2, 1)
        v = self.v.repeat(batch_size, 1).unsqueeze(1)
        attention = torch.bmm(v, energy).squeeze(1)

        # the only difference from before: mask out the padding positions
        attention = attention.masked_fill(mask == 0, -1e10)

        return F.softmax(attention, dim=1)
```
```python
# the Decoder's forward additionally returns the attention tensor a
# (only the changed return line is shown here)
        return output, hidden.squeeze(0), a.squeeze(0)
```
```python
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, pad_idx, sos_idx, eos_idx, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.device = device

    def create_mask(self, src):
        mask = (src != self.pad_idx).permute(1, 0)
        return mask

    def forward(self, src, src_len, trg, teacher_forcing_ratio=0.5):
        #src = [src sent len, batch size]
        #src_len = [batch size]
        #trg = [trg sent len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time

        if trg is None:
            assert teacher_forcing_ratio == 0, "Must be zero during inference"
            inference = True
            trg = torch.zeros((100, src.shape[1])).long().fill_(self.sos_idx).to(src.device)
        else:
            inference = False

        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        #tensor to store decoder outputs
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)

        #tensor to store attention
        attentions = torch.zeros(max_len, batch_size, src.shape[0]).to(self.device)

        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src, src_len)

        #first input to the decoder is the <sos> tokens
        output = trg[0, :]

        mask = self.create_mask(src)
        #mask = [batch size, src sent len]

        for t in range(1, max_len):
            output, hidden, attention = self.decoder(output, hidden, encoder_outputs, mask)
            outputs[t] = output
            attentions[t] = attention
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (trg[t] if teacher_force else top1)
            if inference and output.item() == self.eos_idx:
                return outputs[:t], attentions[:t]

        return outputs, attentions
```
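A sketch of how the inference branch might be driven, assuming enc and dec have been built from this section's classes (the Decoder being the previous one, modified as described to take the mask and return the attention) and that the special-token indices below are hypothetical; batch size must be 1 here because of output.item():

```python
PAD_IDX, SOS_IDX, EOS_IDX = 1, 2, 3   # hypothetical special-token indices

model = Seq2Seq(enc, dec, PAD_IDX, SOS_IDX, EOS_IDX, device).to(device)
model.eval()

with torch.no_grad():
    src = torch.randint(4, 7855, (12, 1)).to(device)   # one sentence, 12 tokens
    src_len = torch.tensor([12])
    # trg=None and teacher_forcing_ratio=0 switch forward() into inference mode
    outputs, attentions = model(src, src_len, None, teacher_forcing_ratio=0)
```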
The English source sentence is encoded (top) and we compute all attention values for the four German target words (center) simultaneously. Our attentions are just dot products between decoder context representations (bottom left) and encoder representations. We add the conditional inputs computed by the attention (center right) to the decoder states which then predict the target words (bottom right). The sigmoid and multiplicative boxes illustrate Gated Linear Units.
- Top-left, encoder: stacked convolutions extract features from the input source-language (English) sequence; the figure shows only a single convolutional layer. The convolution output passes through a GLU activation and becomes the encoder output.
- Bottom-left, decoder: stacked convolutions likewise extract features from the output target-language (German) sequence, passed through a GLU activation to form the decoder output.
- Center-left, attention: the decoder and encoder outputs are dot-multiplied, giving a weight for each word of the input source-language (English) sequence.
- Center-right, residual connection: the attention-weighted input sequence is added into the decoder output to produce the output sequence.
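Since GLU appears in both the encoder and the decoder above, here is a quick sketch of what F.glu does (sizes invented): the convolution produces 2 * hid_dim channels, and GLU splits them in half and gates one half with the sigmoid of the other, returning hid_dim channels.

```python
import torch
import torch.nn.functional as F

x = torch.randn(8, 2 * 512, 30)   # [batch size, 2 * hid dim, seq len]
a, b = x.chunk(2, dim=1)

out = F.glu(x, dim=1)              # [batch size, hid dim, seq len]
assert torch.allclose(out, a * torch.sigmoid(b))
```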
Comparing this structure with the PadPackMaskAttention model we just covered, the difference is quite large; looking ahead, the Transformer starts to share some similarities with it.
Normalization strategy: the residual connections below are scaled by $\sqrt{0.5}$ (the self.scale factor) to keep the variance of the summed activations roughly constant.
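My reading of why $\sqrt{0.5}$ is the right factor (an interpretation, not something stated explicitly here): if the two summands of a residual connection are roughly independent with equal variance, then

$$\operatorname{Var}\left(\sqrt{0.5}\,(x+y)\right) = 0.5\left(\operatorname{Var}(x)+\operatorname{Var}(y)\right) = \operatorname{Var}(x)$$

so the scaled sum keeps the same variance as either input.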
OK, let's implement it. It is quite complex.
```python
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, device):
        super(Encoder, self).__init__()

        assert kernel_size % 2 == 1, "Kernel size must be odd"

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.kernel_size = kernel_size
        self.dropout = dropout
        self.device = device

        # scaling factor for the residual connections (the normalization strategy above)
        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)

        self.tok_embedding = nn.Embedding(input_dim, emb_dim)
        self.pos_embedding = nn.Embedding(100, emb_dim)

        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)

        self.convs = nn.ModuleList([nn.Conv1d(in_channels=hid_dim,
                                              out_channels=2 * hid_dim,
                                              kernel_size=kernel_size,
                                              padding=(kernel_size - 1) // 2)
                                    for _ in range(n_layers)])
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [batch size, src sent len]; embed both tokens and positions
        pos = torch.arange(0, src.shape[1]).unsqueeze(0).repeat(src.shape[0], 1).to(self.device)
        tok_embedded = self.tok_embedding(src)
        pos_embedded = self.pos_embedding(pos)

        embedded = self.dropout(tok_embedded + pos_embedded)
        # project the embeddings from emb_dim to hid_dim
        conv_input = self.emb2hid(embedded)
        # Conv1d expects the channel dimension second
        conv_input = conv_input.permute(0, 2, 1)
        #conv_input = [batch size, hid dim, src sent len]

        for i, conv in enumerate(self.convs):
            conved = conv(self.dropout(conv_input))
            conved = F.glu(conved, dim=1)
            # residual connection, scaled
            conved = (conved + conv_input) * self.scale
            conv_input = conved

        # project back to emb_dim and combine with the embeddings
        conved = self.hid2emb(conved.permute(0, 2, 1))
        combined = (conved + embedded) * self.scale

        return conved, combined
```
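A quick shape check with invented hyperparameters, just to confirm that both conved and combined come back as [batch size, src sent len, emb dim]:

```python
# invented sizes, for a quick sanity check only
device = torch.device('cpu')
enc = Encoder(input_dim=7855, emb_dim=256, hid_dim=512,
              n_layers=3, kernel_size=3, dropout=0.25, device=device)

src = torch.randint(0, 7855, (8, 30))   # [batch size, src sent len]
conved, combined = enc(src)
print(conved.shape, combined.shape)     # both: torch.Size([8, 30, 256])
```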
```python
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, pad_idx, device):
        super(Decoder, self).__init__()

        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.kernel_size = kernel_size
        self.dropout = dropout
        self.pad_idx = pad_idx
        self.device = device

        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)

        self.tok_embedding = nn.Embedding(output_dim, emb_dim)
        self.pos_embedding = nn.Embedding(100, emb_dim)

        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)

        self.attn_hid2emb = nn.Linear(hid_dim, emb_dim)
        self.attn_emb2hid = nn.Linear(emb_dim, hid_dim)

        self.out = nn.Linear(emb_dim, output_dim)
        self.convs = nn.ModuleList([nn.Conv1d(hid_dim, 2 * hid_dim, kernel_size)
                                    for _ in range(n_layers)])
        self.dropout = nn.Dropout(dropout)

    def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined):
        # project the conved target representation back to emb_dim and combine with the embeddings
        conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1))
        combined = (embedded + conved_emb) * self.scale
        # attention energies: dot products with the encoder's conved representation
        energy = torch.matmul(combined, encoder_conved.permute(0, 2, 1))
        attention = F.softmax(energy, dim=2)

        attended_encoding = torch.matmul(attention, (encoder_conved + encoder_combined))
        attended_encoding = self.attn_emb2hid(attended_encoding)

        # residual connection, scaled
        attended_combined = (conved + attended_encoding.permute(0, 2, 1)) * self.scale

        return attention, attended_combined

    def forward(self, trg, encoder_conved, encoder_combined):
        pos = torch.arange(0, trg.shape[1]).unsqueeze(0).repeat(trg.shape[0], 1).to(self.device)

        tok_embedded = self.tok_embedding(trg)
        pos_embedded = self.pos_embedding(pos)
        #tok_embedded = [batch size, trg sent len, emb dim]
        #pos_embedded = [batch size, trg sent len, emb dim]

        embedded = self.dropout(tok_embedded + pos_embedded)
        conv_input = self.emb2hid(embedded)
        conv_input = conv_input.permute(0, 2, 1)

        for i, conv in enumerate(self.convs):
            conv_input = self.dropout(conv_input)
            # pad only on the left so each position can only see previous target tokens
            padding = torch.zeros(conv_input.shape[0], conv_input.shape[1],
                                  self.kernel_size - 1).fill_(self.pad_idx).to(self.device)
            padded_conv_input = torch.cat((padding, conv_input), dim=2)

            conved = conv(padded_conv_input)
            conved = F.glu(conved, dim=1)

            attention, conved = self.calculate_attention(embedded, conved,
                                                         encoder_conved, encoder_combined)

            conved = (conved + conv_input) * self.scale
            conv_input = conved

        conved = self.hid2emb(conved.permute(0, 2, 1))
        output = self.out(self.dropout(conved))

        return output, attention
```
```python
    # forward of the Seq2Seq wrapper that ties the two together
    def forward(self, src, trg):
        encoder_conved, encoder_combined = self.encoder(src)
        output, attention = self.decoder(trg, encoder_conved, encoder_combined)
        return output, attention
```
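The __init__ of this wrapper is not shown in my notes; here is a minimal sketch of the full wrapper plus an invented usage, assuming it only needs to hold the encoder, decoder and device:

```python
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg):
        encoder_conved, encoder_combined = self.encoder(src)
        output, attention = self.decoder(trg, encoder_conved, encoder_combined)
        return output, attention


# invented hyperparameters, for illustration only
device = torch.device('cpu')
enc = Encoder(input_dim=7855, emb_dim=256, hid_dim=512,
              n_layers=10, kernel_size=3, dropout=0.25, device=device)
dec = Decoder(output_dim=5893, emb_dim=256, hid_dim=512,
              n_layers=10, kernel_size=3, dropout=0.25, pad_idx=1, device=device)
model = Seq2Seq(enc, dec, device).to(device)

src = torch.randint(0, 7855, (8, 30)).to(device)   # [batch size, src sent len]
trg = torch.randint(0, 5893, (8, 28)).to(device)    # [batch size, trg sent len]
output, attention = model(src, trg)                  # output: [8, 28, 5893]
```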