PyTorch Learning Notes: torchtext and PyTorch, Example 4
After finishing the basics of torchtext, I found this tutorial, "Understanding and Implementing seq2seq Models with PyTorch and torchtext". The project consists of six sub-projects:

1. ~~Seq2Seq with neural networks~~
2. ~~Learning phrase representations with an RNN encoder-decoder for statistical machine translation~~
3. ~~Jointly learning to align and translate for NMT~~
4. ~~Packed padded sequences, masking and inference~~
5. ~~Convolutional Seq2Seq~~
6. ~~Transformer~~
After finishing the Transformer I went two days without studying, so over these two days I'll compare and summarize the models. I already wrote a summary after finishing the first three models; today the main goal is to look at how the six models change and how they are implemented. Implementation is the key part: it has taken 15 days, yet of the implementation code I can really only follow the basic Seq2Seq...
All six models are Seq2Seq models: each consists of an Encoder and a Decoder. What differs is the core of the model, with new pieces continually added between layers or between Encoder and Decoder: LSTM -> GRU -> Attention -> PadMaskAttention -> CNN -> Transformer.
Pay attention to the Decoder part of the diagram. OK, let's implement it.
```python
import random

import torch
import torch.nn as nn


class Encoder(nn.Module):
    def __init__(self, input_dim, hid_dim, emb_dim, dropout):
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        # hidden is the final hidden state, used as the context vector z
        return hidden


class Decoder(nn.Module):
    def __init__(self, output_dim, hid_dim, emb_dim, dropout):
        super(Decoder, self).__init__()
        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.dropout = dropout

        self.embedding = nn.Embedding(output_dim, emb_dim)
        # y_t and z are concatenated before being fed to the GRU,
        # so its input size is emb_dim + hid_dim
        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)
        # the linear layer takes the concatenation of y_t, s_t and z; the hidden state and the
        # context vector both have hid_dim dimensions, so its input size is emb_dim + hid_dim * 2
        self.out = nn.Linear(emb_dim + hid_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, context):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        emb_context = torch.cat((embedded, context), dim=2)
        output, hidden = self.rnn(emb_context, hidden)
        output = torch.cat((embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)), dim=1)
        prediction = self.out(output)
        return prediction, hidden


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        # the encoder's final hidden state is both the context vector
        # and the decoder's initial hidden state
        context = self.encoder(src)
        hidden = context
        input = trg[0, :]
        for t in range(1, max_len):
            output, hidden = self.decoder(input, hidden, context)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            input = (trg[t] if teacher_force else top1)
        return outputs
```
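A quick smoke test with made-up dimensions (my own sanity check, not part of the tutorial), just to see that the shapes line up:

```python
INPUT_DIM, OUTPUT_DIM = 100, 120   # toy vocabulary sizes
HID_DIM, EMB_DIM = 32, 16
device = torch.device('cpu')

enc = Encoder(INPUT_DIM, HID_DIM, EMB_DIM, dropout=0.5)
dec = Decoder(OUTPUT_DIM, HID_DIM, EMB_DIM, dropout=0.5)
model = Seq2Seq(enc, dec, device).to(device)

src = torch.randint(0, INPUT_DIM, (7, 4))    # [src len, batch size]
trg = torch.randint(0, OUTPUT_DIM, (9, 4))   # [trg len, batch size]
outputs = model(src, trg)
print(outputs.shape)   # torch.Size([9, 4, 120]) = [trg len, batch size, trg vocab size]
```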
The problem with this approach is that the context vector the Decoder receives compresses the entire source sentence into one global summary. When the Decoder needs information about one specific source time step, there is nothing more fine-grained to draw on; it still only has the sentence-level summary. This is what attention addresses.

Given that requirement, the Encoder now has to output the hidden state at every time step, and the Decoder forms a weighted sum over them.

The weights are tied to the Decoder's current time step (that is, for each decoding step, the coefficients over the Encoder hidden states are different). This set of weights is the attention vector, written a.

In other words, the attention vector decides which Encoder time steps to focus on: the weighted sum of the Encoder hidden states with the attention vector gives a weighted source vector $w_t$, which is fed as the context into the RNN and the linear prediction layer. (Note that at the Decoder's first time step, the hidden state fed into the RNN is not $w$ but $h$, the hidden state from the Encoder's last time step.)
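In formula form (my notation): $$w_t = \sum_{i=1}^{T} a_{t,i}\, h_i, \qquad a_{t,i} \ge 0, \quad \sum_{i=1}^{T} a_{t,i} = 1$$ where $h_i$ is the Encoder hidden state at source position $i$, $T$ is the source length, and $a_t$ is the attention vector for decoder step $t$.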
The Encoder also uses a bidirectional RNN. With a bidirectional RNN, each layer has two RNNs:

- a forward RNN that processes the sentence from left to right (green in the figure)
- a backward RNN that processes the sentence from right to left (yellow in the figure)

All we have to do here is set `bidirectional = True` and feed in the embedded sentence.
Since the Decoder is not bidirectional, it only needs a single context vector $z$ to use as its initial hidden state $s_0$, but the Encoder provides two, a forward and a backward one ($z^\rightarrow = h_T^\rightarrow$ and $z^\leftarrow = h_T^\leftarrow$). We resolve this by concatenating the two context vectors, passing them through a linear layer $g$, and applying the $\tanh$ activation: $$z = \tanh(g(h_T^\rightarrow, h_T^\leftarrow)) = \tanh(g(z^\rightarrow, z^\leftarrow)) = s_0$$
Since we want the model to look back over the whole source sentence, we return `outputs`, the stacked forward and backward hidden states for every token in the source sentence. We also return `hidden`, which serves as the initial hidden state in the Decoder.
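A quick standalone shape check (my own toy numbers) of what a bidirectional GRU returns:

```python
import torch
import torch.nn as nn

rnn = nn.GRU(input_size=8, hidden_size=16, bidirectional=True)
x = torch.randn(7, 2, 8)          # [src len, batch size, emb dim]
outputs, hidden = rnn(x)

print(outputs.shape)  # torch.Size([7, 2, 32]): forward and backward states concatenated per position
print(hidden.shape)   # torch.Size([2, 2, 16]): hidden[-2] is the last forward state, hidden[-1] the last backward state
```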
OK, let's implement it and look at the differences. You will find the Decoder is broadly similar.
```python
import torch.nn.functional as F   # needed for F.softmax below


class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        # hidden[-2,:,:] and hidden[-1,:,:] are the final forward and backward states;
        # concatenate them, project with fc and apply tanh
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
        return outputs, hidden


class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super(Attention, self).__init__()
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Parameter(torch.rand(dec_hid_dim))

    def forward(self, hidden, encoder_outputs):
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        energy = energy.permute(0, 2, 1)
        v = self.v.repeat(batch_size, 1).unsqueeze(1)
        attention = torch.bmm(v, energy).squeeze(1)
        return F.softmax(attention, dim=1)


class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super(Decoder, self).__init__()
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.dropout = dropout
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden, encoder_outputs)
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # weighted sum of the encoder states, one vector per batch element
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        assert (output == hidden).all()

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        output = self.out(torch.cat((output, weighted, embedded), dim=1))
        #output = [bsz, output dim]
        return output, hidden.squeeze(0)


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)

        # first input to the decoder is the <sos> tokens
        output = trg[0, :]
        for t in range(1, max_len):
            output, hidden = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (trg[t] if teacher_force else top1)
        return outputs
```
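A small sanity check on the Attention module alone (toy shapes, assuming the classes above): each row of the returned tensor is a probability distribution over the source positions.

```python
ENC_HID, DEC_HID = 16, 20
attn = Attention(ENC_HID, DEC_HID)

hidden = torch.randn(4, DEC_HID)                   # [batch size, dec hid dim]
encoder_outputs = torch.randn(7, 4, ENC_HID * 2)   # [src len, batch size, enc hid dim * 2]

a = attn(hidden, encoder_outputs)
print(a.shape)       # torch.Size([4, 7]): one weight per source position
print(a.sum(dim=1))  # all (approximately) 1: a distribution over the source tokens
```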
Now let's look at how PadMaskAttention differs from the plain Attention model.

Why pack the sequences? Because the source batch is padded to a common length, and without packing the RNN would also run over the padding tokens. Packing makes the GRU process only the real tokens of each sentence, so the final hidden state comes from the last real word rather than from `<pad>`.

The change to the Encoder is in its forward method, which now also receives the source sentence lengths.
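To make this concrete, a standalone round trip through `pack_padded_sequence` (my own toy numbers, not from the tutorial):

```python
import torch
import torch.nn as nn

# two sentences padded to length 5; pretend the last two steps of the second one are padding
rnn = nn.GRU(input_size=4, hidden_size=6)
x = torch.randn(5, 2, 4)           # [src len, batch size, features]
lengths = torch.tensor([5, 3])     # true lengths, sorted in descending order

packed = nn.utils.rnn.pack_padded_sequence(x, lengths)
packed_out, hidden = rnn(packed)   # the GRU never sees the padded steps
out, out_lengths = nn.utils.rnn.pad_packed_sequence(packed_out)

# hidden[:, 1] is the state after step 3 of the short sentence, not after the padding;
# out is re-padded with zeros beyond each sentence's real length
print(out.shape, hidden.shape)     # torch.Size([5, 2, 6]) torch.Size([1, 2, 6])
```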
The attention module still computes attention values over the source sentence, but we now also build a mask so that padded positions receive no attention.

Example: `["hello", "how", "are", "you", "?", <pad>, <pad>]` -> `[1, 1, 1, 1, 1, 0, 0]`.
The mask is applied after computing the attention scores but before normalizing them with the softmax function. It is applied with `masked_fill`, which fills every element of the tensor where the first argument (`mask == 0`) is true with the value given by the second argument (`-1e10`). In other words, it takes the unnormalized attention scores and overwrites the scores at the padded positions with -1e10. Since these values are vanishingly small compared to the others, they become zero after the softmax layer, which ensures the padding tokens in the source sentence receive no attention.
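A standalone illustration of this masking step (the numbers are made up):

```python
import torch
import torch.nn.functional as F

scores = torch.tensor([[0.9, 1.2, 0.3, 0.5, 0.7, 0.1, 0.2]])  # unnormalized scores, 7 source positions
mask = torch.tensor([[1, 1, 1, 1, 1, 0, 0]])                  # 1 = real token, 0 = <pad>

scores = scores.masked_fill(mask == 0, -1e10)
weights = F.softmax(scores, dim=1)
print(weights)  # the last two (padded) positions get probability ~0
```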
The Decoder needs only a small change: it has to accept the mask over the source sentence and pass it on to the attention module. Since we also want to inspect the attention values during inference, the attention tensor is returned as well.
```python
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_len):
        embedded = self.dropout(self.embedding(src))
        # new: pack the padded batch before the RNN. The GRU only runs over the real tokens of
        # each sentence, so the returned hidden state is the state after the last real token,
        # not after the padding characters.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len)
        packed_outputs, hidden = self.rnn(packed_embedded)
        # unpack back to a padded tensor; the second return value holds the sentence lengths
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs)

        # hidden[-2,:,:] and hidden[-1,:,:] are the final forward and backward states,
        # concatenated, projected and passed through tanh
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
        return outputs, hidden
```
```python
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super(Attention, self).__init__()
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Parameter(torch.rand(dec_hid_dim))

    def forward(self, hidden, encoder_outputs, mask):
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        # permute swaps tensor dimensions
        energy = energy.permute(0, 2, 1)
        v = self.v.repeat(batch_size, 1).unsqueeze(1)
        attention = torch.bmm(v, energy).squeeze(1)

        # the only change from before: mask out the scores of the padded positions
        attention = attention.masked_fill(mask == 0, -1e10)

        return F.softmax(attention, dim=1)
```
```python
        # new compared with before: the attention tensor a is returned as well
        return output, hidden.squeeze(0), a.squeeze(0)
```
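For reference, a sketch of the full modified `Decoder.forward` (my own reconstruction, following the attention Decoder shown earlier; the only changes are the extra `mask` argument and the extra return value):

```python
    def forward(self, input, hidden, encoder_outputs, mask):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        # the mask is simply forwarded to the attention module
        a = self.attention(hidden, encoder_outputs, mask)
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        output = self.out(torch.cat((output, weighted, embedded), dim=1))
        return output, hidden.squeeze(0), a.squeeze(0)
```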
```python
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, pad_idx, sos_idx, eos_idx, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.device = device

    def create_mask(self, src):
        mask = (src != self.pad_idx).permute(1, 0)
        return mask

    def forward(self, src, src_len, trg, teacher_forcing_ratio=0.5):
        #src = [src sent len, batch size]
        #src_len = [batch size]
        #trg = [trg sent len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        if trg is None:
            assert teacher_forcing_ratio == 0, "Must be zero during inference"
            inference = True
            trg = torch.zeros((100, src.shape[1])).long().fill_(self.sos_idx).to(src.device)
        else:
            inference = False

        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        #tensor to store decoder outputs
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        #tensor to store attention
        attentions = torch.zeros(max_len, batch_size, src.shape[0]).to(self.device)

        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src, src_len)

        #first input to the decoder is the <sos> tokens
        output = trg[0, :]

        mask = self.create_mask(src)
        #mask = [batch size, src sent len]

        for t in range(1, max_len):
            output, hidden, attention = self.decoder(output, hidden, encoder_outputs, mask)
            outputs[t] = output
            attentions[t] = attention
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (trg[t] if teacher_force else top1)
            if inference and output.item() == self.eos_idx:
                return outputs[:t], attentions[:t]

        return outputs, attentions
```
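Usage note (my own sketch, assuming a trained `model` built from the classes above): during inference we pass `trg=None` with `teacher_forcing_ratio=0`, and the loop stops as soon as the decoder emits `<eos>`.

```python
# src is one numericalised sentence of shape [src len, 1]; src_len is a 1-element length tensor
model.eval()
with torch.no_grad():
    outputs, attentions = model(src, src_len, None, 0)   # trg=None, no teacher forcing
predicted = outputs.argmax(-1)   # [decoded len, 1] token indices, mapped back via the target vocabulary
```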
The English source sentence is encoded (top) and we compute all attention values for the four German target words (center) simultaneously. Our attentions are just dot products between decoder context representations (bottom left) and encoder representations. We add the conditional inputs computed by the attention (center right) to the decoder states which then predict the target words (bottom right). The sigmoid and multiplicative boxes illustrate Gated Linear Units.

- Top left, encoder: stacked convolutions extract features from the source-language (English) sequence; the figure shows just one convolutional layer. The convolution output passes through a GLU activation and becomes the encoder output.
- Bottom left, decoder: stacked convolutions likewise extract features from the target-language (German) sequence, again followed by a GLU activation, giving the decoder output.
- Center left, attention: the decoder and encoder outputs are combined by dot products, giving a weight for every word of the source (English) sequence.
- Center right, residual connection: the attention weights are applied to the source representations and the result is added to the decoder output, which then predicts the output sequence.
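The Gated Linear Unit halves the channel dimension: the convolution produces `2 * hid_dim` channels and GLU gates one half with the sigmoid of the other, giving `hid_dim` channels back. A standalone illustration (toy shapes, not from the tutorial):

```python
import torch
import torch.nn.functional as F

x = torch.randn(4, 2 * 64, 10)   # [batch size, 2 * hid dim, seq len], e.g. a convolution's output
y = F.glu(x, dim=1)              # first half * sigmoid(second half)
print(y.shape)                   # torch.Size([4, 64, 10])
```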
Comparing this structure with the PadPackMaskAttention model above, the differences are quite large; looking ahead, the Transformer starts to share some similarities with it.
Normalization strategy: throughout the network, the sum of a residual block's input and output is multiplied by $\sqrt{0.5}$, which (as in the ConvS2S paper) halves the variance of the sum and keeps activations on a stable scale.
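In formula form (my notation): $$\mathrm{out} = (x + \mathrm{Conv}(x)) \cdot \sqrt{0.5}$$ which is exactly what the `self.scale` factor implements in the code below.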
OK, let's implement it. This one is considerably more involved.
```python
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, device):
        super(Encoder, self).__init__()

        assert kernel_size % 2 == 1, "Kernel size must be odd"

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.kernel_size = kernel_size
        self.dropout = dropout
        self.device = device

        # scale factor for the residual connections (the normalization strategy above)
        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)

        self.tok_embedding = nn.Embedding(input_dim, emb_dim)
        self.pos_embedding = nn.Embedding(100, emb_dim)

        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)

        self.convs = nn.ModuleList([nn.Conv1d(in_channels=hid_dim,
                                              out_channels=2 * hid_dim,
                                              kernel_size=kernel_size,
                                              padding=(kernel_size - 1) // 2)
                                    for _ in range(n_layers)])
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [batch size, src sent len]; embed both the tokens and their positions
        pos = torch.arange(0, src.shape[1]).unsqueeze(0).repeat(src.shape[0], 1).to(self.device)
        tok_embedded = self.tok_embedding(src)
        pos_embedded = self.pos_embedding(pos)

        embedded = self.dropout(tok_embedded + pos_embedded)
        # project the embeddings from emb_dim to hid_dim for the convolutions
        conv_input = self.emb2hid(embedded)
        # Conv1d expects the channel dimension second:
        # conv_input = [batch size, hid dim, src sent len]
        conv_input = conv_input.permute(0, 2, 1)

        for i, conv in enumerate(self.convs):
            conved = conv(self.dropout(conv_input))
            conved = F.glu(conved, dim=1)
            # residual connection, scaled by sqrt(0.5)
            conved = (conved + conv_input) * self.scale
            conv_input = conved

        # conved: top-layer output projected back to emb_dim
        conved = self.hid2emb(conved.permute(0, 2, 1))
        # combined: conved plus the token+position embeddings, again scaled
        combined = (conved + embedded) * self.scale

        return conved, combined
```
```python
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, pad_idx, device):
        super(Decoder, self).__init__()

        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.kernel_size = kernel_size
        self.dropout = dropout
        self.pad_idx = pad_idx
        self.device = device

        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)

        self.tok_embedding = nn.Embedding(output_dim, emb_dim)
        self.pos_embedding = nn.Embedding(100, emb_dim)

        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)

        self.attn_hid2emb = nn.Linear(hid_dim, emb_dim)
        self.attn_emb2hid = nn.Linear(emb_dim, hid_dim)

        self.out = nn.Linear(emb_dim, output_dim)

        # no built-in padding here: the decoder pads manually, on the left only,
        # so a position can never look at the tokens that come after it
        self.convs = nn.ModuleList([nn.Conv1d(hid_dim, 2 * hid_dim, kernel_size)
                                    for _ in range(n_layers)])

        self.dropout = nn.Dropout(dropout)

    def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined):
        # project the conved decoder states back to emb_dim and combine with the embeddings
        conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1))
        combined = (embedded + conved_emb) * self.scale
        # dot products with the encoder output give one score per source position
        energy = torch.matmul(combined, encoder_conved.permute(0, 2, 1))
        attention = F.softmax(energy, dim=2)

        attended_encoding = torch.matmul(attention, (encoder_conved + encoder_combined))
        attended_encoding = self.attn_emb2hid(attended_encoding)

        # residual connection with the decoder conv output
        attended_combined = (conved + attended_encoding.permute(0, 2, 1)) * self.scale

        return attention, attended_combined

    def forward(self, trg, encoder_conved, encoder_combined):
        pos = torch.arange(0, trg.shape[1]).unsqueeze(0).repeat(trg.shape[0], 1).to(self.device)

        tok_embedded = self.tok_embedding(trg)
        pos_embedded = self.pos_embedding(pos)
        #tok_embedded = [batch size, trg sent len, emb dim]
        #pos_embedded = [batch size, trg sent len, emb dim]

        embedded = self.dropout(tok_embedded + pos_embedded)
        conv_input = self.emb2hid(embedded)
        conv_input = conv_input.permute(0, 2, 1)

        for i, conv in enumerate(self.convs):
            conv_input = self.dropout(conv_input)
            # pad on the left so each position only sees itself and earlier positions
            padding = torch.zeros(conv_input.shape[0], conv_input.shape[1],
                                  self.kernel_size - 1).fill_(self.pad_idx).to(self.device)
            padded_conv_input = torch.cat((padding, conv_input), dim=2)

            conved = conv(padded_conv_input)
            conved = F.glu(conved, dim=1)

            attention, conved = self.calculate_attention(embedded, conved,
                                                         encoder_conved, encoder_combined)

            conved = (conved + conv_input) * self.scale
            conv_input = conved

        conved = self.hid2emb(conved.permute(0, 2, 1))
        output = self.out(self.dropout(conved))

        return output, attention
```
```python
    # forward of the Seq2Seq wrapper: no decoding loop is needed,
    # the whole target sentence is processed in parallel during training
    def forward(self, src, trg):
        encoder_conved, encoder_combined = self.encoder(src)
        output, attention = self.decoder(trg, encoder_conved, encoder_combined)
        return output, attention
```
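Assuming the wrapper's constructor follows the same `(encoder, decoder, device)` pattern as the earlier models, a toy shape check (my own made-up numbers):

```python
device = torch.device('cpu')
enc = Encoder(input_dim=100, emb_dim=32, hid_dim=64, n_layers=2,
              kernel_size=3, dropout=0.25, device=device)
dec = Decoder(output_dim=120, emb_dim=32, hid_dim=64, n_layers=2,
              kernel_size=3, dropout=0.25, pad_idx=1, device=device)
model = Seq2Seq(enc, dec, device)

src = torch.randint(0, 100, (4, 7))   # [batch size, src sent len]: this model is batch-first
trg = torch.randint(0, 120, (4, 9))   # [batch size, trg sent len]
output, attention = model(src, trg)
print(output.shape, attention.shape)  # torch.Size([4, 9, 120]) torch.Size([4, 9, 7])
```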