PyTorch Learning Notes - Comparing torchtext and PyTorch Examples
0. Introduction to the PyTorch Seq2Seq Project
After finishing the basics of torchtext, I found this tutorial: "Understanding and Implementing Seq2Seq Models with PyTorch and torchtext".
The project consists of six sub-projects:
1. Seq2Seq with neural networks
2. Learning phrase representations with an RNN encoder-decoder for statistical machine translation
3. Jointly learning to align and translate for NMT
4. Packed padded sequences, masking, and inference
5. Convolutional Seq2Seq
6. Transformer
After finishing the Transformer I took two days off from studying. Over these two days I want to compare and summarize the models. I already wrote a summary after completing the first three; today the focus is on how the six models change and how they are implemented. Implementation is the key part: it took 15 days, yet the only implementation I can really follow is the vanilla Seq2Seq…
7. Summary: From Vanilla Seq2Seq to the Transformer
All six models are Seq2Seq: each has an Encoder and a Decoder. What differs is the core of each model, with something new added between layers or between the Encoder and Decoder at every step: LSTM -> multi-layer GRU -> Attention -> Attention with packing, padding and masking -> CNN -> Transformer
- Models 1 and 2 are vanilla Seq2Seq, using an LSTM and its variant, the GRU
- Models 3 and 4 build on attention, adding packing, padding, and masking
- Model 5 uses a CNN
- Model 6 is all-attention; every fancy component gets thrown in
7.5 Comparing Models 5 and 6
<center class="half">
<img src="https://upload-images.jianshu.io/upload_images/14340919-3cd19da0da351933.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240" width="600"/><img src="https://upload-images.jianshu.io/upload_images/14340919-9e7d518eea914b5c.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240" width="600"/>
</center>
This is the last model. Comparing the two, the biggest difference is that the Transformer is an all-attention architecture that drops RNNs and CNNs entirely. As Zhang Junlin's earlier analysis argued, it beats both CNNs and RNNs in semantic feature extraction, long-range feature capture, overall task feature extraction, parallelism, and runtime efficiency. For long texts, Transformer-XL tackles exactly the problem of very long inputs; there is a PyTorch-based project for it worth a look.
The overall structures still share some similarities. In CNN Seq2Seq, the Encoder and Decoder extract features with stacked convolutions, use GLU as the activation function, compute the attention in the middle as a dot product, and finish with residual connections: the attention weights are multiplied with the input sequence and added to the decoder output to produce the output sequence. The Transformer instead replaces the CNN with multi-head self-attention in both the Encoder and the Decoder, uses a padding mask and a sequence mask to handle the input sequences, and wraps each sub-layer with layer normalization and a residual connection. At the junction between Encoder and Decoder it uses multi-head context-attention (encoder-decoder attention).
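To make the contrast concrete, here is a minimal sketch of the convolutional building block described above: a 1D convolution followed by a GLU activation and a residual connection. The class name ConvGLUBlock and the dimensions are made up for illustration; this is not the tutorial's actual CNN Seq2Seq code, which also scales the residual and adds the attention step.

import torch.nn as nn
import torch.nn.functional as F

class ConvGLUBlock(nn.Module):
    # one convolutional block in the spirit of CNN Seq2Seq: Conv1d -> GLU -> residual connection
    def __init__(self, hid_dim, kernel_size=3):
        super().__init__()
        # 2 * hid_dim output channels, because GLU halves the channel dimension again
        self.conv = nn.Conv1d(hid_dim, 2 * hid_dim, kernel_size, padding=(kernel_size - 1) // 2)
    def forward(self, x):
        # x = [batch size, hid dim, sent len]
        conved = F.glu(self.conv(x), dim=1)   # GLU: A * sigmoid(B), channels back to hid_dim
        return conved + x                     # residual connection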
As usual, let's implement it; this time, the Transformer.
import torch
import torch.nn as nn
import torch.nn.functional as F

class Encoder(nn.Module):
    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, encoder_layer,
                 self_attention, positionwise_feedforward, dropout, device):
        super(Encoder, self).__init__()
self.input_dim=input_dim
self.hid_dim=hid_dim
self.n_layers=n_layers
self.n_heads=n_heads
self.pf_dim=pf_dim
self.encoder_layer=encoder_layer
self.self_attention=self_attention
self.positionwise_feedforward=positionwise_feedforward
self.dropout=dropout
self.device=device
        self.tok_embedding=nn.Embedding(input_dim, hid_dim)
        # nn.Embedding(1000, hid_dim) is just a 1000 x hid_dim lookup matrix: 1000 is the number of entries
        # (here the maximum number of positions) and hid_dim is the embedding dimension, so positions get
        # learned embeddings rather than the fixed sinusoidal encoding of the original paper.
        self.pos_embedding=nn.Embedding(1000, hid_dim)
self.layers=nn.ModuleList([encoder_layer(hid_dim,n_heads,pf_dim,self_attention,
positionwise_feedforward,dropout,device)
for _ in range(n_layers)])
self.do=nn.Dropout(dropout)
self.scale=torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
    # In addition to src, forward now also takes src_mask, i.e. the padding mask
    def forward(self, src, src_mask):
        pos=torch.arange(0, src.shape[1]).unsqueeze(0).repeat(src.shape[0], 1).to(self.device)
        # scale the token embeddings, add the positional embeddings, then apply dropout
        src=self.do((self.tok_embedding(src)*self.scale)+self.pos_embedding(pos))
for layer in self.layers:
src=layer(src,src_mask)
return src
# EncoderLayer: self-attention plus position-wise feedforward, each wrapped in a residual connection and layer normalization
class EncoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device):
super(EncoderLayer,self).__init__()
self.ln=nn.LayerNorm(hid_dim)
self.sa=self_attention(hid_dim,n_heads,dropout,device)
        self.pf=positionwise_feedforward(hid_dim, pf_dim, dropout)
self.do=nn.Dropout(dropout)
def forward(self, src, src_mask):
src=self.ln(src+self.do(self.sa(src,src,src,src_mask)))
src=self.ln(src+self.do(self.pf(src)))
return src
# self-attention
class SelfAttention(nn.Module):
def __init__(self, hid_dim, n_heads, dropout, device):
super(SelfAttention,self).__init__()
self.hid_dim=hid_dim
self.n_heads=n_heads
assert hid_dim%n_heads==0
self.w_q=nn.Linear(hid_dim,hid_dim)
self.w_k=nn.Linear(hid_dim, hid_dim)
self.w_v=nn.Linear(hid_dim, hid_dim)
self.fc=nn.Linear(hid_dim,hid_dim)
self.do=nn.Dropout(dropout)
self.scale=torch.sqrt(torch.FloatTensor([hid_dim//n_heads])).to(device)
def forward(self, query, key, value, mask=None):
bsz=query.shape[0]
#query = key = value [batch size, sent len, hid dim]
Q=self.w_q(query)
K=self.w_k(key)
V=self.w_v(value)
#Q, K, V = [batch size, sent len, hid dim]
Q = Q.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)
K = K.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)
V = V.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)
#Q, K, V = [batch size, n heads, sent len, hid dim // n heads]
        # attention scores: Q @ K^T / sqrt(head_dim), where head_dim = hid_dim // n_heads
energy=torch.matmul(Q,K.permute(0,1,3,2))/self.scale
#energy = [batch size, n heads, sent len, sent len]
if mask is not None:
energy=energy.masked_fill(mask==0, -1e10)
        # softmax over the last (key) dimension, followed by dropout
attention=self.do(F.softmax(energy, dim=-1))
#attention = [batch size, n heads, sent len, sent len]
x=torch.matmul(attention,V)
#x = [batch size, n heads, sent len, hid dim // n heads]
x=x.permute(0,2,1,3).contiguous()
#x = [batch size, sent len, n heads, hid dim // n heads]
x=x.view(bsz, -1, self.n_heads*(self.hid_dim//self.n_heads))
#x = [batch size, src sent len, hid dim]
x=self.fc(x)
return x
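# A quick sanity check of the multi-head attention shapes, with hypothetical sizes (hid_dim=512, n_heads=8):
#   sa = SelfAttention(512, 8, 0.1, 'cpu')
#   x = torch.rand(2, 7, 512)        # [batch size, sent len, hid dim]
#   sa(x, x, x).shape                # torch.Size([2, 7, 512]); each of the 8 heads attends over 64-dim slices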
class PositionwiseFeedforward(nn.Module):
def __init__(self, hid_dim, pf_dim, dropout):
super(PositionwiseFeedforward,self).__init__()
self.hid_dim=hid_dim
self.pf_dim=pf_dim
self.fc_1=nn.Conv1d(hid_dim,pf_dim,1)
self.fc_2=nn.Conv1d(pf_dim, hid_dim, 1)
self.do=nn.Dropout(dropout)
def forward(self,x):
#x = [batch size, sent len, hid dim]
x = x.permute(0, 2, 1)
#x = [batch size, hid dim, sent len]
x = self.do(F.relu(self.fc_1(x)))
#x = [batch size, ff dim, sent len]
x = self.fc_2(x)
#x = [batch size, hid dim, sent len]
x = x.permute(0, 2, 1)
#x = [batch size, sent len, hid dim]
return x
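# Note: a Conv1d with kernel_size=1 acts as a position-wise Linear layer, so an equivalent (hypothetical)
# formulation would be fc_1=nn.Linear(hid_dim, pf_dim) and fc_2=nn.Linear(pf_dim, hid_dim) applied directly
# to [batch size, sent len, hid dim], which avoids the two permutes.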
class Decoder(nn.Module):
def __init__(self, output_dim, hid_dim,n_layers,n_heads,pf_dim,decoder_layer,self_attention,positionwise_feedforward,dropout,device):
super(Decoder,self).__init__()
self.output_dim=output_dim
self.hid_dim=hid_dim
self.n_layers=n_layers
self.n_heads = n_heads
self.pf_dim = pf_dim
self.decoder_layer = decoder_layer
self.self_attention = self_attention
self.positionwise_feedforward = positionwise_feedforward
self.dropout = dropout
self.device = device
self.tok_embedding=nn.Embedding(output_dim, hid_dim)
self.pos_embedding=nn.Embedding(1000,hid_dim)
self.layers=nn.ModuleList([decoder_layer(hid_dim,n_heads,pf_dim,self_attention,positionwise_feedforward,dropout,device) for _ in range(n_layers)])
self.fc=nn.Linear(hid_dim, output_dim)
self.do=nn.Dropout(dropout)
self.scale=torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
def forward(self, trg, src, trg_mask, src_mask):
#trg = [batch_size, trg sent len]
#src = [batch_size, src sent len]
#trg_mask = [batch size, trg sent len]
#src_mask = [batch size, src sent len]
pos=torch.arange(0, trg.shape[1]).unsqueeze(0).repeat(trg.shape[0], 1).to(self.device)
trg=self.do((self.tok_embedding(trg)*self.scale)+self.pos_embedding(pos))
for layer in self.layers:
trg=layer(trg,src,trg_mask,src_mask)
return self.fc(trg)
# DecoderLayer: note that trg and src get different masks; the difference is made concrete in Seq2Seq.make_masks below
class DecoderLayer(nn.Module):
def __init__(self, hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device):
super().__init__()
self.ln = nn.LayerNorm(hid_dim)
self.sa = self_attention(hid_dim, n_heads, dropout, device)
self.ea = self_attention(hid_dim, n_heads, dropout, device)
self.pf = positionwise_feedforward(hid_dim, pf_dim, dropout)
self.do = nn.Dropout(dropout)
def forward(self, trg, src, trg_mask, src_mask):
#trg = [batch size, trg sent len, hid dim]
#src = [batch size, src sent len, hid dim]
#trg_mask = [batch size, trg sent len]
#src_mask = [batch size, src sent len]
trg = self.ln(trg + self.do(self.sa(trg, trg, trg, trg_mask)))
trg = self.ln(trg + self.do(self.ea(trg, src, src, src_mask)))
trg = self.ln(trg + self.do(self.pf(trg)))
return trg
class Seq2Seq(nn.Module):
def __init__(self, encoder, decoder, pad_idx, device):
super().__init__()
self.encoder = encoder
self.decoder = decoder
self.pad_idx = pad_idx
self.device = device
    # mask construction: a padding mask for src, a padding mask plus a look-ahead (subsequent) mask for trg
def make_masks(self, src, trg):
#src = [batch size, src sent len]
#trg = [batch size, trg sent len]
src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)
trg_pad_mask = (trg != self.pad_idx).unsqueeze(1).unsqueeze(3)
trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()
trg_mask = trg_pad_mask & trg_sub_mask
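        # Resulting shapes:
        #   src_mask = [batch size, 1, 1, src sent len]                (padding positions -> 0)
        #   trg_mask = [batch size, 1, trg sent len, trg sent len]     (padding mask AND the lower-triangular
        #                                                               mask that hides future target tokens)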
return src_mask, trg_mask
def forward(self, src, trg):
#src = [batch size, src sent len]
#trg = [batch size, trg sent len]
src_mask, trg_mask = self.make_masks(src, trg)
enc_src = self.encoder(src, src_mask)
#enc_src = [batch size, src sent len, hid dim]
out = self.decoder(trg, enc_src, trg_mask, src_mask)
#out = [batch size, trg sent len, output dim]
return out
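To see how the pieces fit together, here is a minimal usage sketch. All hyperparameters, the pad index, and the batch shapes below are hypothetical placeholders rather than the tutorial's actual values.

INPUT_DIM, OUTPUT_DIM = 7855, 5893
HID_DIM, N_LAYERS, N_HEADS, PF_DIM, DROPOUT = 512, 6, 8, 2048, 0.1
PAD_IDX = 1
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

enc = Encoder(INPUT_DIM, HID_DIM, N_LAYERS, N_HEADS, PF_DIM, EncoderLayer,
              SelfAttention, PositionwiseFeedforward, DROPOUT, device)
dec = Decoder(OUTPUT_DIM, HID_DIM, N_LAYERS, N_HEADS, PF_DIM, DecoderLayer,
              SelfAttention, PositionwiseFeedforward, DROPOUT, device)
model = Seq2Seq(enc, dec, PAD_IDX, device).to(device)

# a fake batch of token indices, shape = [batch size, sent len]
src = torch.randint(2, INPUT_DIM, (4, 10)).to(device)
trg = torch.randint(2, OUTPUT_DIM, (4, 12)).to(device)
out = model(src, trg)
print(out.shape)   # torch.Size([4, 12, OUTPUT_DIM])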
That wraps up this part: six models in 16 days, learning torchtext along the way, moving the environment from local to Colab, and getting the Transformer model to run.
But all of this is other people's work; how do I digest it and make it my own?
The next goal is the 2017 paper "Deep Context Model for Grammatical Error Correction". This road feels pretty lonely: no group meetings, no team, just me working on my own…