import torch.nn as nn
import torch
import numpy as np
from .attention import MultiHeadAttention  # multi-head attention module
from .module import PositionalEncoding, PositionwiseFeedForward  # positional encoding and position-wise feed-forward network
from .utils import get_non_pad_mask, get_attn_pad_mask  # non-pad mask (zeroes padded frames) and attention mask (hides padded keys)
class Encoder(nn.Module):
"""Encoder of Transformer including self-attention and feed forward.
"""
def __init__(self, d_input=320, n_layers=6, n_head=8, d_k=64, d_v=64,
d_model=512, d_inner=2048, dropout=0.1, pe_maxlen=5000):
super(Encoder, self).__init__()
# parameters
        self.d_input = d_input  # input feature dimension
        self.n_layers = n_layers  # number of encoder layers
        self.n_head = n_head  # number of attention heads
        self.d_k = d_k  # key/query dimension per head
        self.d_v = d_v  # value dimension per head
        self.d_model = d_model  # model (hidden) dimension
        self.d_inner = d_inner  # hidden dimension of the position-wise feed-forward network
        self.dropout_rate = dropout  # dropout probability
        self.pe_maxlen = pe_maxlen  # maximum length of the positional encoding
# use linear transformation with layer norm to replace input embedding
        self.linear_in = nn.Linear(d_input, d_model)  # linear projection from d_input (320) to d_model (512)
        self.layer_norm_in = nn.LayerNorm(d_model)  # layer normalization
        self.positional_encoding = PositionalEncoding(d_model, max_len=pe_maxlen)  # positional encoding
        self.dropout = nn.Dropout(dropout)  # dropout
        self.w_pes1 = nn.Linear(d_model, n_head * d_v)  # projections of the positional encoding used to build the position bias
self.w_pes2 = nn.Linear(d_model, n_head * d_v)
nn.init.normal_(self.w_pes2.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v))) #Initialize weight
nn.init.normal_(self.w_pes1.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))
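        # Xavier-style normal initialization: std = sqrt(2 / (fan_in + fan_out)), using the per-head d_v as fan-out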
        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)])  # stack of n_layers identical encoder layers
        # nn.ModuleList registers each layer's parameters with this module.
def forward(self, padded_input, input_lengths, return_attns=False):
"""
Args:
padded_input: N x T x D
input_lengths: N
Returns:
enc_output: N x T x H
"""
enc_slf_attn_list = []
d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
sz_b, len_v, _ = padded_input.size()
# Prepare masks
        non_pad_mask = get_non_pad_mask(padded_input, input_lengths=input_lengths)  # mask marking real (non-padded) frames
        length = padded_input.size(1)  # padded sequence length T
        slf_attn_mask = get_attn_pad_mask(padded_input, input_lengths, length)  # attention mask hiding padded key positions
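        # Expected mask shapes (assuming the usual get_non_pad_mask / get_attn_pad_mask from .utils):
        #   non_pad_mask:  N x T x 1, 1 for real frames, 0 for padding
        #   slf_attn_mask: N x T x T, nonzero where the key position is padding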
# Forward
# Data processing before entering the encoder
# enc_output = self.dropout(
# self.layer_norm_in(self.linear_in(padded_input)) +
# self.positional_encoding(padded_input))
        # Unlike the commented-out formulation above, the positional encoding is NOT added to the input here;
        # instead it is projected into a per-head position bias that is added to the attention scores
        # inside every encoder layer.
enc_output = self.dropout(self.layer_norm_in(self.linear_in(padded_input)))
pe = self.positional_encoding(padded_input)
pe1 = self.w_pes1(pe).view(sz_b, len_v, n_head, d_v)
pe2 = self.w_pes2(pe).view(sz_b, len_v, n_head, d_v)
pe1 = pe1.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v)
pe2 = pe2.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v)
pe = torch.bmm(pe1, pe2.transpose(1, 2))
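        # pe is now a position-position bias of shape (n_head * N) x T x T; it is passed to every
        # encoder layer and added to the raw attention scores q·kᵀ before scaling and softmax
        # (this assumes PositionalEncoding returns an N x T x d_model tensor, as the view() above requires).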
        for enc_layer in self.layer_stack:  # pass through the stacked encoder layers
            enc_output, enc_slf_attn = enc_layer(
                enc_output, pe,
                non_pad_mask=non_pad_mask,
                slf_attn_mask=slf_attn_mask)  # each layer returns its output and self-attention weights
            if return_attns:  # optionally collect the attention of every layer
                enc_slf_attn_list += [enc_slf_attn]
        if return_attns:  # False by default
            return enc_output, enc_slf_attn_list
        return enc_output,  # note the trailing comma: the last-layer output is returned as a 1-tuple
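
# A minimal usage sketch (shapes only, not part of the original code). It assumes the sibling
# .module and .utils implementations are available and uses the default d_input of 320:
#
#   encoder = Encoder(d_input=320, n_layers=6, n_head=8, d_model=512)
#   feats = torch.randn(4, 100, 320)            # N x T x D
#   lengths = torch.tensor([100, 80, 60, 40])   # N
#   out, = encoder(feats, lengths)              # out: 4 x 100 x 512 (1-tuple return)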
class EncoderLayer(nn.Module):
"""Compose with two sub-layers.
1. A multi-head self-attention mechanism
2. A simple, position-wise fully connected feed-forward network.
"""
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
super(EncoderLayer, self).__init__()
        self.slf_attn = MultiHeadAttention(
            n_head, d_model, d_k, d_v, dropout=dropout)  # multi-head self-attention sub-layer
        self.pos_ffn = PositionwiseFeedForward(
            d_model, d_inner, dropout=dropout)  # position-wise feed-forward sub-layer
    def forward(self, enc_input, pe, non_pad_mask=None, slf_attn_mask=None):
        enc_output, enc_slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, pe, mask=slf_attn_mask)  # self-attention with the shared position bias pe
        enc_output *= non_pad_mask  # zero out padded positions after the attention sub-layer
        enc_output = self.pos_ffn(enc_output)  # position-wise feed-forward sub-layer
        enc_output *= non_pad_mask  # zero out padded positions again
        return enc_output, enc_slf_attn  # output and attention weights of one encoder layer
import numpy as np
import torch
import torch.nn as nn
class MultiHeadAttention(nn.Module):
''' Multi-Head Attention module '''
def __init__(self, n_head, d_model, d_k, d_v , dropout=0.1):
super().__init__()
self.n_head = n_head
self.d_k = d_k
self.d_v = d_v
        self.w_qs = nn.Linear(d_model, n_head * d_k)  # project d_model to n_head * d_k
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)
        nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))  # Xavier-style normal initialization
        nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))
        self.attention = ScaledDotProductAttention(temperature=np.power(2 * d_k, 0.5),
                                                   attn_dropout=dropout)  # scale by sqrt(2 * d_k), presumably because content score and position bias are summed
        self.layer_norm = nn.LayerNorm(d_model)  # layer normalization
        self.fc = nn.Linear(n_head * d_v, d_model)  # project concatenated heads back to d_model
        nn.init.xavier_normal_(self.fc.weight)  # Xavier initialization of the output projection
        self.dropout = nn.Dropout(dropout)  # dropout
    def forward(self, q, k, v, pe, mask=None):
        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
        sz_b, len_q, _ = q.size()  # batch size and query length
sz_b, len_k, _ = k.size()
sz_b, len_v, _ = v.size()
residual = q
        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)  # project and split into n_head heads
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
        # permute() reorders the dimensions; contiguous() copies the tensor into a contiguous
        # memory layout so that view() can merge the head and batch dimensions.
q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk
k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk
v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv
if mask is not None:
mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x ..
        output, attn = self.attention(q, k, v, pe, mask=mask)  # scaled dot-product attention with position bias pe
        output = output.view(n_head, sz_b, len_q, d_v)  # split the merged head/batch dimension
        output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1)  # b x lq x (n*dv)
        output = self.dropout(self.fc(output))  # final linear projection followed by dropout
output = self.layer_norm(output + residual) #Residual connection and normalization
return output, attn
class ScaledDotProductAttention(nn.Module):
''' Scaled Dot-Product Attention '''
def __init__(self, temperature, attn_dropout=0.1):
super().__init__()
self.temperature = temperature
self.dropout = nn.Dropout(attn_dropout)
self.softmax = nn.Softmax(dim=2)
def forward(self, q, k, v, pe, mask=None):
        # content score: batched matrix product q·kᵀ, shape (n*b) x lq x lk
        attn = torch.bmm(q, k.transpose(1, 2))
        # add the position bias computed in the encoder (same shape as the score matrix)
        attn = torch.add(attn, pe)
        attn = attn / self.temperature  # scale by sqrt(2 * d_k)
if mask is not None:
            attn = attn.masked_fill(mask.bool(), -np.inf)  # set padded positions to -inf so softmax gives them zero weight
attn = self.softmax(attn)
attn = self.dropout(attn)
output = torch.bmm(attn, v)
return output, attn
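
# In summary (with B the position bias pe built in the Encoder), each head above computes
#   softmax((Q Kᵀ + B) / sqrt(2 * d_k)) V
# whereas the standard MultiHeadAttention1 / ScaledDotProductAttention1 below compute
#   softmax(Q Kᵀ / sqrt(d_k)) V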
class MultiHeadAttention1(nn.Module):  # standard multi-head attention used by the decoder (no position bias)
''' Multi-Head Attention module '''
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
super().__init__()
self.n_head = n_head
self.d_k = d_k
self.d_v = d_v
self.w_qs = nn.Linear(d_model, n_head * d_k)
self.w_ks = nn.Linear(d_model, n_head * d_k)
self.w_vs = nn.Linear(d_model, n_head * d_v)
nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))
self.attention = ScaledDotProductAttention1(temperature=np.power(d_k, 0.5),
attn_dropout=dropout)
self.layer_norm = nn.LayerNorm(d_model)
self.fc = nn.Linear(n_head * d_v, d_model)
nn.init.xavier_normal_(self.fc.weight)
self.dropout = nn.Dropout(dropout)
def forward(self, q, k, v, mask=None):
d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
sz_b, len_q, _ = q.size()
sz_b, len_k, _ = k.size()
sz_b, len_v, _ = v.size()
residual = q
q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk
k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk
v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv
if mask is not None:
mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x ..
output, attn = self.attention(q, k, v, mask=mask)
output = output.view(n_head, sz_b, len_q, d_v)
output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv)
output = self.dropout(self.fc(output))
output = self.layer_norm(output + residual)
return output, attn
class ScaledDotProductAttention1(nn.Module):
''' Scaled Dot-Product Attention '''
def __init__(self, temperature, attn_dropout=0.1):
super().__init__()
self.temperature = temperature
self.dropout = nn.Dropout(attn_dropout)
self.softmax = nn.Softmax(dim=2)
def forward(self, q, k, v, mask=None):
attn = torch.bmm(q, k.transpose(1, 2))
attn = attn / self.temperature
if mask is not None:
attn = attn.masked_fill(mask.bool(), -np.inf)
attn = self.softmax(attn)
attn = self.dropout(attn)
output = torch.bmm(attn, v)
return output, attn
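
# Minimal smoke test for the standard attention path (a sketch; the tensor sizes are
# illustrative and not part of the original code):
if __name__ == "__main__":
    torch.manual_seed(0)
    mha = MultiHeadAttention1(n_head=8, d_model=512, d_k=64, d_v=64)
    x = torch.randn(2, 10, 512)                      # N x T x d_model
    mask = torch.zeros(2, 10, 10, dtype=torch.bool)  # no padded positions
    out, attn = mha(x, x, x, mask=mask)
    print(out.shape, attn.shape)                     # (2, 10, 512) and (16, 10, 10)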