are fixed values generated by sin/cos functions, whereas in this implementation they are randomly initialized and trainable, just like ordinary word embeddings. The author presumably chose this because BERT's training corpus is much larger than the one used in the Transformer paper, so the model can simply learn the positions on its own.
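For contrast, here is a minimal numpy sketch (not from the BERT code; the function name is made up) of the fixed sinusoidal encoding used in the original Transformer paper:

import numpy as np

def sinusoidal_position_encoding(max_len, d_model):
    # PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
    # PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
    pos = np.arange(max_len)[:, None]          # [max_len, 1]
    i = np.arange(d_model)[None, :]            # [1, d_model]
    angle = pos / np.power(10000.0, (2 * (i // 2)) / d_model)
    pe = np.zeros((max_len, d_model))
    pe[:, 0::2] = np.sin(angle[:, 0::2])       # even dimensions use sin
    pe[:, 1::2] = np.cos(angle[:, 1::2])       # odd dimensions use cos
    return pe                                   # fixed, never trained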
def embedding_postprocessor(input_tensor,  # [batch_size, seq_length, embedding_size]
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,  # usually 2
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,  # maximum position; must be >= max_seq_len
                            dropout_prob=0.1):
  input_shape = get_shape_list(input_tensor, expected_rank=3)  # [batch_size, seq_length, embedding_size]
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  # Segment (token type) information
  if use_token_type:
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if "
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))
    # The token type vocabulary is small, so a one-hot matmul is used here
    # to speed up the embedding lookup.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  # Position embedding information
  if use_position_embeddings:
    # Make sure seq_length <= max_position_embeddings
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range))
      # The position embedding is a learned variable of shape
      # [max_position_embeddings, width], but the actual input sequence is
      # usually shorter than max_position_embeddings, so tf.slice takes only
      # the first seq_length rows to speed up training.
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])
      num_dims = len(output.shape.as_list())
      # The tensor after word embedding is [batch_size, seq_length, width].
      # Position encodings are independent of the input content, so their
      # shape is always [seq_length, width]; they cannot be added to the word
      # embeddings directly. We therefore reshape them to
      # [1, seq_length, width] so broadcasting can handle the addition.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output
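A minimal, hypothetical call just to illustrate the shapes (the tensor values are made up; get_shape_list, create_initializer and layer_norm_and_dropout from modeling.py are assumed to be in scope):

word_embeddings = tf.random_normal([8, 128, 768])   # pretend output of embedding_lookup
segment_ids = tf.zeros([8, 128], dtype=tf.int32)     # every token in segment 0
embeddings = embedding_postprocessor(
    input_tensor=word_embeddings,
    use_token_type=True,
    token_type_ids=segment_ids,
    token_type_vocab_size=2,
    max_position_embeddings=512,
    dropout_prob=0.1)
# embeddings: [8, 128, 768] = word + segment + position, then layer norm + dropout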
4. Building the attention_mask
This part of the code builds the attention_mask that defines each token's attention scope. Every sample has been padded, and during self-attention no position should attend to the padded positions.
The inputs are the padded input_ids of shape [batch_size, from_seq_length, ...] and the mask vector to_mask of shape [batch_size, to_seq_length].
def create_attention_mask_from_input_mask(from_tensor, to_mask):
  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  batch_size = from_shape[0]
  from_seq_length = from_shape[1]

  to_shape = get_shape_list(to_mask, expected_rank=2)
  to_seq_length = to_shape[1]

  to_mask = tf.cast(
      tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)

  broadcast_ones = tf.ones(
      shape=[batch_size, from_seq_length, 1], dtype=tf.float32)

  mask = broadcast_ones * to_mask
  return mask
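For intuition, a tiny worked example (values computed by hand, batch_size=1, seq_length=3, last token padded):

# input_ids  = [[31, 51, 0]]
# input_mask = [[1, 1, 0]]
# create_attention_mask_from_input_mask(input_ids, input_mask) returns
#   [[[1., 1., 0.],
#     [1., 1., 0.],
#     [1., 1., 0.]]]          # shape [1, 3, 3]
# Every query position may attend to tokens 0 and 1, never to the padded position 2.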
5. The attention layer
This part of the code implements multi-head attention, following the paper "Attention is All You Need". It is a key-query-value style attention: the input from_tensor serves as the query and to_tensor serves as the key and value; when the two are the same tensor, this is self-attention. For a more detailed introduction to attention, see the companion article 理解Attention機制原理及模型.
def attention_layer(from_tensor,   # [batch_size, from_seq_length, from_width]
                    to_tensor,     # [batch_size, to_seq_length, to_width]
                    attention_mask=None,  # [batch_size, from_seq_length, to_seq_length]
                    num_attention_heads=1,   # number of attention heads
                    size_per_head=512,       # size of each head
                    query_act=None,          # activation for the query transform
                    key_act=None,            # activation for the key transform
                    value_act=None,          # activation for the value transform
                    attention_probs_dropout_prob=0.0,  # dropout on the attention probabilities
                    initializer_range=0.02,  # initializer range
                    do_return_2d_tensor=False,
                    # If True, the output shape is [batch_size*from_seq_length, num_attention_heads*size_per_head].
                    # If False, the output shape is [batch_size, from_seq_length, num_attention_heads*size_per_head].
                    batch_size=None,
                    # If the inputs are 3D, the batch is the first dimension; but the 3D
                    # tensors may already have been flattened to 2D, in which case
                    # batch_size, from_seq_length and to_seq_length must be passed in.
                    from_seq_length=None,
                    to_seq_length=None):

  def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                           seq_length, width):
    output_tensor = tf.reshape(
        input_tensor, [batch_size, seq_length, num_attention_heads, width])
    # [batch_size, num_attention_heads, seq_length, width]
    output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
    return output_tensor

  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

  if len(from_shape) != len(to_shape):
    raise ValueError(
        "The rank of `from_tensor` must match the rank of `to_tensor`.")

  if len(from_shape) == 3:
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:
    if (batch_size is None or from_seq_length is None or to_seq_length is None):
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
          "must all be specified.")

  # Shorthand used in the shape comments below:
  #   B = batch size (number of sequences)
  #   F = `from_tensor` sequence length
  #   T = `to_tensor` sequence length
  #   N = `num_attention_heads`
  #   H = `size_per_head`

  # Flatten from_tensor and to_tensor into 2D matrices
  from_tensor_2d = reshape_to_matrix(from_tensor)  # [B*F, hidden_size]
  to_tensor_2d = reshape_to_matrix(to_tensor)      # [B*T, hidden_size]

  # Project from_tensor through a dense layer to get the queries
  # `query_layer` = [B*F, N*H]
  query_layer = tf.layers.dense(
      from_tensor_2d,
      num_attention_heads * size_per_head,
      activation=query_act,
      name="query",
      kernel_initializer=create_initializer(initializer_range))

  # Project to_tensor through a dense layer to get the keys
  # `key_layer` = [B*T, N*H]
  key_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=key_act,
      name="key",
      kernel_initializer=create_initializer(initializer_range))

  # Same for the values
  # `value_layer` = [B*T, N*H]
  value_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=value_act,
      name="value",
      kernel_initializer=create_initializer(initializer_range))

  # Split the queries into heads: [B*F, N*H] ==> [B, F, N, H] ==> [B, N, F, H]
  query_layer = transpose_for_scores(query_layer, batch_size,
                                     num_attention_heads, from_seq_length,
                                     size_per_head)

  # Split the keys into heads: [B*T, N*H] ==> [B, T, N, H] ==> [B, N, T, H]
  key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                   to_seq_length, size_per_head)

  # Dot-product the queries with the keys, then scale by 1/sqrt(size_per_head)
  # (see the original paper for the formula).
  # `attention_scores` = [B, N, F, T]
  attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / math.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # `attention_mask` = [B, 1, F, T]
    attention_mask = tf.expand_dims(attention_mask, axis=[1])
    # Where attention_mask is 1, adder is (1 - 1) * -10000 = 0.
    # Where attention_mask is 0, adder is (1 - 0) * -10000 = -10000.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
    # The attention scores are never very large, so adding -10000 makes the
    # masked positions effectively negative infinity.
    attention_scores += adder

  # Negative infinity becomes 0 after softmax, so masked positions receive no
  # attention weight at all.
  # `attention_probs` = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  # Dropout on the attention probabilities. This may look odd, but it is
  # exactly what the original Transformer paper does.
  attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

  # `value_layer` = [B, T, N, H]
  value_layer = tf.reshape(
      value_layer,
      [batch_size, to_seq_length, num_attention_heads, size_per_head])

  # `value_layer` = [B, N, T, H]
  value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

  # `context_layer` = [B, N, F, H]
  context_layer = tf.matmul(attention_probs, value_layer)

  # `context_layer` = [B, F, N, H]
  context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

  if do_return_2d_tensor:
    # `context_layer` = [B*F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size * from_seq_length, num_attention_heads * size_per_head])
  else:
    # `context_layer` = [B, F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size, from_seq_length, num_attention_heads * size_per_head])

  return context_layer
To summarize, the attention layer proceeds as follows:
Validate the shapes of the input tensors and extract batch_size, from_seq_length and to_seq_length.
If the inputs are 3D tensors, flatten them into 2D matrices.
Use from_tensor as the query and to_tensor as the key and value; a dense layer on each produces query_layer, key_layer and value_layer.
Reshape these tensors into multi-head form with transpose_for_scores.
Compute attention_scores and attention_probs following the formula in the paper (note the attention_mask trick; see the sketch after this list).
Multiply attention_probs by the values and return a 2D or 3D tensor.
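As a standalone illustration of the score computation and the -10000 mask trick described above, here is a toy single-head sketch in plain numpy (not from the repo; names and sizes are made up):

import numpy as np

def toy_attention(q, k, v, mask):
    # q: [F, H], k: [T, H], v: [T, H], mask: [F, T] with 1 = attend, 0 = ignore
    size_per_head = q.shape[-1]
    scores = q @ k.T / np.sqrt(size_per_head)     # scaled dot product, [F, T]
    scores += (1.0 - mask) * -10000.0             # masked positions become ~ -inf
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)    # softmax over the T axis
    return probs @ v                               # weighted sum of values, [F, H]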
6. Transformer
The next piece of code is the famous core of the Transformer. It can be read as a reimplementation of the original "Attention is All You Need" code; see the original paper and the original code for reference.
def transformer_model(input_tensor,          # [batch_size, seq_length, hidden_size]
                      attention_mask=None,   # [batch_size, seq_length, seq_length]
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,  # activation of the feed-forward layer
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
  # The final output must have hidden_size units. There are num_attention_heads
  # heads, each with size_per_head hidden units, so
  # hidden_size = num_attention_heads * size_per_head.
  if hidden_size % num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = int(hidden_size / num_attention_heads)
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  input_width = input_shape[2]

  # The encoder uses residual connections, so the input width must match
  # hidden_size.
  if input_width != hidden_size:
    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                     (input_width, hidden_size))

  # Reshapes are cheap on CPU/GPU but unfriendly on TPU, so to avoid constantly
  # converting between 2D and 3D, all 3D tensors are represented as 2D matrices.
  prev_output = reshape_to_matrix(input_tensor)

  all_layer_outputs = []
  for layer_idx in range(num_hidden_layers):
    with tf.variable_scope("layer_%d" % layer_idx):
      layer_input = prev_output

      with tf.variable_scope("attention"):
        # multi-head attention
        attention_heads = []
        with tf.variable_scope("self"):
          # self-attention
          attention_head = attention_layer(
              from_tensor=layer_input,
              to_tensor=layer_input,
              attention_mask=attention_mask,
              num_attention_heads=num_attention_heads,
              size_per_head=attention_head_size,
              attention_probs_dropout_prob=attention_probs_dropout_prob,
              initializer_range=initializer_range,
              do_return_2d_tensor=True,
              batch_size=batch_size,
              from_seq_length=seq_length,
              to_seq_length=seq_length)
          attention_heads.append(attention_head)

        attention_output = None
        if len(attention_heads) == 1:
          attention_output = attention_heads[0]
        else:
          # If there are multiple attention heads, concatenate them.
          attention_output = tf.concat(attention_heads, axis=-1)

        # Linearly project the attention output back to the input shape,
        # then dropout + residual + layer norm.
        with tf.variable_scope("output"):
          attention_output = tf.layers.dense(
              attention_output,
              hidden_size,
              kernel_initializer=create_initializer(initializer_range))
          attention_output = dropout(attention_output, hidden_dropout_prob)
          attention_output = layer_norm(attention_output + layer_input)

      # feed-forward
      with tf.variable_scope("intermediate"):
        intermediate_output = tf.layers.dense(
            attention_output,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=create_initializer(initializer_range))

      # Project the feed-forward output back down to hidden_size,
      # then dropout + residual + layer norm.
      with tf.variable_scope("output"):
        layer_output = tf.layers.dense(
            intermediate_output,
            hidden_size,
            kernel_initializer=create_initializer(initializer_range))
        layer_output = dropout(layer_output, hidden_dropout_prob)
        layer_output = layer_norm(layer_output + attention_output)
        prev_output = layer_output
        all_layer_outputs.append(layer_output)

  if do_return_all_layers:
    final_outputs = []
    for layer_output in all_layer_outputs:
      final_output = reshape_from_matrix(layer_output, input_shape)
      final_outputs.append(final_output)
    return final_outputs
  else:
    final_output = reshape_from_matrix(prev_output, input_shape)
    return final_output
This is best read alongside the architecture diagram from the Transformer paper (the figure in the original article); since BERT uses only the encoder, the decoder never makes an appearance here.
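A hypothetical standalone call, just to show the interface (the shapes are made up, and the helpers from modeling.py are assumed to be in scope):

embeddings = tf.random_normal([2, 16, 768])   # [batch_size, seq_length, hidden_size]
attn_mask = tf.ones([2, 16, 16])              # allow every position to attend everywhere
all_layers = transformer_model(
    input_tensor=embeddings,
    attention_mask=attn_mask,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    do_return_all_layers=True)
sequence_output = all_layers[-1]              # [2, 16, 768], output of the last block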
7. Entry point (__init__)
This is the constructor of the BertModel class. With the building blocks from the previous sections in place, we can now assemble the BERT model.
def __init__(self,
             config,                        # a BertConfig instance
             is_training,
             input_ids,                     # [batch_size, seq_length]
             input_mask=None,               # [batch_size, seq_length]
             token_type_ids=None,           # [batch_size, seq_length]
             use_one_hot_embeddings=False,  # use one-hot embeddings, otherwise tf.gather()
             scope=None):
  config = copy.deepcopy(config)
  if not is_training:
    config.hidden_dropout_prob = 0.0
    config.attention_probs_dropout_prob = 0.0

  input_shape = get_shape_list(input_ids, expected_rank=2)
  batch_size = input_shape[0]
  seq_length = input_shape[1]

  # If no mask is given, nothing is masked, i.e. all elements are 1.
  if input_mask is None:
    input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

  if token_type_ids is None:
    token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

  with tf.variable_scope(scope, default_name="bert"):
    with tf.variable_scope("embeddings"):
      # word embedding
      (self.embedding_output, self.embedding_table) = embedding_lookup(
          input_ids=input_ids,
          vocab_size=config.vocab_size,
          embedding_size=config.hidden_size,
          initializer_range=config.initializer_range,
          word_embedding_name="word_embeddings",
          use_one_hot_embeddings=use_one_hot_embeddings)

      # Add the position embeddings and segment embeddings,
      # then layer norm + dropout.
      self.embedding_output = embedding_postprocessor(
          input_tensor=self.embedding_output,
          use_token_type=True,
          token_type_ids=token_type_ids,
          token_type_vocab_size=config.type_vocab_size,
          token_type_embedding_name="token_type_embeddings",
          use_position_embeddings=True,
          position_embedding_name="position_embeddings",
          initializer_range=config.initializer_range,
          max_position_embeddings=config.max_position_embeddings,
          dropout_prob=config.hidden_dropout_prob)

    with tf.variable_scope("encoder"):
      # input_ids are padded word ids, e.g. [25, 120, 34, 0, 0]
      # input_mask marks the valid tokens, e.g. [1, 1, 1, 0, 0]
      attention_mask = create_attention_mask_from_input_mask(
          input_ids, input_mask)

      # Stack of transformer blocks.
      # `sequence_output` shape = [batch_size, seq_length, hidden_size].
      self.all_encoder_layers = transformer_model(
          input_tensor=self.embedding_output,
          attention_mask=attention_mask,
          hidden_size=config.hidden_size,
          num_hidden_layers=config.num_hidden_layers,
          num_attention_heads=config.num_attention_heads,
          intermediate_size=config.intermediate_size,
          intermediate_act_fn=get_activation(config.hidden_act),
          hidden_dropout_prob=config.hidden_dropout_prob,
          attention_probs_dropout_prob=config.attention_probs_dropout_prob,
          initializer_range=config.initializer_range,
          do_return_all_layers=True)

    # `self.sequence_output` is the output of the last layer,
    # shape [batch_size, seq_length, hidden_size].
    self.sequence_output = self.all_encoder_layers[-1]

    # The "pooler" converts the encoder output
    # [batch_size, seq_length, hidden_size] into [batch_size, hidden_size].
    with tf.variable_scope("pooler"):
      # Take the tensor of the first token ([CLS]) of the last layer;
      # this matters a lot for classification tasks.
      # sequence_output[:, 0:1, :] has shape [batch_size, 1, hidden_size],
      # so we squeeze away the second dimension.
      first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
      # Then add one more dense layer; the output is still
      # [batch_size, hidden_size].
      self.pooled_output = tf.layers.dense(
          first_token_tensor,
          config.hidden_size,
          activation=tf.tanh,
          kernel_initializer=create_initializer(config.initializer_range))
Wrapping up
With this deeper understanding of the source code, using BertModel becomes much more straightforward. Here is a simple example of using the model:
# Assume the input has already been tokenized into word_ids. shape = [2, 3]
input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
# Segment embedding ids. For the first sample, the first two tokens belong to
# sentence 1 and the last token to sentence 2. For the second sample, the first
# token belongs to sentence 1, the second has token type 2, and the trailing 0
# is padding.
token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])

# Create a BertConfig instance
# (hidden_size must be divisible by num_attention_heads, so 8 heads are used here)
config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
                             num_hidden_layers=8, num_attention_heads=8,
                             intermediate_size=1024)

# Create a BertModel instance
model = modeling.BertModel(config=config, is_training=True,
                           input_ids=input_ids, input_mask=input_mask,
                           token_type_ids=token_type_ids)

label_embeddings = tf.get_variable(...)
# The first token ([CLS]) of the last layer can be viewed as a sentence embedding
pooled_output = model.get_pooled_output()
logits = tf.matmul(pooled_output, label_embeddings)
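For token-level tasks (sequence labelling, question answering and so on) you would use the per-token output instead of the pooled one; BertModel exposes it via get_sequence_output():

sequence_output = model.get_sequence_output()   # [batch_size, seq_length, hidden_size]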
The main flow of building the BERT model:
Embed the input sequence (three embeddings summed together); everything after that is "Attention is All You Need".
In short: feed the embeddings into the Transformer and take its output.
In a bit more detail: embedding --> N * [multi-head attention --> Add (residual) & Norm --> Feed-Forward --> Add (residual) & Norm].
See, not that hard, right?
There are a few other helper functions in the source code; they are easy to follow, so I won't go through them here.
References
bert repository
谷歌BERT預訓練源碼解析(一):訓練數據生成 (Google BERT pre-training source code walkthrough, part 1: generating the training data)
Bert系列(一)——demo運行 (BERT series, part 1: running the demo)
That's all~