FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('data_path', '', 'Path expression to tf.Example.')
tf.app.flags.DEFINE_string('mode', 'train', 'train/dev/test mode')
tf.app.flags.DEFINE_string('acc', 'acc', 'acc/cer')
tf.app.flags.DEFINE_string('encoder_type', 'pblstm', 'lstm/bilstm/pblstm')
tf.app.flags.DEFINE_string('data_source', 'aishell', 'aishell/thchs30')
tf.app.flags.DEFINE_string('data_dir', './', 'path to the data source')
tf.app.flags.DEFINE_bool('keep_model', True, 'True: continue training the saved model / False: train from scratch')
tf.app.flags.DEFINE_bool('beam_search', True, 'True: use beam search when decoding')
HParams = namedtuple('HParams',
                     ['mode', 'data_source', 'data_dir', 'keep_model', 'beam_search', 'beam_size',
                      'acc_method', 'num_epochs', 'learning_rate', 'weight_noise_rate',
                      'batch_size', 'encode_type', 'encoder_num_layers', 'attention_size',
                      'encoder_rnn_hidden_unit',
                      'decoding_embedding_size', 'max_grad_norm',
                      'keep_probability', 'l2_norm_rate', 'audio_dim'])
hps = HParams(
mode=FLAGS.mode, # train, eval, decode
data_source = FLAGS.data_source,
data_dir = FLAGS.data_dir,
keep_model = FLAGS.keep_model,
beam_search = FLAGS.beam_search,
beam_size = 4,
acc_method = FLAGS.acc,
num_epochs = 10000,
learning_rate=0.001, # learning rate
weight_noise_rate = 0.01,
batch_size=100,
encode_type = FLAGS.encoder_type,
encoder_num_layers=2,
attention_size = 256,
encoder_rnn_hidden_unit=256, # for rnn cell
decoding_embedding_size=128, # if 0, don't use embedding
max_grad_norm=1,
keep_probability = 0.5,
l2_norm_rate = 0.00001,
audio_dim=240)
# print all hyperparameter names and values
pprint(hps._asdict())  # vars() is unreliable on a namedtuple; _asdict() always works

1. Data Preprocessing:

The speech recognition training data is AISHELL; training on THCHS-30 alone overfits, probably because my model has too many parameters.

The data is split into audio files and transcripts. In AISHELL the audio files are spread across multiple numbered folders, while all transcripts live in a single file, aishell_transcript_v0.8.txt.

Step 1: build a mapping from each audio file's path to its text in the transcript file. This makes all of the later steps easier.

Mapping from audio path to transcript text
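
A minimal sketch of this mapping, assuming the standard AISHELL layout where each line of aishell_transcript_v0.8.txt is "<utt_id> <text>" and the wav files are named "<utt_id>.wav" (build_mapping is an illustrative helper, not code from this project):

import os

def build_mapping(transcript_path, wav_root):
    # utt_id -> transcript text
    transcripts = {}
    with open(transcript_path, encoding='utf-8') as f:
        for line in f:
            utt_id, text = line.strip().split(' ', 1)
            transcripts[utt_id] = text.replace(' ', '')
    # wav path -> transcript text
    mapping = {}
    for dirpath, _, filenames in os.walk(wav_root):
        for name in filenames:
            utt_id = os.path.splitext(name)[0]
            if name.endswith('.wav') and utt_id in transcripts:
                mapping[os.path.join(dirpath, name)] = transcripts[utt_id]
    return mapping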

Step 2: extract the audio and text according to the mapping. For each audio file, generate 80-dimensional fbank features, compute their delta and delta-delta, and concatenate everything into 240 dimensions. Record the maximum frame count over all fbank features, so that sequences can later be padded to equal length (in fact, once features are stored with variable length this statistic is unnecessary: save them to the TFRecord directly and pad only within each batch inside the model). Also collect the vocabulary over all the text and the maximum text length; the maximum length is again for padding, and the vocabulary is for encoding the text.
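
A minimal sketch of the 240-dimensional feature extraction, assuming the python_speech_features package (extract_features is an illustrative name):

import numpy as np
from scipy.io import wavfile
from python_speech_features import logfbank, delta

def extract_features(wav_path):
    sample_rate, signal = wavfile.read(wav_path)
    fbank = logfbank(signal, samplerate=sample_rate, nfilt=80)  # (frames, 80)
    d1 = delta(fbank, 2)   # delta
    d2 = delta(d1, 2)      # delta-delta
    return np.concatenate([fbank, d1, d2], axis=1)  # (frames, 240)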

Step 3: re-extract the 240-dimensional fbank features, encode the text with the vocabulary, pad both audio and text to their respective maximum lengths, and save everything to a TFRecord file.
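
A minimal sketch of the TFRecord writing, assuming a char_to_int vocabulary from step 2 (the feature names 'feature' and 'label' are illustrative assumptions):

import tensorflow as tf

def write_examples(tfrecord_path, samples, char_to_int):
    # samples: iterable of (features, text), features being a (frames, 240) array
    writer = tf.python_io.TFRecordWriter(tfrecord_path)
    for features, text in samples:
        label = [char_to_int[c] for c in text]
        example = tf.train.Example(features=tf.train.Features(feature={
            'feature': tf.train.Feature(
                float_list=tf.train.FloatList(value=features.flatten())),
            'label': tf.train.Feature(
                int64_list=tf.train.Int64List(value=label)),
        }))
        writer.write(example.SerializeToString())
    writer.close()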

Steps 2 and 3 are covered in detail in another article of mine, which includes not only saving the features to a TFRecord but also reading them back.

2. The Speech Recognition Model

  • Encoder and decoder

This part is straightforward and widely covered online, so I won't repeat it here (if you'd like to read my own write-up, follow the link).

  • "Attention is all your need"

No further explanation here either; a seq2seq speech recognizer simply won't train without attention.

  • pBLSTM pyramid encoder

Note how the encoder output is handled: with a 3-layer bidirectional LSTM, the state fed to the decoder and the output fed to attention both come from the top layer only, unless the decoder is also multi-layer (I haven't tried a multi-layer decoder, so I won't speculate). Also note the state dimensions. An LSTM state has c and h components; for a unidirectional LSTM, both have the LSTM's hidden size. Since a bidirectional LSTM is used here, the concatenated c and h are each twice the hidden size, which means the decoder's hidden size must be encoder hidden size * 2 for the states to connect.

from tensorflow.contrib.rnn import DropoutWrapper, LSTMStateTuple

# Three forward and three backward LSTM cells, one pair per pyramid layer.
cell_fw_1 = tf.nn.rnn_cell.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123), state_is_tuple=True)
cell_fw_2 = tf.nn.rnn_cell.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123), state_is_tuple=True)
cell_fw_3 = tf.nn.rnn_cell.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123), state_is_tuple=True)
cell_bw_1 = tf.nn.rnn_cell.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113), state_is_tuple=True)
cell_bw_2 = tf.nn.rnn_cell.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113), state_is_tuple=True)
cell_bw_3 = tf.nn.rnn_cell.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113), state_is_tuple=True)
encoder_f_cell_1 = DropoutWrapper(cell_fw_1, output_keep_prob=keep_prob)
encoder_f_cell_2 = DropoutWrapper(cell_fw_2, output_keep_prob=keep_prob)
encoder_f_cell_3 = DropoutWrapper(cell_fw_3, output_keep_prob=keep_prob)
encoder_b_cell_1 = DropoutWrapper(cell_bw_1, output_keep_prob=keep_prob)
encoder_b_cell_2 = DropoutWrapper(cell_bw_2, output_keep_prob=keep_prob)
encoder_b_cell_3 = DropoutWrapper(cell_bw_3, output_keep_prob=keep_prob)

# Layer 1: concatenating fw/bw outputs gives rnn_size * 2 features per frame;
# reshaping to half the frames merges adjacent frame pairs into rnn_size * 4.
(encoder_fw_outputs_1, encoder_bw_outputs_1), (_, _) = tf.nn.bidirectional_dynamic_rnn(
    encoder_f_cell_1, encoder_b_cell_1, inputs=emb_encoder_inputs, dtype=tf.float32,
    sequence_length=source_sequence_length, scope="L1")
outputs_l1 = tf.reshape(tf.concat((encoder_fw_outputs_1, encoder_bw_outputs_1), axis=2),
                        [-1, max_seq_length // 2, rnn_size * 4])

# Layer 2: sequence lengths are halved along with the time axis.
(encoder_fw_outputs_2, encoder_bw_outputs_2), (_, _) = tf.nn.bidirectional_dynamic_rnn(
    encoder_f_cell_2, encoder_b_cell_2, inputs=outputs_l1, dtype=tf.float32,
    sequence_length=source_sequence_length // 2, scope="L2")
outputs_l2 = tf.reshape(tf.concat((encoder_fw_outputs_2, encoder_bw_outputs_2), axis=2),
                        [-1, max_seq_length // 4, rnn_size * 4])

# Layer 3: only this top layer's outputs and final state are passed on.
(encoder_fw_outputs_3, encoder_bw_outputs_3), (encoder_fw_final_state, encoder_bw_final_state) = tf.nn.bidirectional_dynamic_rnn(
    encoder_f_cell_3, encoder_b_cell_3, inputs=outputs_l2, dtype=tf.float32,
    sequence_length=source_sequence_length // 4, scope="L3")
emb_encoder_outputs = tf.reshape(tf.concat((encoder_fw_outputs_3, encoder_bw_outputs_3), axis=2),
                                 [-1, max_seq_length // 8, rnn_size * 4])

# Concatenate the top layer's fw/bw final states: c and h each become
# 2 * rnn_size, hence the decoder hidden size must be the encoder's * 2.
encoder_final_state_c = tf.concat((encoder_fw_final_state.c, encoder_bw_final_state.c), 1)
encoder_final_state_h = tf.concat((encoder_fw_final_state.h, encoder_bw_final_state.h), 1)
encoder_final_state = LSTMStateTuple(
    c=encoder_final_state_c,
    h=encoder_final_state_h)

Calculating the total number of parameters in the encoder, decoder, and attention deserves its own article; it requires some familiarity with the internals of LSTM and attention.

One more issue I think needs attention at the preprocessing stage: the number of frames fed to the encoder (max_seq_length in the code) must be divisible by 8. Why? Each pyramid layer reshapes its input to half the frame count of the layer below, and with three layers this halving happens three times, so the frame count must be divisible by 8. A sketch of the corresponding padding is shown below.
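
A minimal sketch of padding the frame axis up to the next multiple of 8 (pad_to_multiple_of_8 is an illustrative helper, not code from this project):

import numpy as np

def pad_to_multiple_of_8(features):
    # features: (frames, 240); zero-pad frames up to the next multiple of 8
    frames = features.shape[0]
    target = -(-frames // 8) * 8  # ceiling division
    return np.pad(features, ((0, target - frames), (0, 0)), mode='constant')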

  • Notes on adding beam search

Beam search happens in the decoder. The decoder takes the encoder's state plus the attention context as input, and attention takes the encoder's output as input, so adding beam search means changing both the state and the output: tile the data beam_size times so that the decoder can produce beam_size hypotheses.

if self.hps.beam_search:
    # Tile the inputs of the attention cell.
    encoder_output = tf.contrib.seq2seq.tile_batch(encoder_output, self.hps.beam_size)
    source_sequence_length = tf.contrib.seq2seq.tile_batch(source_sequence_length, self.hps.beam_size)
    attention_cell = self.decoder_attn(encoder_rnn_hidden_unit, attention_size,
                                       source_sequence_length, encoder_output, dec_cell)
    # Tile the encoder state for the decoder's initial state.
    encoder_state = tf.contrib.seq2seq.tile_batch(encoder_state, self.hps.beam_size)
    init_state = attention_cell.zero_state(batch_size * self.hps.beam_size,
                                           tf.float32).clone(cell_state=encoder_state)
    inference_decoder = tf.contrib.seq2seq.BeamSearchDecoder(cell=attention_cell,
                                                             embedding=dec_embeddings,
                                                             start_tokens=start_tokens,
                                                             end_token=end_of_sequence_id,
                                                             initial_state=init_state,
                                                             beam_width=self.hps.beam_size,
                                                             output_layer=output_layer)
    dec_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=inference_decoder,
                                                          impute_finished=False,
                                                          maximum_iterations=max_target_sequence_length)
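
The snippet above calls self.decoder_attn, which isn't shown here. A minimal sketch of such a helper, assuming tf.contrib.seq2seq's BahdanauAttention and AttentionWrapper (the parameter names are illustrative, not necessarily this project's exact version):

def decoder_attn(self, encoder_rnn_hidden_unit, attention_size,
                 source_sequence_length, encoder_output, dec_cell):
    # Attend over the (possibly tiled) encoder outputs.
    attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
        num_units=attention_size,
        memory=encoder_output,
        memory_sequence_length=source_sequence_length)
    return tf.contrib.seq2seq.AttentionWrapper(
        dec_cell, attention_mechanism, attention_layer_size=attention_size)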

I won't expand on the principle of beam search itself; it's quite simple: at every step, keep only the top beam_size candidates.
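
For intuition only, a toy single step in plain Python (not part of the model): each beam is scored by accumulated log-probability, and only the beam_size best extensions survive.

import heapq

def beam_step(beams, log_probs, beam_size):
    # beams: list of (score, token_list); log_probs[i][v] is the log-prob of
    # appending token v to beam i.
    candidates = [(score + log_probs[i][v], tokens + [v])
                  for i, (score, tokens) in enumerate(beams)
                  for v in range(len(log_probs[i]))]
    return heapq.nlargest(beam_size, candidates, key=lambda c: c[0])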

  • Notes on adding weight noise

I hadn't used weight noise before; I only knew about adding noise during data preprocessing. Here the noise is added to the weights themselves, and it need not be computed on every batch: once every 10 or 100 batches is enough, which requires some TensorFlow conditionals (tf.cond and tf.equal). Be careful with control_dependencies here; it has graph-related pitfalls.

def loss(training_logits, dec_output, masks, global_steps):

    def weight_noise(noise_rate, var_list):
        with tf.variable_scope("weight_noise"):
            if not noise_rate:
                return [tf.no_op()]
            noise_ops = []
            for v in var_list:
                with tf.device(v._ref().device):
                    scale = noise_rate
                    noise = tf.truncated_normal(v.shape) * scale
                    noise_op = v.assign_add(noise)
                    noise_ops.append(noise_op)
            return noise_ops

    def weight_decay(decay_rate, var_list, skip_biases=True):
        with tf.variable_scope("L2_loss"):
            if not decay_rate:
                return 0.
            weight_decays = []
            for v in var_list:
                is_bias = len(v.shape.as_list()) == 1 and v.name.endswith("bias:0")
                if not (skip_biases and is_bias):
                    with tf.device(v.device):
                        v_loss = tf.nn.l2_loss(v)
                        weight_decays.append(v_loss)
            return tf.add_n(weight_decays) * decay_rate

    cost = tf.contrib.seq2seq.sequence_loss(training_logits, dec_output, masks)
    vars = [v for v in tf.trainable_variables()]

    # Add weight noise, ideally only once every 100 batches or so.
    def update_weight():
        weight_noise_ops = weight_noise(self.hps.weight_noise_rate, vars)
        with tf.control_dependencies(weight_noise_ops):
            return tf.identity(cost)

    cost = tf.cond(tf.equal(global_steps % self.weight_noise_train_interval, 0),
                   update_weight, lambda: tf.identity(cost))
    weight_decay_loss = weight_decay(self.hps.l2_norm_rate, vars, skip_biases=True)
    cost = cost + weight_decay_loss
    return cost

3. Multi-GPU Training:

Change the way training data is read: from the old queue-based pipeline to tf.data (see here).
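
A minimal tf.data sketch for reading the TFRecords from step 3 with variable-length features and per-batch padding (the feature names 'feature' and 'label' match the illustrative writer above and are assumptions, not this project's exact schema):

def make_dataset(tfrecord_path, batch_size):
    def parse(example_proto):
        spec = {'feature': tf.VarLenFeature(tf.float32),
                'label': tf.VarLenFeature(tf.int64)}
        parsed = tf.parse_single_example(example_proto, spec)
        feature = tf.reshape(tf.sparse_tensor_to_dense(parsed['feature']), [-1, 240])
        label = tf.sparse_tensor_to_dense(parsed['label'])
        return feature, label

    dataset = (tf.data.TFRecordDataset(tfrecord_path)
               .map(parse)
               .shuffle(1000)
               .repeat()
               .padded_batch(batch_size, padded_shapes=([None, 240], [None])))
    # one-shot iterator; sess.run() this element to fetch the next batch
    return dataset.make_one_shot_iterator().get_next()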

Compared with a single GPU, a multi-GPU model requires manually assigning ops to devices and sharing variables between devices; the CPU then aggregates the losses, gradients, and accuracies from all devices and applies the gradients to update the weights.

Extra work needed for multi-GPU parallelism: the data reading has to change from fixed-length to variable-length, and it becomes an independent module exposing only a yield-like interface; during multi-GPU training each GPU only computes gradients, and the CPU collects the gradient results from all GPUs and then applies them.

def feed_all_gpu(self, inp_dict, max_seq_length, max_text_length, models,
                 next_train_element, train_graph_sess, keep_prob_value):
    data_len = 0
    for i in range(len(models)):
        # fetch one batch per GPU from the tf.data iterator
        (train_dec_input, train_dec_output, train_real_text_length,
         train_real_seq_length, train_source_feature) = train_graph_sess.run(next_train_element)

        (input_data, dec_input, dec_output, keep_prob, source_sequence_length,
         target_sequence_length, training_logits, inference_logits, loss, gradients) = models[i]
        inp_dict[input_data] = train_source_feature
        inp_dict[dec_input] = train_dec_input
        inp_dict[dec_output] = train_dec_output
        #inp_dict[lr] = self.hps.learning_rate
        inp_dict[keep_prob] = keep_prob_value
        inp_dict[source_sequence_length] = np.tile([np.array(max_seq_length, dtype=np.int32)], self.hps.batch_size)
        inp_dict[target_sequence_length] = np.tile([np.array(max_text_length, dtype=np.int32)], self.hps.batch_size)
        data_len += len(train_dec_input)
    return inp_dict, data_len

opt = tf.train.AdamOptimizer(learning_rate=self.hps.learning_rate)
models_para = []
for gpu_id in range(self.hps.gpu_num):
    with tf.device('/gpu:%d' % gpu_id):
        # reuse=True on every tower after the first shares the variables
        with tf.variable_scope('model', reuse=gpu_id > 0):
            (input_data, dec_input, dec_output, keep_prob, source_sequence_length,
             target_sequence_length, global_steps) = self.model_inputs(max_text_length)

            training_logits, inference_logits = self.seq2seq_model(input_data=input_data,
                                                                   dec_input=dec_input,
                                                                   dec_output=dec_output,
                                                                   keep_prob=keep_prob,
                                                                   batch_size=self.hps.batch_size,
                                                                   source_sequence_length=source_sequence_length,
                                                                   target_sequence_length=target_sequence_length,
                                                                   max_target_sequence_length=max_text_length,
                                                                   dec_embedding_size=self.hps.decoding_embedding_size,
                                                                   encoder_rnn_hidden_unit=self.hps.encoder_rnn_hidden_unit,
                                                                   attention_size=self.hps.attention_size,
                                                                   encoder_num_layers=self.hps.encoder_num_layers,
                                                                   target_vocab_to_int=target_vocab_to_int,
                                                                   max_seq_length=max_seq_length,
                                                                   encode_type=self.hps.encode_type)

            masks = tf.sequence_mask(target_sequence_length, max_text_length,
                                     dtype=tf.float32, name='masks')
            # each GPU only computes its own loss and gradients
            cost = loss(training_logits, dec_output, masks, global_steps)
            gradients = opt.compute_gradients(cost)
            models_para.append((input_data, dec_input, dec_output, keep_prob,
                                source_sequence_length, target_sequence_length,
                                training_logits, inference_logits, cost, gradients))

with tf.variable_scope("apply_gradient"):
    # the CPU averages the per-tower losses and gradients, then applies them
    (tower_input_data, tower_dec_input, tower_dec_output, _, _, _, tower_train_preds,
     tower_inference_logits, tower_losses, tower_grads) = zip(*models_para)
    aver_loss_op = tf.reduce_mean(tower_losses)
    grads = self.average_gradients(tower_grads)
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_steps)
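
The code above calls self.average_gradients, which isn't shown here. A common implementation, adapted from TensorFlow's CIFAR-10 multi-GPU tutorial (a sketch under that assumption, not necessarily identical to this project's version):

def average_gradients(self, tower_grads):
    # tower_grads: one list of (gradient, variable) pairs per GPU
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, axis=0), 0)
        # the variables are shared across towers, so take the first tower's
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads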
