Issue
I have written custom Encoder and Decoder layers that implement the architecture described in the Attention Is All You Need paper. Everything works fine until I try to build the model, at which point I get one error. If I instead run it on some sample data, the model builds, but then when I call the fit
method to train the model it throws another error. Below are the blocks that I might be implementing incorrectly; let me know if more code is needed to debug.
TF Version: 2.14.0
Multi-Head Sub Layer and Positional Encoding Layer:
class MhaSubLayer(Layer):
    """Attention stage followed by a two-layer feed-forward stage.

    Each stage is wrapped in a residual connection and a layer
    normalization. The attention scores of the most recent call, averaged
    over axis 1, are kept on ``self.last_attention_weights``.

    Args:
        units: Used as ``key_dim`` of the attention layer and as the width
            of the feed-forward output projection.
        **kwargs: Passed unchanged to ``MultiHeadAttention`` (for example
            ``num_heads`` and ``name``).
    """

    def __init__(self, units, **kwargs):
        super().__init__()
        # Note that the extra keyword arguments configure the attention
        # layer, not this wrapper layer.
        self.mha = MultiHeadAttention(key_dim=units, **kwargs)
        # Feed-forward stage: 2048-wide ReLU expansion, then back to `units`.
        self.inner_dense = TimeDistributed(Dense(2048, activation='relu'))
        self.outer_dense = TimeDistributed(Dense(units))
        self.layernorm_mha = LayerNormalization()
        self.layernorm_ff = LayerNormalization()
        self.add = Add()

    def call(self, x, context, **kwargs):
        """Attend from ``x`` (query) to ``context`` (value) and transform the result.

        Args:
            x: Query tensor; also the input of the first residual connection.
            context: Tensor passed to the attention layer as ``value``.
            **kwargs: Forwarded to the attention layer call.

        Returns:
            The layer-normalized output of the feed-forward stage.
        """
        ### Attention stage: attend, add the residual, normalize
        attended, scores = self.mha(query=x, value=context, return_attention_scores=True, **kwargs)
        normed_attention = self.layernorm_mha(self.add([x, attended]))
        # Keep the scores averaged over axis 1 for later inspection.
        self.last_attention_weights = tf.reduce_mean(scores, axis=1)

        ### Feed-forward stage: expand, project, add the residual, normalize
        hidden = self.inner_dense(normed_attention)
        projected = self.outer_dense(hidden)
        return self.layernorm_ff(self.add([normed_attention, projected]))
class PositionalEncodingLayer(Layer):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    The encoding depends only on the (static) sequence length and feature
    width of the input, so it is computed once per call and broadcast over
    the batch axis instead of being rebuilt for every batch element.

    Args:
        **kwargs: Standard Keras ``Layer`` keyword arguments (e.g. ``name``).
    """

    def __init__(self, **kwargs):
        # Forward kwargs so that a caller-supplied `name` is actually used.
        super().__init__(**kwargs)

    def get_positional_encodings(self, x):
        """Return the positional-encoding table for one sequence.

        Args:
            x: Array or tensor whose static shape starts with
                ``(seq_len, d)``. Only the shape is read, never the values.

        Returns:
            A ``(seq_len, d)`` float64 NumPy array where column ``2*i`` holds
            ``sin(k / 10000**(2*i/d))`` and column ``2*i + 1`` holds the
            matching cosine for position ``k``. If ``d`` is odd the last
            column stays zero.

        Raises:
            ValueError: If ``seq_len`` or ``d`` is not statically known.
        """
        seq_len, d = x.shape[0], x.shape[1]
        if seq_len is None or d is None:
            raise ValueError(
                "PositionalEncodingLayer needs a static sequence length and "
                f"feature size, got shape {x.shape}."
            )
        half = d // 2
        positions = np.arange(seq_len)[:, np.newaxis]           # (seq_len, 1)
        denominators = np.power(10000, 2 * np.arange(half) / d)  # (half,)
        angles = positions / denominators                        # (seq_len, half)

        P = np.zeros((seq_len, d))
        P[:, 0:2 * half:2] = np.sin(angles)
        P[:, 1:2 * half:2] = np.cos(angles)
        return P

    def call(self, x):
        """Return ``x`` plus the positional encodings.

        Args:
            x: Tensor indexed as (batch, position, feature); the position and
                feature sizes must be statically known.

        Returns:
            A tensor with the same shape and dtype as ``x``.
        """
        # x[0] is one (seq_len, d) sequence; only its static shape is used.
        table = self.get_positional_encodings(x[0])
        pos_enc = tf.constant(table, dtype=x.dtype)
        # (seq_len, d) broadcasts against (batch, seq_len, d).
        return x + pos_enc
Encoder-Decoder Block:
class Encoder(Layer):
    """Encoder stack: embedding, positional encoding and six attention sub-layers.

    Every sub-layer attends from its input to itself and is followed by a
    dropout layer with rate 0.1.

    Args:
        units: Embedding width, also passed to every ``MhaSubLayer``.
        embed_input_dim: ``input_dim`` of the embedding layer.
        name: Keras layer name.
        **kwargs: Further Keras ``Layer`` keyword arguments.
    """

    def __init__(self, units, embed_input_dim, name='encoder', **kwargs):
        # Forward name/kwargs; previously they were accepted but ignored.
        super().__init__(name=name, **kwargs)
        ### Encoder Input Embedding and Layer
        self.embedding = Embedding(input_dim=embed_input_dim, output_dim=units, name='en_embed_layer')
        self.pos_embedding = PositionalEncodingLayer(name='en_positional_embed_layer')
        ### Encoder Multi-Head Self Attention Sub Layer
        self.mha_sub_layer1 = MhaSubLayer(units, num_heads=8, name='en_mha_layer_1')
        self.mha_sub_layer2 = MhaSubLayer(units, num_heads=8, name='en_mha_layer_2')
        self.mha_sub_layer3 = MhaSubLayer(units, num_heads=8, name='en_mha_layer_3')
        self.mha_sub_layer4 = MhaSubLayer(units, num_heads=8, name='en_mha_layer_4')
        self.mha_sub_layer5 = MhaSubLayer(units, num_heads=8, name='en_mha_layer_5')
        self.mha_sub_layer6 = MhaSubLayer(units, num_heads=8, name='en_mha_layer_6')
        ### Encoder MHA Dropout Layer
        self.dropout = Dropout(rate=0.1, name='en_dropout_pos_enc')
        self.dropout1 = Dropout(rate=0.1, name='en_dropout_layer1')
        self.dropout2 = Dropout(rate=0.1, name='en_dropout_layer2')
        self.dropout3 = Dropout(rate=0.1, name='en_dropout_layer3')
        self.dropout4 = Dropout(rate=0.1, name='en_dropout_layer4')
        self.dropout5 = Dropout(rate=0.1, name='en_dropout_layer5')
        self.dropout6 = Dropout(rate=0.1, name='en_dropout_layer6')

    def call(self, x):
        """Encode a batch of token ids.

        Args:
            x: Input passed to the embedding layer.

        Returns:
            Output of the sixth sub-layer after its dropout.
        """
        hidden = self.embedding(x)
        hidden = self.pos_embedding(hidden)
        # The dropout result used to be bound to a misspelled variable and
        # was therefore never fed into the stack; it is applied here.
        hidden = self.dropout(hidden)

        stack = (
            (self.mha_sub_layer1, self.dropout1),
            (self.mha_sub_layer2, self.dropout2),
            (self.mha_sub_layer3, self.dropout3),
            (self.mha_sub_layer4, self.dropout4),
            (self.mha_sub_layer5, self.dropout5),
            (self.mha_sub_layer6, self.dropout6),
        )
        for sub_layer, drop in stack:
            # Self-attention: query and context are the same tensor.
            hidden = drop(sub_layer(hidden, hidden))
        return hidden
class Decoder(Layer):
    """Decoder stack as posted in the question (the version that fails to train).

    The first sub-layer attends from the decoder input to itself; sub-layers
    two to six attend from the running decoder state to the encoder context.
    Each sub-layer is followed by a dropout layer with rate 0.1, and a final
    one-unit dense layer produces the output.

    Args:
        units: Embedding width, also passed to every ``MhaSubLayer``.
        embed_input_dim: ``input_dim`` of the embedding layer.
        name: Keras layer name.
        **kwargs: Further Keras ``Layer`` keyword arguments.
    """

    def __init__(self, units, embed_input_dim, name='decoder', **kwargs):
        # Forward name/kwargs; previously they were accepted but ignored.
        super().__init__(name=name, **kwargs)
        ### Decoder Input Embedding Layer
        self.embedding = Embedding(input_dim=embed_input_dim, output_dim=units, name='de_embed_layer')
        self.pos_embedding = PositionalEncodingLayer(name='de_positional_embed_layer')
        ### Decoder Multi-Head Attention Sub Layer
        self.mha_sub_layer1 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_1')
        self.mha_sub_layer2 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_2')
        self.mha_sub_layer3 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_3')
        self.mha_sub_layer4 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_4')
        self.mha_sub_layer5 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_5')
        self.mha_sub_layer6 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_6')
        ### Decoder MHA Dropout Layer
        self.dropout = Dropout(rate=0.1, name='de_dropout_pos_enc')
        self.dropout1 = Dropout(rate=0.1, name='de_dropout_layer1')
        self.dropout2 = Dropout(rate=0.1, name='de_dropout_layer2')
        self.dropout3 = Dropout(rate=0.1, name='de_dropout_layer3')
        self.dropout4 = Dropout(rate=0.1, name='de_dropout_layer4')
        self.dropout5 = Dropout(rate=0.1, name='de_dropout_layer5')
        self.dropout6 = Dropout(rate=0.1, name='de_dropout_layer6')
        ### Dense Output Layer
        self.output_dense_layer = TimeDistributed(Dense(1), name="output_layer")

    def call(self, x, en_context):
        """Decode ``x`` against the encoder output.

        Args:
            x: Decoder input passed to the embedding layer.
            en_context: Encoder output used as attention context by
                sub-layers two to six.

        Returns:
            The rounded absolute value of the dense output.
        """
        hidden = self.embedding(x)
        hidden = self.pos_embedding(hidden)
        # The dropout result used to be bound to a misspelled variable and
        # was therefore never fed into the stack; it is applied here.
        hidden = self.dropout(hidden)

        ### First MHA Sub-Layer: attends over the decoder input itself
        hidden = self.dropout1(self.mha_sub_layer1(hidden, hidden))

        ### Second to sixth MHA Sub-Layers: attend over the encoder context
        cross_stack = (
            (self.mha_sub_layer2, self.dropout2),
            (self.mha_sub_layer3, self.dropout3),
            (self.mha_sub_layer4, self.dropout4),
            (self.mha_sub_layer5, self.dropout5),
            (self.mha_sub_layer6, self.dropout6),
        )
        for sub_layer, drop in cross_stack:
            hidden = drop(sub_layer(hidden, en_context))

        ### Output Dense Layer
        output = self.output_dense_layer(hidden)
        # NOTE: this rounding is kept on purpose because this listing
        # reproduces the failing model. According to the Solution section of
        # this post, rounding the output is what breaks gradient computation
        # ("No gradients provided for any variable", Error-2).
        output = tf.round(tf.abs(output))
        return output
Sample Data:
# Small reproducible toy batch: encoder input, decoder input and targets.
# The three arrays are drawn in this exact order from the seeded global RNG.
np.random.seed(42)
trainX, trainXt_in, trainY = (
    np.random.randint(0, 250, size=shape)
    for shape in ((5, 12), (5, 3), (5, 3, 1))
)
Modelling Block:
Training shape: ((1616304, 12), (1616304, 3), (1616304, 3, 1))
## The Model Sub-Class
class Trxster(Model):
    """Encoder-decoder model wiring an ``Encoder`` to a ``Decoder``.

    Args:
        units: Hidden width passed to both the encoder and the decoder.
        en_embed_dim: ``embed_input_dim`` of the encoder.
        de_embed_dim: ``embed_input_dim`` of the decoder.
        name: Keras model name.
        **kwargs: Further Keras ``Model`` keyword arguments.
    """

    def __init__(self, units, en_embed_dim, de_embed_dim, name='Trxster', **kwargs):
        # Forward name/kwargs; previously they were accepted but ignored.
        super().__init__(name=name, **kwargs)
        self.encoder = Encoder(units, en_embed_dim)
        self.decoder = Decoder(units, de_embed_dim)

    def call(self, inputs):
        """Run the encoder on the first input and the decoder on the second.

        Args:
            inputs: A pair ``(context_vec, target_in)``.

        Returns:
            The decoder output.
        """
        context_vec, target_in = inputs
        context = self.encoder(context_vec)
        return self.decoder(target_in, context)
# Instantiate the model and try to build it from shapes so summary() can run.
# hsize and embed_dim are assigned further down in this post (512 and 268).
# NOTE(review): the build() call below appears to be what raises Error-1 --
# the quoted message complains about the nested tuple '(12, 1)' being used
# where a single dimension value is expected.
forecastor = Trxster(hsize, embed_dim, embed_dim)
forecastor.build(((12, 1),(3, 1)))
forecastor.summary()
Error-1:
TypeError: Error converting shape to a TensorShape: Dimension value must be integer or None or have an index method, got value '(12, 1)' with type '<class 'tuple'>'.
If I run the model with an example:
# hsize is passed to Trxster as `units`; embed_dim is used as the embedding
# input_dim of both the encoder and the decoder.
hsize = 512
embed_dim = 268
forecastor = Trxster(hsize, embed_dim, embed_dim)
# Call the model once on the sample arrays (encoder input, decoder input).
forecastor((trainX, trainXt_in))
Model: "trxster_11"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
encoder_13 (Encoder) multiple 63156224
decoder_13 (Decoder) multiple 63156737
=================================================================
Total params: 126312961 (481.85 MB)
Trainable params: 126312961 (481.85 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
### Fit the Model
batch_size = 64
# NOTE(review): `epochs` is not used by the fit() call shown below, which
# passes epochs=1 -- confirm which value is intended.
epochs = 100
# Number of full batches in the training set (any remainder is dropped).
steps = trainX.shape[0]//batch_size
# Warm-up length handed to the learning-rate schedule: 1/25 of `steps`.
# Both values are 0 when trainX is the 5-row sample array defined earlier.
warmup_steps = steps//25
class MyLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up / inverse-square-root learning-rate schedule.

    Implements the schedule from "Attention Is All You Need":

        lrate = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)

    The computation uses TensorFlow ops only, so it also works when the
    optimizer calls it from inside a graph-compiled training step (where
    ``step.numpy()`` is not available).

    Args:
        d_model: Model width used to scale the learning rate.
        warmup_steps: Number of steps over which the rate grows linearly.
            Values below 1 are treated as 1 to avoid a division by zero.
    """

    def __init__(self, d_model, warmup_steps):
        super().__init__()
        self.d_model = d_model
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for ``step`` as a float32 tensor.

        At ``step == 0`` the linear warm-up term is 0, so the rate is 0.
        """
        step = tf.cast(step, tf.float32)
        d_model = tf.cast(self.d_model, tf.float32)
        warmup_factor = float(max(self.warmup_steps, 1)) ** -1.5

        decay = tf.math.rsqrt(step)    # step**-0.5, dominates after warm-up
        ramp = step * warmup_factor    # linear growth during warm-up
        # Multiply (not divide) by d_model**-0.5, as in the paper's formula.
        return tf.math.rsqrt(d_model) * tf.math.minimum(decay, ramp)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {"d_model": self.d_model, "warmup_steps": self.warmup_steps}
# Adam driven by the custom warm-up learning-rate schedule.
opt = tf.keras.optimizers.Adam(learning_rate=MyLRSchedule(hsize, warmup_steps), beta_1=0.9, beta_2=0.98, epsilon=1e-8)

### Configure Trxster
checkpoint_filepath = './training_ckpt'
cb = [
    # Stop after 10 epochs without val_loss improvement, restoring the best weights.
    tf.keras.callbacks.EarlyStopping(
        patience=10,
        monitor='val_loss',
        restore_best_weights=True),
    # Save the weights whenever val_loss reaches a new minimum.
    tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        verbose=1,
        save_best_only=True),
]

# `loss` and `metrics` used to be created but never handed to compile(), and
# the metrics list contained a loss class. They are now the single source of
# truth. The metric identifiers match the strings compile() previously
# received, so the keys in `history.history` stay the same.
loss = tf.keras.losses.MeanSquaredError()
metrics = ['acc', tf.keras.metrics.MeanAbsoluteError(name='mean_absolute_error')]
forecastor.compile(optimizer=opt,
                   loss=loss,
                   metrics=metrics)

# NOTE(review): valX, ValXt_in and valY are not defined in this listing;
# they are assumed to be prepared elsewhere -- confirm the spelling of
# `ValXt_in` against that code.
history = forecastor.fit((trainX, trainXt_in), trainY,
                         batch_size=batch_size,
                         steps_per_epoch=steps,
                         epochs=1,
                         validation_data=((valX, ValXt_in), valY),
                         callbacks=cb)
Error-2: Providing few lines of error trace:
ValueError: No gradients provided for any variable: (['trxster_11/encoder_13/en_embed_layer/embeddings:0', 'trxster_11/encoder_13/mha_sub_layer_157/en_mha_layer_1/query/kernel:0', 'trxster_11/encoder_13/mha_sub_layer_157/en_mha_layer_1/query/bias:0', 'trxster_11/encoder_13/mha_sub_layer_157/en_mha_layer_1/key/kernel:0', 'trxster_11/encoder_13/mha_sub_layer_157/en_mha_layer_1/key/bias:0', 'trxster_11/encoder_13/mha_sub_layer_157/en_mha_layer_1/value/kernel:0', 'trxster_11/encoder_13/mha_sub_layer_157/en_mha_layer_1/value/bias:0', 'trxster_11/encoder_13/mha_sub_layer_157/en_mha_layer_1/attention_output/kernel:0', 'trxster_11/encoder_13/mha_sub_layer_157/en_mha_layer_1/attention_output/bias:0', 'trxster_11/encoder_13/mha_sub_layer_157/time_distributed_314/kernel:0'
Every example I see tells me that it should work just the way I have written it, but it doesn't.
Solution
I figured it out! Gradient computation fails when the graph between the trainable variables and the loss contains an operation that does not propagate gradients. That was the case in my network, where I had applied tf.round and tf.abs to the output of the final Dense layer of the Decoder; the rounding step is what cuts off the gradient. I removed them and the model now trains as expected. Here is the link to the related issue: https://github.com/tensorflow/tensorflow/issues/1511.
Decoder:
class Decoder(Layer):
    """Decoder stack with a differentiable output (the fixed version).

    The first sub-layer attends from the decoder input to itself; sub-layers
    two to six attend from the running decoder state to the encoder context.
    Each sub-layer is followed by a dropout layer with rate 0.1, and a final
    one-unit dense layer produces the raw output.

    Args:
        units: Embedding width, also passed to every ``MhaSubLayer``.
        embed_input_dim: ``input_dim`` of the embedding layer.
        name: Keras layer name.
        **kwargs: Further Keras ``Layer`` keyword arguments.
    """

    def __init__(self, units, embed_input_dim, name='decoder', **kwargs):
        # Forward name/kwargs; previously they were accepted but ignored.
        super().__init__(name=name, **kwargs)
        ### Decoder Input Embedding Layer
        self.embedding = Embedding(input_dim=embed_input_dim, output_dim=units, name='de_embed_layer')
        self.pos_embedding = PositionalEncodingLayer(name='de_positional_embed_layer')
        ### Decoder Multi-Head Attention Sub Layer
        self.mha_sub_layer1 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_1')
        self.mha_sub_layer2 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_2')
        self.mha_sub_layer3 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_3')
        self.mha_sub_layer4 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_4')
        self.mha_sub_layer5 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_5')
        self.mha_sub_layer6 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_6')
        ### Decoder MHA Dropout Layer
        self.dropout = Dropout(rate=0.1, name='de_dropout_pos_enc')
        self.dropout1 = Dropout(rate=0.1, name='de_dropout_layer1')
        self.dropout2 = Dropout(rate=0.1, name='de_dropout_layer2')
        self.dropout3 = Dropout(rate=0.1, name='de_dropout_layer3')
        self.dropout4 = Dropout(rate=0.1, name='de_dropout_layer4')
        self.dropout5 = Dropout(rate=0.1, name='de_dropout_layer5')
        self.dropout6 = Dropout(rate=0.1, name='de_dropout_layer6')
        ### Dense Output Layer
        self.output_dense_layer = TimeDistributed(Dense(1), name="output_layer")

    def call(self, x, en_context):
        """Decode ``x`` against the encoder output.

        Args:
            x: Decoder input passed to the embedding layer.
            en_context: Encoder output used as attention context by
                sub-layers two to six.

        Returns:
            The unrounded output of the final dense layer.
        """
        hidden = self.embedding(x)
        hidden = self.pos_embedding(hidden)
        # The dropout result used to be bound to a misspelled variable and
        # was therefore never fed into the stack; it is applied here.
        hidden = self.dropout(hidden)

        ### First MHA Sub-Layer: attends over the decoder input itself
        hidden = self.dropout1(self.mha_sub_layer1(hidden, hidden))

        ### Second to sixth MHA Sub-Layers: attend over the encoder context
        cross_stack = (
            (self.mha_sub_layer2, self.dropout2),
            (self.mha_sub_layer3, self.dropout3),
            (self.mha_sub_layer4, self.dropout4),
            (self.mha_sub_layer5, self.dropout5),
            (self.mha_sub_layer6, self.dropout6),
        )
        for sub_layer, drop in cross_stack:
            hidden = drop(sub_layer(hidden, en_context))

        ### Output Dense Layer
        # The raw dense output is returned so gradients can reach every
        # variable; any rounding belongs outside the trained graph.
        return self.output_dense_layer(hidden)
Answered By - Krishnang K Dalal
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.