Issue
I'm building a seq2seq model with an encoder-decoder architecture, for which I have built a tf.data.Dataset pipeline that reads the text from directories, vectorizes it using tf.keras.layers.TextVectorization, and preprocesses it to be fed to the model for training. I'm not able to format my labels so that they have the shape (None, seq_len, target_vocab_size). I tried mapping tf.keras.utils.to_categorical over the labels, but it won't work on the tensors. Strangely, I couldn't find any material where a similar problem was discussed.
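To illustrate what I mean by "it won't work on the tensors", here is a minimal standalone sketch (made-up token ids, not my real data); to_categorical is a NumPy-based utility, so it fails on the symbolic tensors that Dataset.map passes in:
import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices([[1, 2, 3], [4, 5, 0]]).batch(2)

# Raises an error: to_categorical calls np.array() on a symbolic tensor,
# which cannot be converted inside the Dataset.map graph.
ds.map(lambda y: tf.keras.utils.to_categorical(y, num_classes=6))
Below is my implementation: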
BUFFER_SIZE = len(articles)
BATCH_SIZE = 64
train_raw = (tf.data.Dataset
             .from_tensor_slices((articles[is_train], summaries[is_train]))
             .shuffle(BUFFER_SIZE)
             .batch(BATCH_SIZE))
val_raw = (tf.data.Dataset
           .from_tensor_slices((articles[~is_train], summaries[~is_train]))
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE))
context_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)
target_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)
context_vectorizer.adapt(train_raw.map(lambda context, target: context))
target_vectorizer.adapt(train_raw.map(lambda context, target: target))
def preprocess_text(context, target):
    context = context_vectorizer(context).to_tensor()
    target = target_vectorizer(target)
    target_in = target[:, :-1].to_tensor()
    target_out = target[:, 1:].to_tensor()
    # target_out = target[:,:-1]
    return (context, target_in), target_out
train_ds = train_raw.map(preprocess_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(preprocess_text, tf.data.AUTOTUNE)
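# Inspecting the mapped dataset shows the problem: the target spec is (None, None),
# i.e. plain integer token ids, not the (None, seq_len, target_vocab_size) shape I'm after.
print(train_ds.element_spec)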
def encoder(hsize, embed_dim=200):
    en_input_layer = Input(shape=(None,), name='encoder_input_layer', ragged=True)
    en_embed = Embedding(context_vectorizer.vocabulary_size()+1, output_dim=embed_dim, name='encoder_embedding_layer')
    en_embed_out = en_embed(en_input_layer)
    en_gru_1 = GRU(hsize, return_sequences=True, return_state=True, name='encoder_gru_layer_1')
    en_gru_1_out, en_gru_states = en_gru_1(en_embed_out)
    return en_input_layer, en_gru_1_out, en_gru_states
def decoder(hsize, encoder_states, embed_dim=200):
    de_input_layer = Input(shape=(None,), name='decoder_input_layer', ragged=True)
    de_embed = Embedding(target_vectorizer.vocabulary_size()+1, output_dim=embed_dim, name='decode_embedding_layer')
    de_embed_out = de_embed(de_input_layer)
    de_gru_1 = GRU(hsize, return_sequences=True, name='decoder_gru_layer_1')
    de_gru_1_out = de_gru_1(de_embed_out, initial_state=encoder_states)
    de_dense = TimeDistributed(Dense(target_vectorizer.vocabulary_size(), activation='softmax'), name='time_distributed_output_layer')
    de_preds = de_dense(de_gru_1_out)
    return de_input_layer, de_preds
hsize = 256
def create_model(hsize):
    en_input_layer, enc_out, enc_states = encoder(hsize)
    de_input_layer, de_preds = decoder(hsize, enc_states)
    model = Model(inputs=[en_input_layer, de_input_layer], outputs=de_preds)
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=["acc"])
    return model
### Model training
m = create_model(hsize)
history = m.fit(
    train_ds.repeat(),
    steps_per_epoch=100,
    epochs=100,
    validation_data=val_ds,
    callbacks=[
        tf.keras.callbacks.ModelCheckpoint('./checkpoints_trial_1',
                                           save_weights_only=True),
        tf.keras.callbacks.EarlyStopping(patience=3)])
The model summary is below:
Layer (type)                                     Output Shape                      Param #   Connected to
==========================================================================================================
encoder_input_layer (InputLayer)                 [(None, None)]                    0         []
decoder_input_layer (InputLayer)                 [(None, None)]                    0         []
encoder_embedding_layer (Embedding)              (None, None, 200)                 437200    ['encoder_input_layer[0][0]']
decode_embedding_layer (Embedding)               (None, None, 200)                 244200    ['decoder_input_layer[0][0]']
encoder_gru_layer_1 (GRU)                        [(None, None, 256), (None, 256)]  351744    ['encoder_embedding_layer[0][0]']
decoder_gru_layer_1 (GRU)                        (None, None, 256)                 351744    ['decode_embedding_layer[0][0]', 'encoder_gru_layer_1[0][1]']
time_distributed_output_layer (TimeDistributed)  (None, None, 1220)                313540    ['decoder_gru_layer_1[0][0]']
==========================================================================================================
Total params: 1698428 (6.48 MB)
Trainable params: 1698428 (6.48 MB)
Non-trainable params: 0 (0.00 Byte)
__________________________________________________________________________________________________________
The model compiles fine, but when I run the fit method I get the following error:
ValueError: Shapes (None, None) and (None, None, 1220) are incompatible
I'm struggling to define the model's Input layers, or the preprocess_text output, in a way that works with the model definition.
Solution
Reposting from the comment above: the error occurs because categorical_crossentropy expects one-hot targets of shape (batch, seq_len, vocab_size), while the pipeline yields integer targets of shape (batch, seq_len). To fix it, we can either switch to a loss function that works on sparse (integer) labels, or transform the target labels into one-hot encodings. Below is the complete working code with some dummy data.
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, GRU, TimeDistributed, Dense
from tensorflow.keras.models import Model

# encoder() and decoder() are assumed to be defined exactly as in the question above.
make_one_hot = False  # params: True, False
num_articles = 1000
num_summaries = 1000
MAX_VOCAB_SIZE = 5000
articles = np.array([f"Article {i}" for i in range(num_articles)])
summaries = np.array([f"Summary {i}" for i in range(num_summaries)])
is_train = np.random.rand(len(articles)) < 0.8
def tf_lower_and_split_punct(text):
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, '[.?!,¿]', ' ')
    text = tf.strings.strip(text)
    text = tf.strings.join([' ', text, ' '])
    return text
BUFFER_SIZE = len(articles)
BATCH_SIZE = 64
train_raw = (tf.data.Dataset
             .from_tensor_slices((articles[is_train], summaries[is_train]))
             .shuffle(BUFFER_SIZE)
             .batch(BATCH_SIZE))
val_raw = (tf.data.Dataset
           .from_tensor_slices((articles[~is_train], summaries[~is_train]))
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE))
context_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)
target_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)
context_vectorizer.adapt(train_raw.map(lambda context, target: context))
target_vectorizer.adapt(train_raw.map(lambda context, target: target))
def preprocess_text(context, target):
    context = context_vectorizer(context).to_tensor()
    target = target_vectorizer(target)
    target_in = target[:, :-1].to_tensor()
    target_out = target[:, 1:].to_tensor()
    if make_one_hot:
        target_out = tf.one_hot(
            target_out,
            depth=tf.cast(
                target_vectorizer.vocabulary_size(), dtype='int32'
            )
        )
    return (context, target_in), target_out
train_ds = train_raw.map(preprocess_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(preprocess_text, tf.data.AUTOTUNE)
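# Optional sanity check of the label shape the pipeline emits:
# with make_one_hot=False the targets are integer ids of shape (None, None);
# with make_one_hot=True they are one-hot vectors of shape (None, None, vocab_size).
print(train_ds.element_spec)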
def create_model(hsize):
    en_input_layer, enc_out, enc_states = encoder(hsize)
    de_input_layer, de_preds = decoder(hsize, enc_states)
    model = Model(inputs=[en_input_layer, de_input_layer], outputs=de_preds)
    if make_one_hot:
        loss_fn = 'categorical_crossentropy'
    else:
        loss_fn = 'sparse_categorical_crossentropy'
    model.compile(
        optimizer='adam',
        loss=loss_fn,
        metrics=["acc"]
    )
    return model
hsize = 256  # same hidden size as in the question
model = create_model(hsize)
model.fit(train_ds)
5s 24ms/step - loss: 6.7114 - acc: 0.003
<keras.callbacks.History at 0x7bfef0423f40>
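For completeness, here is a standalone illustration (dummy shapes, not the actual model) of what each loss expects: the sparse loss takes integer labels of shape (batch, seq_len), while categorical_crossentropy needs one-hot labels of shape (batch, seq_len, vocab_size).
import tensorflow as tf

batch, seq_len, vocab = 4, 10, 1220
preds = tf.random.uniform((batch, seq_len, vocab))  # stand-in for the decoder's softmax output
int_labels = tf.random.uniform((batch, seq_len), maxval=vocab, dtype=tf.int32)

# Option 1: keep integer labels of shape (batch, seq_len) and use the sparse loss.
sparse_loss = tf.keras.losses.sparse_categorical_crossentropy(int_labels, preds)

# Option 2: one-hot the labels to (batch, seq_len, vocab) and keep categorical_crossentropy.
onehot_labels = tf.one_hot(int_labels, depth=vocab)
cat_loss = tf.keras.losses.categorical_crossentropy(onehot_labels, preds)

print(sparse_loss.shape, cat_loss.shape)  # both (4, 10): one loss value per token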
Reference
Selecting loss and metrics for Tensorflow model
Answered By - Innat