Issue
I'm building a seq2seq model with an encoder-decoder architecture, for which I have built a tf.data.Dataset pipeline that reads the text from directories, vectorizes it using tf.keras.layers.TextVectorization, and preprocesses it to be fed to the model for training. I'm not able to format my labels so that they have the shape (None, seq_len, target_vocab_size). I tried mapping tf.keras.utils.to_categorical over the labels, but it won't work on the tensors. Strangely, I couldn't find any material where a similar problem was discussed.
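To illustrate what I mean by "it won't work on the tensors", here is a minimal standalone sketch (made-up token ids, not my real data); to_categorical is a NumPy-based utility, so it fails on the symbolic tensors that Dataset.map passes in:
import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices([[1, 2, 3], [4, 5, 0]]).batch(2)

# Raises an error: to_categorical calls np.array() on a symbolic tensor,
# which cannot be converted inside the Dataset.map graph.
ds.map(lambda y: tf.keras.utils.to_categorical(y, num_classes=6))
Below is my implementation: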
BUFFER_SIZE = len(articles)
BATCH_SIZE = 64
train_raw = (tf.data.Dataset
             .from_tensor_slices((articles[is_train], summaries[is_train]))
             .shuffle(BUFFER_SIZE)
             .batch(BATCH_SIZE))
val_raw = (tf.data.Dataset
           .from_tensor_slices((articles[~is_train], summaries[~is_train]))
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE))
context_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)
target_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)
context_vectorizer.adapt(train_raw.map(lambda context, target: context))
target_vectorizer.adapt(train_raw.map(lambda context, target: target))
def preprocess_text(context, target):
    context = context_vectorizer(context).to_tensor()
    target = target_vectorizer(target)
    target_in = target[:, :-1].to_tensor()
    target_out = target[:, 1:].to_tensor()
    # target_out = target[:,:-1]
    return (context, target_in), target_out
train_ds = train_raw.map(preprocess_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(preprocess_text, tf.data.AUTOTUNE)
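# Inspecting the mapped dataset shows the problem: the target spec is (None, None),
# i.e. plain integer token ids, not the (None, seq_len, target_vocab_size) shape I'm after.
print(train_ds.element_spec)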
def encoder(hsize, embed_dim=200):
    en_input_layer = Input(shape=(None,), name='encoder_input_layer', ragged=True)
    en_embed = Embedding(context_vectorizer.vocabulary_size()+1, output_dim=embed_dim, name='encoder_embedding_layer')
    en_embed_out = en_embed(en_input_layer)
    en_gru_1 = GRU(hsize, return_sequences=True, return_state=True, name='encoder_gru_layer_1')
    en_gru_1_out, en_gru_states = en_gru_1(en_embed_out)
    return en_input_layer, en_gru_1_out, en_gru_states
def decoder(hsize, encoder_states, embed_dim=200):
    de_input_layer = Input(shape=(None,), name='decoder_input_layer', ragged=True)
    de_embed = Embedding(target_vectorizer.vocabulary_size()+1, output_dim=embed_dim, name='decode_embedding_layer')
    de_embed_out = de_embed(de_input_layer)
    de_gru_1 = GRU(hsize, return_sequences=True, name='decoder_gru_layer_1')
    de_gru_1_out = de_gru_1(de_embed_out, initial_state=encoder_states)
    de_dense = TimeDistributed(Dense(target_vectorizer.vocabulary_size(), activation='softmax'), name='time_distributed_output_layer')
    de_preds = de_dense(de_gru_1_out)
    return de_input_layer, de_preds
hsize = 256
def create_model(hsize):
    en_input_layer, enc_out, enc_states = encoder(hsize)
    de_input_layer, de_preds = decoder(hsize, enc_states)
    model = Model(inputs=[en_input_layer, de_input_layer], outputs=de_preds)
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=["acc"])
    return model
### Model training
m = create_model(hsize)
history = m.fit(
    train_ds.repeat(),
    steps_per_epoch=100,
    epochs=100,
    validation_data=val_ds,
    callbacks=[
        tf.keras.callbacks.ModelCheckpoint('./checkpoints_trial_1',
                                           save_weights_only=True),
        tf.keras.callbacks.EarlyStopping(patience=3)])
The model summary is below:
Layer (type)                                     Output Shape                      Param #   Connected to
==========================================================================================================
encoder_input_layer (InputLayer)                 [(None, None)]                    0         []
decoder_input_layer (InputLayer)                 [(None, None)]                    0         []
encoder_embedding_layer (Embedding)              (None, None, 200)                 437200    ['encoder_input_layer[0][0]']
decode_embedding_layer (Embedding)               (None, None, 200)                 244200    ['decoder_input_layer[0][0]']
encoder_gru_layer_1 (GRU)                        [(None, None, 256), (None, 256)]  351744    ['encoder_embedding_layer[0][0]']
decoder_gru_layer_1 (GRU)                        (None, None, 256)                 351744    ['decode_embedding_layer[0][0]', 'encoder_gru_layer_1[0][1]']
time_distributed_output_layer (TimeDistributed)  (None, None, 1220)                313540    ['decoder_gru_layer_1[0][0]']
==========================================================================================================
Total params: 1698428 (6.48 MB)
Trainable params: 1698428 (6.48 MB)
Non-trainable params: 0 (0.00 Byte)
__________________________________________________________________________________________________________
The model compiles fine, but when I run the fit method I get the following error:
ValueError: Shapes (None, None) and (None, None, 1220) are incompatible
I'm struggling to define the model's Input layers, or the preprocess_text output, in a way that works with the model definition.
Solution
Reposting from the comment above: the error occurs because categorical_crossentropy expects one-hot targets of shape (batch, seq_len, vocab_size), while the pipeline yields integer targets of shape (batch, seq_len). To fix it, we can either switch to a loss function that works on sparse (integer) labels, or transform the target labels into one-hot encodings. Below is the complete working code with some dummy data.
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, GRU, TimeDistributed, Dense
from tensorflow.keras.models import Model

# encoder() and decoder() are assumed to be defined exactly as in the question above.
make_one_hot = False  # params: True, False
num_articles = 1000
num_summaries = 1000
MAX_VOCAB_SIZE = 5000
articles = np.array([f"Article {i}" for i in range(num_articles)])
summaries = np.array([f"Summary {i}" for i in range(num_summaries)])
is_train = np.random.rand(len(articles)) < 0.8
def tf_lower_and_split_punct(text):
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, '[.?!,¿]', ' ')
    text = tf.strings.strip(text)
    text = tf.strings.join([' ', text, ' '])
    return text
BUFFER_SIZE = len(articles)
BATCH_SIZE = 64
train_raw = (tf.data.Dataset
             .from_tensor_slices((articles[is_train], summaries[is_train]))
             .shuffle(BUFFER_SIZE)
             .batch(BATCH_SIZE))
val_raw = (tf.data.Dataset
           .from_tensor_slices((articles[~is_train], summaries[~is_train]))
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE))
context_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)
target_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)
context_vectorizer.adapt(train_raw.map(lambda context, target: context))
target_vectorizer.adapt(train_raw.map(lambda context, target: target))
def preprocess_text(context, target):
    context = context_vectorizer(context).to_tensor()
    target = target_vectorizer(target)
    target_in = target[:, :-1].to_tensor()
    target_out = target[:, 1:].to_tensor()
    if make_one_hot:
        target_out = tf.one_hot(
            target_out,
            depth=tf.cast(
                target_vectorizer.vocabulary_size(), dtype='int32'
            )
        )
    return (context, target_in), target_out
train_ds = train_raw.map(preprocess_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(preprocess_text, tf.data.AUTOTUNE)
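# Optional sanity check of the label shape the pipeline emits:
# with make_one_hot=False the targets are integer ids of shape (None, None);
# with make_one_hot=True they are one-hot vectors of shape (None, None, vocab_size).
print(train_ds.element_spec)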
def create_model(hsize):
    en_input_layer, enc_out, enc_states = encoder(hsize)
    de_input_layer, de_preds = decoder(hsize, enc_states)
    model = Model(inputs=[en_input_layer, de_input_layer], outputs=de_preds)
    if make_one_hot:
        loss_fn = 'categorical_crossentropy'
    else:
        loss_fn = 'sparse_categorical_crossentropy'
    model.compile(
        optimizer='adam',
        loss=loss_fn,
        metrics=["acc"]
    )
    return model
hsize = 256  # same hidden size as in the question
model = create_model(hsize)
model.fit(train_ds)
5s 24ms/step - loss: 6.7114 - acc: 0.003
<keras.callbacks.History at 0x7bfef0423f40>
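For completeness, here is a standalone illustration (dummy shapes, not the actual model) of what each loss expects: the sparse loss takes integer labels of shape (batch, seq_len), while categorical_crossentropy needs one-hot labels of shape (batch, seq_len, vocab_size).
import tensorflow as tf

batch, seq_len, vocab = 4, 10, 1220
preds = tf.random.uniform((batch, seq_len, vocab))  # stand-in for the decoder's softmax output
int_labels = tf.random.uniform((batch, seq_len), maxval=vocab, dtype=tf.int32)

# Option 1: keep integer labels of shape (batch, seq_len) and use the sparse loss.
sparse_loss = tf.keras.losses.sparse_categorical_crossentropy(int_labels, preds)

# Option 2: one-hot the labels to (batch, seq_len, vocab) and keep categorical_crossentropy.
onehot_labels = tf.one_hot(int_labels, depth=vocab)
cat_loss = tf.keras.losses.categorical_crossentropy(onehot_labels, preds)

print(sparse_loss.shape, cat_loss.shape)  # both (4, 10): one loss value per token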
Reference
Selecting loss and metrics for Tensorflow model
Answered By - Innat