Issue
I'm trying to write a custom training loop. After creating the model, I have added some extra trainable parameter to some layers of my model. I have used these extra parameters to update my original parameter on every forward pass. But when I'm calculating the gradient, it's giving None for the extra parameter that i have added last. Code is given below:
model = Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(1,1)))
model.add(Dense(1, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.layers[1].add_weight(name="x1", shape=(1,), initializer=tf.keras.initializers.Constant(value=1.0),trainable=True)
dataset = tf.data.Dataset.from_tensor_slices((feature, labels))
for i, (x_batch_train, y_batch_train) in enumerate(dataset):
with tf.GradientTape() as tape:
for par in model.layers[1].trainable_weights:
if "x1" in par.name:
bits = tf.convert_to_tensor(par)
for par in model.layers[1].trainable_weights:
if "kernel" in par.name:
par = bits + 1.0
x = model(x_batch_train, training = True)
loss = tf.keras.losses.SparseCategoricalCrossentropy(y_batch_train, x)
val = tape.gradient(loss, model.trainable_weights)
for v in val:
print(v)
Here, I have added one extra parameter called x1
and it's updating the kernel
of Dense layer. But I'm getting None gradient for x1
parameter. The output is:
tf.Tensor([[0.]], shape=(1, 1), dtype=float32)
tf.Tensor([-0.], shape=(1,), dtype=float32)
None
tf.Tensor([[0. 0.]], shape=(1, 2), dtype=float32)
tf.Tensor([-0.5 0.5], shape=(2,), dtype=float32)
Why it's happening?
Solution
The problem is that the changes you are making to the layer's weights have no direct connection to the output of the model in the context of tf.GradientTape
and are therefore not tracked. You could solve this with a simple custom layer:
import tensorflow as tf
class DenseLayer(tf.keras.layers.Layer):
def __init__(self, units=1):
super(DenseLayer, self).__init__()
self.units = units
def build(self, input_shape):
self.w = self.add_weight("kernel",
shape=[int(input_shape[-1]),
self.units], trainable=True)
self.b = self.add_weight(shape=(self.units,), initializer="zeros", trainable=True)
self.bits = self.add_weight(name="x1", shape=[int(input_shape[-1]),
self.units], initializer=tf.keras.initializers.ones(), trainable=True)
def call(self, inputs):
return tf.nn.relu(tf.matmul(inputs, (self.w + self.bits + 1.0)) + self.b)
dense_layer = DenseLayer(1)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(1,1)))
model.add(dense_layer)
model.add(tf.keras.layers.Dense(2, activation='softmax'))
print(model.summary())
dataset = tf.data.Dataset.from_tensor_slices((tf.random.normal((50, 1, 1)), tf.random.uniform((50, ), maxval=2, dtype=tf.int32))).batch(2)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
for i, (x_batch_train, y_batch_train) in enumerate(dataset):
with tf.GradientTape() as tape:
y = model(x_batch_train, training = True)
loss = loss_fn(y_batch_train, y)
val = tape.gradient(loss, model.trainable_weights)
for v in val:
print(v)
optimizer.apply_gradients(zip(val, model.trainable_variables))
Answered By - AloneTogether
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.