Issue
How can one calculate the gradient of a variable with respect to another variable that is used in a linear combination? The following code is executed in TensorFlow eager mode.
After some more digging in older questions, a similar question showed up, but it is not clear how to solve this issue from it.
Another related question exists, but there the same variable is reused and it targets TensorFlow v1.
I also read in yet another question that tf.assign (v1?) does not support gradients, and a potential solution is provided there. However, I'd like to apply this to the internal model weights of neural networks, and I don't know how to apply that tensor-based approach in practice.
import tensorflow as tf

a = tf.Variable(1.0, name='a')
b = tf.Variable(2.0, name='b')
c = tf.Variable(3.0, name='c')

with tf.GradientTape() as tape:
    c.assign(a + b)
    loss = tf.reduce_mean(c**2)
print(tape.gradient(loss, b))  # prints None

# or another attempt
with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch([b, c])
    c.assign(a + b)
    loss = tf.reduce_mean(c**2)
print(tape.gradient(loss, b))  # also outputs None

# Working, but c is a variable in my use case
with tf.GradientTape() as tape:
    c = a + b
    loss = tf.reduce_mean(c**2)
print(tape.gradient(loss, b))  # works
Extension:
import tensorflow as tf

a = [tf.Variable(1.0, name='a'), tf.Variable(4.0, name='aa')]
b = [tf.Variable(2.0, name='b'), tf.Variable(9.0, name='bb')]
c = [tf.Variable(3.0, name='c'), tf.Variable(0.0, name='cc')]
x = tf.Variable(0.01)

with tf.GradientTape(persistent=True) as tape:
    c_ = tf.nest.map_structure(lambda _a, _b: (1 - x) * _a + x * _b, a, b)
    tf.nest.map_structure(lambda x, y: x.assign(y), c, c_)
    loss = tf.norm(c)  # scalar

# This works as expected
print(tape.gradient(loss, c, output_gradients=tape.gradient(c_, b)))
# [<tf.Tensor: shape=(), dtype=float32, numpy=0.0024197185>, <tf.Tensor: shape=(), dtype=float32, numpy=0.009702832>]

# Here I would expect a 1-D gradient to use with the gradient descent method?
print(tape.gradient(loss, c, output_gradients=tape.gradient(c_, x)))
# [<tf.Tensor: shape=(), dtype=float32, numpy=1.4518311>, <tf.Tensor: shape=(), dtype=float32, numpy=5.8216996>]

# Example of what I'd like to achieve:
with tf.GradientTape() as tape:
    c_ = tf.nest.map_structure(lambda _a, _b: (1 - x) * _a + x * _b, a, b)
    loss = tf.norm(c_)  # scalar
print(tape.gradient(loss, x))
# tf.Tensor(5.0933886, shape=(), dtype=float32)
A more sophisticated issue:
import tensorflow as tf

a = [tf.Variable([1.0, 2.0], name='a'), tf.Variable([5.0], name='aa'), tf.Variable(7.0, name='aaa')]
b = [tf.Variable([3.0, 4.0], name='b'), tf.Variable([6.0], name='bb'), tf.Variable(8.0, name='bbb')]
c = [tf.Variable([1.0, 1.0], name='c'), tf.Variable([1.0], name='cc'), tf.Variable(1.0, name='ccc')]
x = tf.Variable(0.5, name='x')

with tf.GradientTape(persistent=True) as tape:
    c_ = tf.nest.map_structure(lambda _a, _b: (1 - x) * _a + x * _b, a, b)
    tf.nest.map_structure(lambda x, y: x.assign(y), c, c_)
    loss = tf.norm(tf.nest.map_structure(lambda e: tf.norm(e), c))
    loss_without_assign = tf.norm(tf.nest.map_structure(lambda e: tf.norm(e), c_))

print(loss, loss_without_assign)
# tf.Tensor(9.974969, shape=(), dtype=float32) tf.Tensor(9.974969, shape=(), dtype=float32)
# Gives the same result

# partial_grads = tf.nest.map_structure(lambda d, e: tf.nest.map_structure(lambda f, g: tape.gradient(loss, f, output_gradients=tape.gradient(g, x)), d, e), c, c_)
partial_grads = tf.nest.map_structure(lambda d, e: tape.gradient(loss, d, output_gradients=tape.gradient(e, x)), c, c_)

# Should this not use mean?
print(tf.reduce_sum(tf.nest.map_structure(lambda z: tf.reduce_mean(z), partial_grads)))
print(tape.gradient(loss_without_assign, x))
# Rather close:
# tf.Tensor(2.3057716, shape=(), dtype=float32)
# tf.Tensor(2.3057709, shape=(), dtype=float32)
Solution
Maybe you can try the following:
import tensorflow as tf

a = tf.Variable(1.0, name='a')
b = tf.Variable(2.0, name='b')
c = tf.Variable(3.0, name='c')

with tf.GradientTape(persistent=True) as tape:
    c_ = a + 2*b
    c.assign(c_)
    loss = tf.reduce_mean(c**2)

print(tape.gradient(loss, c, output_gradients=tape.gradient(c_, b)))
# tf.Tensor(20.0, shape=(), dtype=float32)
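As a quick sanity check (my own addition, not part of the original answer; it reuses a and b from above and introduces the hypothetical names tape2, c_direct and loss_direct), the same value comes out when c_ is differentiated directly, without the intermediate assign:

# Sanity check (assumed illustration): skip the assign and differentiate
# through the expression directly; the chained result above should match.
with tf.GradientTape() as tape2:
    c_direct = a + 2*b
    loss_direct = tf.reduce_mean(c_direct**2)
print(tape2.gradient(loss_direct, b))
# tf.Tensor(20.0, shape=(), dtype=float32)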
P.S. output_gradients is a parameter of tf.GradientTape.gradient that is tucked away and rarely used; it can be used to manually build a cascaded (chain-rule) differentiation.
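To make the mechanics concrete, here is a minimal sketch (my own illustration, not from the original answer; the names v, t and y are arbitrary) of what output_gradients does: the value you pass in multiplies the computed gradient, which is exactly the multiplication step of the chain rule done by hand.

# Minimal sketch (assumed illustration) of output_gradients:
v = tf.Variable(2.0)
with tf.GradientTape(persistent=True) as t:
    y = 3.0 * v                                           # dy/dv = 3
print(t.gradient(y, v))                                   # tf.Tensor(3.0, shape=(), dtype=float32)
print(t.gradient(y, v, output_gradients=tf.constant(5.0)))
# tf.Tensor(15.0, shape=(), dtype=float32)  -> 5 * dy/dv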
- For Extension:
import tensorflow as tf

a = [tf.Variable(1.0, name='a'), tf.Variable(4.0, name='aa')]
b = [tf.Variable(2.0, name='b'), tf.Variable(9.0, name='bb')]
c = [tf.Variable(3.0, name='c'), tf.Variable(0.0, name='cc')]
x = tf.Variable(0.0, name='x')

with tf.GradientTape(persistent=True) as tape:
    c_ = tf.nest.map_structure(lambda _a, _b: (1 - x) * _a + x * _b, a, b)
    tf.nest.map_structure(lambda x, y: x.assign(y), c, c_)
    loss = tf.norm(c)  # scalar

print(tape.gradient(loss, c[0], output_gradients=tape.gradient(c_[0], x)) +
      tape.gradient(loss, c[1], output_gradients=tape.gradient(c_[1], x)))
# tf.Tensor(5.0932484, shape=(), dtype=float32)
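As a cross-check (my addition, reusing a, b and x from the snippet above; tape2, c_direct and loss_direct are names I introduce here), the value matches the gradient taken directly through c_ without the assign:

# Cross-check (assumed illustration): differentiate the linear combination
# with respect to x directly, without going through the assigned Variables.
with tf.GradientTape() as tape2:
    c_direct = tf.nest.map_structure(lambda _a, _b: (1 - x) * _a + x * _b, a, b)
    loss_direct = tf.norm(c_direct)
print(tape2.gradient(loss_direct, x))
# tf.Tensor(5.0932484, shape=(), dtype=float32)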
Explanation:
tf.GradientTape is based on matrix differential calculus, but .gradient() collects all derivatives belonging to the same variable and adds them together. For example, when differentiating a vector with respect to a scalar, matrix calculus gives a vector of derivatives, whereas tf.GradientTape applies a reduce_sum-like operation and returns the summed scalar.
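A minimal sketch of that summation behaviour (my own illustration, not from the original answer; v, t and y are arbitrary names): .gradient() of a vector target with respect to a scalar returns the sum of the per-element derivatives, while .jacobian() keeps them separate.

# Minimal sketch (assumed illustration) of gradient vs. jacobian:
v = tf.Variable(2.0)
with tf.GradientTape(persistent=True) as t:
    y = tf.stack([v, 3.0 * v])        # element-wise derivatives: [1, 3]
print(t.gradient(y, v))               # tf.Tensor(4.0, shape=(), dtype=float32) -> summed
print(t.jacobian(y, v))               # tf.Tensor([1. 3.], shape=(2,), dtype=float32) -> per element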
Here tape.gradient(loss, c, output_gradients=tape.gradient(c_, x)) actually did:
tape.gradient(loss, c[0], output_gradients=tape.gradient(c_, x)),
tape.gradient(loss, c[1], output_gradients=tape.gradient(c_, x))
but tape.gradient(c_, x) is the summed scalar, i.e.
tape.gradient(c_, x) != tape.gradient(c_[0], x)
tape.gradient(c_, x) != tape.gradient(c_[1], x)
So tape.gradient(loss, c, output_gradients=tape.gradient(c_, x)) is contrary to our original intention.
- For the more sophisticated issue:
jacobian is needed:
import tensorflow as tf

tf.keras.utils.set_random_seed(0)
a = [tf.Variable(tf.random.normal(shape=[2])), tf.Variable(tf.random.normal(shape=[1])), tf.Variable(tf.random.normal(shape=[]))]
b = [tf.Variable(tf.random.normal(shape=[2])), tf.Variable(tf.random.normal(shape=[1])), tf.Variable(tf.random.normal(shape=[]))]
c = [tf.Variable(tf.random.normal(shape=[2])), tf.Variable(tf.random.normal(shape=[1])), tf.Variable(tf.random.normal(shape=[]))]
x = tf.Variable(tf.random.normal(shape=[]), name='x')

with tf.GradientTape(persistent=True) as tape:
    c_ = tf.nest.map_structure(lambda _a, _b: (1 - x) * _a + x * _b, a, b)
    tf.nest.map_structure(lambda x, y: x.assign(y), c, c_)
    loss = tf.norm(tf.nest.map_structure(lambda e: tf.norm(e), c))
    loss_without_assign = tf.norm(tf.nest.map_structure(lambda e: tf.norm(e), c_))

print(loss, loss_without_assign)

print(tf.reduce_sum([
    tf.reduce_sum(tape.jacobian(c_[0], x) * tape.gradient(loss, c[0])),
    tf.reduce_sum(tape.jacobian(c_[1], x) * tape.gradient(loss, c[1])),
    tf.reduce_sum(tape.jacobian(c_[2], x) * tape.gradient(loss, c[2]))
]))
# tf.Tensor(0.7263656, shape=(), dtype=float32)
print(tape.gradient(loss_without_assign, x))
# tf.Tensor(0.7263656, shape=(), dtype=float32)
Answered By - Little Train