Issue
I'm developing a neural network from scratch. The issue seems to be with my ReLU back-propagation. When I train the model, it sometimes outputs -0 and sometimes outputs (relatively) good predictions. Can someone tell me whether I'm doing my back-propagation incorrectly, or whether there's a reason why my ReLU would be predicting -0?
-- [edit]
Fixed the issue of predicting -0, but now it just predicts 0 for all of the XOR inputs. Can someone look over my back-propagation?
import numpy as np


# Each layer in our neural network
class NeuralLayer:
    def __init__(self, input_neurons, output_neurons):
        self.weights = np.random.randn(input_neurons, output_neurons) * np.sqrt(2. / input_neurons)
        self.bias = np.ones((1, output_neurons)) * 0.5

    # Two different activations, sigmoid by default
    def sigmoid(self, neurons):
        self.act = 1.0 / (1.0 + np.exp(-neurons))
        return self.act

    def sigmoidBackward(self, grad):
        return grad * self.act * (1 - self.act)

    def relu(self, neurons):
        self.act = (neurons > 0)
        return neurons * self.act

    def reluBackward(self, grad):
        return grad * self.act

    # Forward pass for this layer
    def forward(self, input, activation):
        self.input = np.atleast_2d(input)
        if activation == 'sigmoid':
            return self.sigmoid(input @ self.weights + self.bias)
        else:
            return self.relu(input @ self.weights + self.bias)

    # Backward pass for this layer
    def backward(self, grad, activation):
        if activation == 'sigmoid':
            grad = self.sigmoidBackward(np.atleast_2d(grad))
        else:
            grad = self.reluBackward(np.atleast_2d(grad))
        self.grad_weights = np.matmul(self.input.T, grad)
        self.grad_bias = grad.sum()
        return grad @ self.weights.T

    def step(self, step_size):
        self.weights -= step_size * self.grad_weights
        self.bias -= step_size * self.grad_bias


# Our neural net
class NeuralNetwork:
    # Dynamically create all layers
    def __init__(self, input_neurons, hidden_neurons, layer_count, activation, output_neurons=1):
        self.activation = activation
        # Used to ensure input neurons match inputted data
        self.neuron_safety = input_neurons
        assert layer_count >= 2 and output_neurons >= 1
        # Input layer
        self.layers = [NeuralLayer(input_neurons, hidden_neurons)]
        # Hidden layers
        for i in range(layer_count - 2):
            self.layers.append(NeuralLayer(hidden_neurons, hidden_neurons))
        # Output layer
        self.layers.append(NeuralLayer(hidden_neurons, output_neurons))

    # Forward pass for each layer
    def forward(self, inp):
        assert inp.shape[0] == self.neuron_safety
        for layer in self.layers:
            inp = layer.forward(inp, self.activation)
        return inp

    def backward(self, grad):
        for layer in reversed(self.layers):
            grad = layer.backward(grad, self.activation)

    def step(self, step_size=0.01):
        for layer in self.layers:
            layer.step(step_size)

    # Loss function - only 1 output neuron
    def meanSquaredError(self, preds, labels):
        self.labels = labels
        self.preds = preds
        return (self.preds - self.labels) ** 2

    def meanSquaredErrorGrad(self):
        return 2 * (self.preds - self.labels)
# Create a neural network with 2 inputs, 16 hidden neurons in each hidden layer, and 4 layers
net = NeuralNetwork(2, 16, 4, 'relu')
epochs = 5000

# Input data (A, B) for XOR
X = np.array([[0, 0], [1, 1], [1, 0], [0, 1]])
# Expected output data
Y = np.array([[0], [0], [1], [1]])

for i in range(epochs):
    preds = []
    for idx, x in enumerate(X):
        predictions = net.forward(x)
        preds.append(predictions)
        loss = net.meanSquaredError(predictions, Y[idx])
        loss_grad = net.meanSquaredErrorGrad()
        net.backward(loss_grad)
        net.step()

print("Model predicted: {}\nactual values: {} ".format(preds, Y.T))
Output:
Model predicted: [array([[-0.]]), array([[-0.]]), array([[1.]]), array([[-0.]])]
actual values: [[0 0 1 1]]
Sometimes the predictions are perfect, but most of the time at least one prediction will be -0.
Solution
The bias gradient is incorrect. You are using self.grad_bias = grad.sum(), which computes the sum of the entire matrix. It needs to be self.grad_bias = grad.sum(axis=0, keepdims=True) to compute a 1 x output_neurons array that will properly update the bias vector. Otherwise, grad.sum() provides a single number that you are using to update all of your biases, which is not correct.
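As a quick shape check (the gradient array below is made up purely for illustration), summing the whole matrix collapses it to a scalar, while summing along axis 0 with keepdims=True yields a (1, output_neurons) row that matches the shape of self.bias:

import numpy as np

# Hypothetical gradient for a batch of 4 samples and a layer with 3 output neurons
grad = np.random.randn(4, 3)

wrong = grad.sum()                       # a single scalar, applied to every bias
right = grad.sum(axis=0, keepdims=True)  # shape (1, 3), one entry per bias

print(np.shape(wrong))  # ()
print(right.shape)      # (1, 3)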
Also, make sure you update the forward pass of your ReLU to use np.maximum(neurons, 0), as described in the comments:
    def relu(self, neurons):
        self.act = (neurons > 0)
        return np.maximum(neurons, 0)
The gradient of the activations will be 0 or 1 depending on which parts of the inputs were positive.
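A tiny illustration of that mask (the numbers are made up for this example): self.act = (neurons > 0) stores a boolean 0/1 mask, and multiplying the incoming gradient by it zeros out the positions whose pre-activations were not positive.

import numpy as np

neurons = np.array([[-2.0, 0.5, 3.0]])
act = neurons > 0                      # boolean mask: [[False, True, True]]
incoming_grad = np.array([[0.4, 0.4, 0.4]])

# grad * act keeps the gradient where the input was positive and zeros it elsewhere
print(incoming_grad * act)             # [[0.  0.4 0.4]]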
Finally, for the XOR problem you typically do not use ReLU as the activation of the output layer, because it is not bounded to the [0, 1] range that the XOR targets require. The reason you got good results with the sigmoid activation function is that its dynamic range suits the XOR problem well. As an experiment, you can make the output layer sigmoid and the hidden layers ReLU. If you do this, you should get performance just as good as using sigmoid all the way through.
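One way to run that experiment is sketched below against the NeuralNetwork class above; this per-layer choice of activation is only an assumption about how the change might look, not code from the original post. The idea is to pick the activation per layer in forward and backward so the hidden layers use ReLU and only the output layer uses sigmoid:

    # Inside class NeuralNetwork:
    # Sketch: ReLU for the hidden layers, sigmoid for the output layer
    def forward(self, inp):
        assert inp.shape[0] == self.neuron_safety
        last = len(self.layers) - 1
        for i, layer in enumerate(self.layers):
            inp = layer.forward(inp, 'sigmoid' if i == last else 'relu')
        return inp

    def backward(self, grad):
        last = len(self.layers) - 1
        for i in range(last, -1, -1):
            grad = self.layers[i].backward(grad, 'sigmoid' if i == last else 'relu')

Everything else (step, the loss, and the training loop) can stay the same.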
Answered By - rayryeng