Improving Deep Neural Networks - Week 1 Programming Assignments (Initialization, Regularization, Gradient Checking)

Initialization

Initializing the parameters with zeros, with random values, and with He initialization, and comparing the results, shows that He initialization gives the highest accuracy.
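For reference, here are minimal sketches of the two alternatives being compared; only the He version is reproduced in full further below. The details (the random seed and the ×10 scale on the random weights) follow the course assignment's convention and are assumptions here, not code from this post:

def initialize_parameters_zeros(layers_dims):
    # All weights and biases start at zero: every unit in a layer computes the same
    # output, symmetry is never broken, and the network fails to learn.
    parameters = {}
    L = len(layers_dims) - 1
    for l in range(1, L + 1):
        parameters['W' + str(l)] = np.zeros((layers_dims[l], layers_dims[l-1]))
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters

def initialize_parameters_random(layers_dims):
    # Large random weights break symmetry but start the sigmoid output in its
    # saturated region, so the initial cost is huge and training is slow.
    np.random.seed(3)
    parameters = {}
    L = len(layers_dims) - 1
    for l in range(1, L + 1):
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) * 10
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters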

Original data:

import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
from init_utils import sigmoid, relu, compute_loss, forward_propagation, backward_propagation
from init_utils import update_parameters, predict, load_dataset, plot_decision_boundary, predict_dec
from math import sqrt

# %matplotlib inline
plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# load image dataset: blue/red dots in circles
train_X, train_Y, test_X, test_Y = load_dataset()

The code using He initialization is as follows:

# three layers
def model(X, Y, learning_rate=0.01, num_iterations=15000, print_cost=True, initialization="he"):
    """
    Implements a three-layer neural network: LINEAR->RELU->LINEAR->RELU->LINEAR->SIGMOID.

    Arguments:
    X -- input data, of shape (2, number of examples)
    Y -- true "label" vector (containing 0 for red dots; 1 for blue dots), of shape (1, number of examples)
    learning_rate -- learning rate for gradient descent
    num_iterations -- number of iterations to run gradient descent
    print_cost -- if True, print the cost every 1000 iterations
    initialization -- flag to choose which initialization to use ("zeros","random" or "he")

    Returns:
    parameters -- parameters learnt by the model
    """

    grads = {}
    costs = []     # to keep track of the loss
    m = X.shape[1] # number of examples
    layers_dims = [X.shape[0], 10, 5, 1]

    # Initialize parameters dictionary (this version of the model always uses He initialization).
    parameters = initialize_parameters_he(layers_dims)

    # Loop (gradient descent)
    for i in range(0, num_iterations):

        # Forward propagation: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.
        a3, cache = forward_propagation(X, parameters)

        # Loss
        cost = compute_loss(a3, Y)

        # Backward propagation.
        grads = backward_propagation(X, Y, cache)

        # Update parameters.
        parameters = update_parameters(parameters, grads, learning_rate)

        # Print the loss every 1000 iterations
        if print_cost and i % 1000 == 0:
            print("Cost after iteration {}: {}".format(i, cost))
            costs.append(cost)

    # plot the loss
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('iterations (per thousands)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters


# GRADED FUNCTION: initialize_parameters_he
def initialize_parameters_he(layers_dims):
    """
    Arguments:
    layers_dims -- python array (list) containing the size of each layer.

    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                    W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
                    b1 -- bias vector of shape (layers_dims[1], 1)
                    ...
                    WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
                    bL -- bias vector of shape (layers_dims[L], 1)
    """

    np.random.seed(3)
    parameters = {}
    L = len(layers_dims) - 1 # integer representing the number of layers

    for l in range(1, L + 1):
        ### START CODE HERE ### (≈ 2 lines of code)
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) * sqrt(2. / layers_dims[l-1])
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
        ### END CODE HERE ###
    return parameters

parameters = initialize_parameters_he([2, 4, 1])
print("W1 = " + str(parameters["W1"]))
print("b1 = " + str(parameters["b1"]))
print("W2 = " + str(parameters["W2"]))
print("b2 = " + str(parameters["b2"]))


parameters = model(train_X, train_Y, initialization = "he")
print("On the train set:")
predictions_train = predict(train_X, train_Y, parameters)
print("On the test set:")
predictions_test = predict(test_X, test_Y, parameters)


plt.title("Model with He initialization")
axes = plt.gca()
axes.set_xlim([-1.5, 1.5])
axes.set_ylim([-1.5, 1.5])
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

Prediction accuracy: 0.96
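What makes He initialization work is the scaling factor applied to each weight matrix in initialize_parameters_he above:

W^{[l]} \sim \mathcal{N}(0,1)\cdot\sqrt{\frac{2}{n^{[l-1]}}}, \qquad b^{[l]} = 0

where n^{[l-1]} is the number of units in the previous layer. This keeps the variance of the activations roughly constant across the ReLU layers, so gradients neither vanish nor explode and training converges to a good decision boundary.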

L2 Regularization

Original data:

import numpy as np
import matplotlib.pyplot as plt
import scipy.io
from reg_utils import sigmoid, relu, plot_decision_boundary, initialize_parameters, load_2D_dataset, predict_dec
from reg_utils import compute_cost, predict, forward_propagation, backward_propagation, update_parameters
from testCases_v3 import *

train_X, train_Y, test_X, test_Y = load_2D_dataset()

Without regularization:

def model(X, Y, learning_rate = 0.3, num_iterations = 30000, print_cost = True, lambd = 0, keep_prob = 1):
    """
    Implements a three-layer neural network: LINEAR->RELU->LINEAR->RELU->LINEAR->SIGMOID.

    Arguments:
    X -- input data, of shape (input size, number of examples)
    Y -- true "label" vector (1 for blue dot / 0 for red dot), of shape (output size, number of examples)
    learning_rate -- learning rate of the optimization
    num_iterations -- number of iterations of the optimization loop
    print_cost -- If True, print the cost every 10000 iterations
    lambd -- regularization hyperparameter, scalar
    keep_prob - probability of keeping a neuron active during drop-out, scalar.

    Returns:
    parameters -- parameters learned by the model. They can then be used to predict.
    """

    grads = {}
    costs = []                            # to keep track of the cost
    m = X.shape[1]                        # number of examples
    layers_dims = [X.shape[0], 20, 3, 1]

    # Initialize parameters dictionary.
    parameters = initialize_parameters(layers_dims)

    # Loop (gradient descent)
    for i in range(0, num_iterations):

        # Forward propagation: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.
        if keep_prob == 1:
            a3, cache = forward_propagation(X, parameters)
        elif keep_prob < 1:
            a3, cache = forward_propagation_with_dropout(X, parameters, keep_prob)

        # Cost function
        if lambd == 0:
            cost = compute_cost(a3, Y)
        else:
            cost = compute_cost_with_regularization(a3, Y, parameters, lambd)

        # Backward propagation.
        assert(lambd == 0 or keep_prob == 1)    # it is possible to use both L2 regularization and dropout,
                                                # but this assignment will only explore one at a time
        if lambd == 0 and keep_prob == 1:
            grads = backward_propagation(X, Y, cache)
        elif lambd != 0:
            grads = backward_propagation_with_regularization(X, Y, cache, lambd)
        elif keep_prob < 1:
            grads = backward_propagation_with_dropout(X, Y, cache, keep_prob)

        # Update parameters.
        parameters = update_parameters(parameters, grads, learning_rate)

        # Print the loss every 10000 iterations
        if print_cost and i % 10000 == 0:
            print("Cost after iteration {}: {}".format(i, cost))
        if print_cost and i % 1000 == 0:
            costs.append(cost)

    # plot the cost
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('iterations (x1,000)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters


parameters = model(train_X, train_Y)
print("On the training set:")
predictions_train = predict(train_X, train_Y, parameters)
print("On the test set:")
predictions_test = predict(test_X, test_Y, parameters)


plt.title("Model without regularization")
axes = plt.gca()
axes.set_xlim([-0.75, 0.40])
axes.set_ylim([-0.75, 0.65])
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

Prediction accuracy: 0.915
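The regularized cost implemented in the next block adds a penalty on the squared Frobenius norms of the weight matrices to the ordinary cross-entropy cost (this is the "formula (2)" that the docstring refers to):

J_{regularized} = -\frac{1}{m}\sum_{i=1}^{m}\Big(y^{(i)}\log a^{[3](i)} + (1-y^{(i)})\log\big(1-a^{[3](i)}\big)\Big) + \frac{\lambda}{2m}\sum_{l}\big\|W^{[l]}\big\|_F^2

Consequently each dW^{[l]} in back-propagation picks up an extra \frac{\lambda}{m}W^{[l]} term, which is the lambd * W / m added to dW1, dW2 and dW3 in the code below.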

With L2 regularization:

# GRADED FUNCTION: compute_cost_with_regularization
def compute_cost_with_regularization(A3, Y, parameters, lambd):
    """
    Implement the cost function with L2 regularization. See formula (2) above.

    Arguments:
    A3 -- post-activation, output of forward propagation, of shape (output size, number of examples)
    Y -- "true" labels vector, of shape (output size, number of examples)
    parameters -- python dictionary containing parameters of the model

    Returns:
    cost - value of the regularized loss function (formula (2))
    """
    m = Y.shape[1]
    W1 = parameters["W1"]
    W2 = parameters["W2"]
    W3 = parameters["W3"]

    cross_entropy_cost = compute_cost(A3, Y) # This gives you the cross-entropy part of the cost

    ### START CODE HERE ### (approx. 1 line)
    L2_regularization_cost = lambd / (2 * m) * (np.sum(np.square(W1)) + np.sum(np.square(W2)) + np.sum(np.square(W3)))
    ### END CODE HERE ###

    cost = cross_entropy_cost + L2_regularization_cost
    return cost

A3, Y_assess, parameters = compute_cost_with_regularization_test_case()
print("cost = " + str(compute_cost_with_regularization(A3, Y_assess, parameters, lambd = 0.1)))


# GRADED FUNCTION: backward_propagation_with_regularization
def backward_propagation_with_regularization(X, Y, cache, lambd):
    """
    Implements the backward propagation of our baseline model to which we added an L2 regularization.

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- "true" labels vector, of shape (output size, number of examples)
    cache -- cache output from forward_propagation()
    lambd -- regularization hyperparameter, scalar

    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
    """

    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y

    ### START CODE HERE ### (approx. 1 line)
    dW3 = np.dot(dZ3, A2.T) / m + lambd * W3 / m
    ### END CODE HERE ###
    db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)

    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    ### START CODE HERE ### (approx. 1 line)
    dW2 = np.dot(dZ2, A1.T) / m + lambd * W2 / m
    ### END CODE HERE ###
    db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    ### START CODE HERE ### (approx. 1 line)
    dW1 = np.dot(dZ1, X.T) / m + lambd * W1 / m
    ### END CODE HERE ###
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}
    return gradients


X_assess, Y_assess, cache = backward_propagation_with_regularization_test_case()
grads = backward_propagation_with_regularization(X_assess, Y_assess, cache, lambd=0.7)
print ("dW1 = " + str(grads["dW1"]))
print ("dW2 = " + str(grads["dW2"]))
print ("dW3 = " + str(grads["dW3"]))


parameters = model(train_X, train_Y, lambd=0.7)
print("On the train set:")
predictions_train = predict(train_X, train_Y, parameters)
print("On the test set:")
predictions_test = predict(test_X, test_Y, parameters)


plt.title("Model with L2-regularization")
axes = plt.gca()
axes.set_xlim([-0.75, 0.40])
axes.set_ylim([-0.75, 0.65])
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

Prediction accuracy: 0.93

Dropout Regularization
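Inverted dropout keeps each hidden unit with probability keep_prob and rescales the surviving activations by 1/keep_prob, so the expected activation is unchanged:

\mathbb{E}\!\left[\frac{D \odot A}{\text{keep\_prob}}\right] = \frac{\text{keep\_prob}\cdot A}{\text{keep\_prob}} = A, \qquad D_{ij} \sim \mathrm{Bernoulli}(\text{keep\_prob})

The same masks D1, D2 (cached during the forward pass) and the same 1/keep_prob scaling must be applied to dA1, dA2 in the backward pass, which is exactly what the code below does.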

# GRADED FUNCTION: forward_propagation_with_dropout
def forward_propagation_with_dropout(X, parameters, keep_prob=0.5):
    """
    Implements the forward propagation: LINEAR -> RELU + DROPOUT -> LINEAR -> RELU + DROPOUT -> LINEAR -> SIGMOID.

    Arguments:
    X -- input dataset, of shape (2, number of examples)
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
                    W1 -- weight matrix of shape (20, 2)
                    b1 -- bias vector of shape (20, 1)
                    W2 -- weight matrix of shape (3, 20)
                    b2 -- bias vector of shape (3, 1)
                    W3 -- weight matrix of shape (1, 3)
                    b3 -- bias vector of shape (1, 1)
    keep_prob - probability of keeping a neuron active during drop-out, scalar

    Returns:
    A3 -- last activation value, output of the forward propagation, of shape (1,1)
    cache -- tuple, information stored for computing the backward propagation
    """
    np.random.seed(1)

    # retrieve parameters
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    ### START CODE HERE ### (approx. 4 lines)        # Steps 1-4 below correspond to the Steps 1-4 described above.
    D1 = np.random.rand(A1.shape[0], A1.shape[1])    # Step 1: initialize matrix D1 = np.random.rand(..., ...)
    D1 = D1 < keep_prob                              # Step 2: convert entries of D1 to 0 or 1 (using keep_prob as the threshold)
    A1 = np.multiply(A1, D1)                         # Step 3: shut down some neurons of A1
    A1 /= keep_prob                                  # Step 4: scale the value of neurons that haven't been shut down
    ### END CODE HERE ###

    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)
    ### START CODE HERE ### (approx. 4 lines)
    D2 = np.random.rand(A2.shape[0], A2.shape[1])    # Step 1: initialize matrix D2 = np.random.rand(..., ...)
    D2 = D2 < keep_prob                              # Step 2: convert entries of D2 to 0 or 1 (using keep_prob as the threshold)
    A2 = np.multiply(A2, D2)                         # Step 3: shut down some neurons of A2
    A2 /= keep_prob                                  # Step 4: scale the value of neurons that haven't been shut down
    ### END CODE HERE ###

    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    cache = (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3)
    return A3, cache

X_assess, parameters = forward_propagation_with_dropout_test_case()
A3, cache = forward_propagation_with_dropout(X_assess, parameters, keep_prob=0.7)
print ("A3 = " + str(A3))


# GRADED FUNCTION: backward_propagation_with_dropout
def backward_propagation_with_dropout(X, Y, cache, keep_prob):
    """
    Implements the backward propagation of our baseline model to which we added dropout.

    Arguments:
    X -- input dataset, of shape (2, number of examples)
    Y -- "true" labels vector, of shape (output size, number of examples)
    cache -- cache output from forward_propagation_with_dropout()
    keep_prob - probability of keeping a neuron active during drop-out, scalar

    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
    """

    m = X.shape[1]
    (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y
    dW3 = 1. / m * np.dot(dZ3, A2.T)
    db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)

    dA2 = np.dot(W3.T, dZ3)
    ### START CODE HERE ### (≈ 2 lines of code)
    dA2 = dA2 * D2             # Step 1: Apply mask D2 to shut down the same neurons as during the forward propagation
    dA2 = dA2 / keep_prob      # Step 2: Scale the value of neurons that haven't been shut down
    ### END CODE HERE ###

    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = 1. / m * np.dot(dZ2, A1.T)
    db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    ### START CODE HERE ### (≈ 2 lines of code)
    dA1 = dA1 * D1             # Step 1: Apply mask D1 to shut down the same neurons as during the forward propagation
    dA1 = dA1 / keep_prob      # Step 2: Scale the value of neurons that haven't been shut down
    ### END CODE HERE ###

    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1. / m * np.dot(dZ1, X.T)
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients


X_assess, Y_assess, cache = backward_propagation_with_dropout_test_case()
gradients = backward_propagation_with_dropout(X_assess, Y_assess, cache, keep_prob=0.8)
print ("dA1 = " + str(gradients["dA1"]))
print ("dA2 = " + str(gradients["dA2"]))


parameters = model(train_X, train_Y, keep_prob=0.86, learning_rate=0.3)
print("On the train set:")
predictions_train = predict(train_X, train_Y, parameters)
print("On the test set:")
predictions_test = predict(test_X, test_Y, parameters)


plt.title("Model with dropout")
axes = plt.gca()
axes.set_xlim([-0.75, 0.40])
axes.set_ylim([-0.75, 0.65])
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

Prediction accuracy: 0.95

Gradient Checking
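Both checks below use the same two formulas, numbered (1) and (2) in the code comments: a centered-difference approximation of the gradient, and a relative difference for comparing it with the gradient from back-propagation:

\text{gradapprox} = \frac{J(\theta + \varepsilon) - J(\theta - \varepsilon)}{2\varepsilon} \tag{1}

\text{difference} = \frac{\|\text{grad} - \text{gradapprox}\|_2}{\|\text{grad}\|_2 + \|\text{gradapprox}\|_2} \tag{2}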

One-dimensional gradient check:

import numpy as np
from testCases_v3 import gradient_check_n_test_case
from gc_utils import sigmoid, relu, dictionary_to_vector, vector_to_dictionary, gradients_to_vector

# One-dimensional gradient check
# GRADED FUNCTION: forward_propagation
def forward_propagation(x, theta):
    """
    Implement the linear forward propagation (compute J) presented in Figure 1 (J(theta) = theta * x)

    Arguments:
    x -- a real-valued input
    theta -- our parameter, a real number as well

    Returns:
    J -- the value of function J, computed using the formula J(theta) = theta * x
    """

    ### START CODE HERE ### (approx. 1 line)
    J = np.dot(theta, x)
    ### END CODE HERE ###
    return J

x, theta = 2, 4
J = forward_propagation(x, theta)
print ("J = " + str(J))


# GRADED FUNCTION: backward_propagation
def backward_propagation(x, theta):
    """
    Computes the derivative of J with respect to theta (see Figure 1).

    Arguments:
    x -- a real-valued input
    theta -- our parameter, a real number as well

    Returns:
    dtheta -- the gradient of the cost with respect to theta
    """
    ### START CODE HERE ### (approx. 1 line)
    dtheta = x
    ### END CODE HERE ###
    return dtheta

x, theta = 2, 4
dtheta = backward_propagation(x, theta)
print ("dtheta = " + str(dtheta))


# GRADED FUNCTION: gradient_check
def gradient_check(x, theta, epsilon=1e-7):
    """
    Implement the gradient check presented in Figure 1.

    Arguments:
    x -- a real-valued input
    theta -- our parameter, a real number as well
    epsilon -- tiny shift to the input to compute approximated gradient with formula(1)

    Returns:
    difference -- difference (2) between the approximated gradient and the backward propagation gradient
    """

    # Compute gradapprox using left side of formula (1). epsilon is small enough, you don't need to worry about the limit.
    ### START CODE HERE ### (approx. 5 lines)
    theta1 = theta + epsilon                          # Step 1
    theta2 = theta - epsilon                          # Step 2
    J1 = forward_propagation(x, theta1)               # Step 3
    J2 = forward_propagation(x, theta2)               # Step 4
    gradapprox = (J1 - J2) / (2 * epsilon)            # Step 5
    ### END CODE HERE ###

    # Check if gradapprox is close enough to the output of backward_propagation()
    ### START CODE HERE ### (approx. 1 line)
    grad = backward_propagation(x, theta)
    ### END CODE HERE ###

    ### START CODE HERE ### (approx. 1 line)
    numerator = np.linalg.norm(grad - gradapprox)                      # Step 1'
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)    # Step 2'
    difference = numerator / denominator                               # Step 3'
    ### END CODE HERE ###

    if difference < 1e-7:
        print("The gradient is correct!")
    else:
        print("The gradient is wrong!")

    return difference

x, theta = 2, 4
difference = gradient_check(x, theta)
print("difference = " + str(difference))

Output:

The gradient is correct!
difference = 2.919335883291695e-10

N-dimensional gradient check:

# N-dimensional gradient check
def forward_propagation_n(X, Y, parameters):
    """
    Implements the forward propagation (and computes the cost) presented in Figure 3.

    Arguments:
    X -- training set for m examples
    Y -- labels for m examples
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
                    W1 -- weight matrix of shape (5, 4)
                    b1 -- bias vector of shape (5, 1)
                    W2 -- weight matrix of shape (3, 5)
                    b2 -- bias vector of shape (3, 1)
                    W3 -- weight matrix of shape (1, 3)
                    b3 -- bias vector of shape (1, 1)

    Returns:
    cost -- the cost function (logistic cost for one example)
    """

    # retrieve parameters
    m = X.shape[1]
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)
    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    # Cost
    logprobs = np.multiply(-np.log(A3), Y) + np.multiply(-np.log(1 - A3), 1 - Y)
    cost = 1. / m * np.sum(logprobs)

    cache = (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)

    return cost, cache


def backward_propagation_n(X, Y, cache):
    """
    Implement the backward propagation presented in figure 2.

    Arguments:
    X -- input datapoint, of shape (input size, 1)
    Y -- true "label"
    cache -- cache output from forward_propagation_n()

    Returns:
    gradients -- A dictionary with the gradients of the cost with respect to each parameter, activation and pre-activation variables.
    """

    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y
    dW3 = 1. / m * np.dot(dZ3, A2.T)
    db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)

    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = 1. / m * np.dot(dZ2, A1.T) * 2  # Should not multiply by 2 (deliberate error for the check to catch)
    db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1. / m * np.dot(dZ1, X.T)
    db1 = 4. / m * np.sum(dZ1, axis=1, keepdims=True) # Should not multiply by 4 (deliberate error for the check to catch)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3,
                 "dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2,
                 "dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients


# GRADED FUNCTION: gradient_check_n
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Checks if backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n

    Arguments:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
    gradients -- output of backward_propagation_n, contains gradients of the cost with respect to the parameters.
    X -- input datapoint, of shape (input size, 1)
    Y -- true "label"
    epsilon -- tiny shift to the input to compute approximated gradient with formula(1)

    Returns:
    difference -- difference (2) between the approximated gradient and the backward propagation gradient
    """

    # Set-up variables
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # Compute gradapprox
    for i in range(num_parameters):

        # Compute J_plus[i]. Inputs: "parameters_values, epsilon". Output = "J_plus[i]".
        # "_" is used because forward_propagation_n returns two values but we only care about the first one
        ### START CODE HERE ### (approx. 3 lines)
        theta1 = np.copy(parameters_values)                                        # Step 1
        theta1[i][0] += epsilon                                                    # Step 2
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(theta1))   # Step 3
        ### END CODE HERE ###

        # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output = "J_minus[i]".
        ### START CODE HERE ### (approx. 3 lines)
        theta2 = np.copy(parameters_values)                                        # Step 1
        theta2[i][0] -= epsilon                                                    # Step 2
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(theta2))  # Step 3
        ### END CODE HERE ###

        # Compute gradapprox[i]
        ### START CODE HERE ### (approx. 1 line)
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)
        ### END CODE HERE ###

    # Compare gradapprox to backward propagation gradients by computing difference.
    ### START CODE HERE ### (approx. 1 line)
    numerator = np.linalg.norm(grad - gradapprox)                      # Step 1'
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)    # Step 2'
    difference = numerator / denominator                               # Step 3'
    ### END CODE HERE ###

    if difference > 1e-7:
        print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

    return difference


X, Y, parameters = gradient_check_n_test_case()

cost, cache = forward_propagation_n(X, Y, parameters)
gradients = backward_propagation_n(X, Y, cache)
difference = gradient_check_n(parameters, gradients, X, Y)

Output:

There is a mistake in the backward propagation! difference = 0.2850931566540251
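The difference is large because backward_propagation_n above contains the two deliberate errors flagged in its comments. Restoring the correct coefficients (sketched below; the variable names are those used inside backward_propagation_n) makes the analytic and numerical gradients agree, and the difference drops to the order of 1e-7:

# Corrected lines inside backward_propagation_n
dW2 = 1. / m * np.dot(dZ2, A1.T)                   # drop the erroneous "* 2"
db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)  # coefficient is 1/m, not 4/m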

Original post: https://www.cnblogs.com/cxq1126/p/13093231.html