SML_Assignment4

Task 1: Neural Networks

1a) Multi-layer Perceptron

We use mean cross entropy loss, because it is good for multi-category problems;
We use ReLu as activation function due to excellent adaptability to multi-class classification. We have also tested sigmoid, but there are problems with vanishing gradients.
The input layer has 784 neurons, because the count of pixel is (28 imes 28);
The output layer has 10 neurons, because we need to classify 10 categories.
The selection strategy of learning rate is constantly changing in the process of network training. At the beginning, the parameters are relatively random, so we should choose a relatively large learning rate, so that loss will fall faster. When we train for a period of time, the parameter update should have a smaller range of change, and then we choose a smaller learning rate

Full code is below:

import numpy as np
from matplotlib import pyplot as plt


def initialize(hidden_dim, output_dim):
    # retrieve mnist data
    X_train = np.loadtxt('./dataSets/mnist_small_train_in.txt', delimiter=',')
    X_test = np.loadtxt('./dataSets/mnist_small_test_in.txt', delimiter=',')
    y_train = np.loadtxt('./dataSets/mnist_small_train_out.txt', delimiter=',', dtype="int32")
    y_test = np.loadtxt('./dataSets/mnist_small_test_out.txt', delimiter=',', dtype="int32")

    # normalize data
    X_train = (X_train - np.mean(X_train)) / np.std(X_train)
    X_test = (X_test - np.mean(X_test)) / np.std(X_test)

    # one hot encode labels
    temp_y_train = np.eye(10)[y_train]
    temp_y_test = np.eye(10)[y_test]
    y_train = temp_y_train.reshape(list(y_train.shape) + [10])
    y_test = temp_y_test.reshape(list(y_test.shape) + [10])

    # weights for single hidden layer
    W1 = np.random.randn(hidden_dim, X_train.shape[1]) * 0.01
    b1 = np.zeros((hidden_dim,))
    W2 = np.random.randn(output_dim, hidden_dim) * 0.01
    b2 = np.zeros((output_dim,))

    parameters = [W1, b1, W2, b2]

    # return to column vectors
    return parameters, X_train.T, X_test.T, y_train.T, y_test.T


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def dsigmoid(x):
    # input x is already sigmoid, no need to recompute
    return x * (1.0 - x)


def ReLU(x):
    return x * (x > 0)


def dReLU(x):
    return 1. * (x > 0)

# Cross entropy loss
def closs(pred, y):
    return np.squeeze(-np.sum(np.multiply(np.log(pred), y)) / len(y))

def dcloss(pred, y):
    return pred - y

# Mean squared error loss
def loss(pred, y):
    return np.sum(.5 * np.sum((pred - y) ** 2, axis=0), axis=0) / y.shape[1]

def dloss(pred, y):
    return (pred - y) / y.shape[1]


class NeuralNet(object):

    def __init__(self, hidden_dim, output_dim):
        # size of layers
        self.hidden_dim_1 = hidden_dim
        self.output_dim = output_dim

        # weights and data
        parameters, self.x_train, self.x_test, self.y_train, self.y_test = initialize(hidden_dim, output_dim)
        self.W1, self.b1, self.W2, self.b2 = parameters

        # activations
        batch_size = self.x_train.shape[0]

        self.ai = np.ones((self.x_train.shape[1], batch_size))
        self.ah1 = np.ones((self.hidden_dim_1, batch_size))
        self.ao = np.ones((self.output_dim, batch_size))

        # classification output for transformed OHE
        self.classification = np.ones(self.ao.shape)

        # container for loss progress
        self.loss = None
        self.test_error = None

    def forward_pass(self, x):
        # input activations
        self.ai = x

        # hidden_1 activations
        self.ah1 = sigmoid(self.W1 @ self.ai + self.b1[:, np.newaxis])

        # output activations
        self.ao = sigmoid(self.W2 @ self.ah1 + self.b2[:, np.newaxis])

        # transform to OHE for classification
        self.classification = (self.ao == self.ao.max(axis=0, keepdims=0)).astype(float)

    def backward_pass(self, target):
        # calculate error for output
        out_error = dloss(self.ao, target)
        out_delta = out_error * dsigmoid(self.ao)

        # calculate error for hidden_1
        hidden_1_error = self.W2.T @ out_delta
        hidden_1_delta = hidden_1_error * dReLU(self.ah1)

        # derivative for W2/b2 (hidden_1 --> out)
        w2_deriv = out_delta @ self.ah1.T
        b2_deriv = np.sum(out_delta, axis=1)

        # derivative for W1/b1 (input --> hidden_1)
        w1_deriv = hidden_1_delta @ self.ai.T
        b1_deriv = np.sum(hidden_1_delta, axis=1)

        return [w1_deriv, b1_deriv, w2_deriv, b2_deriv]

    def train(self, epochs, lr, batch_size=128):

        self.loss = np.zeros((epochs,))
        self.test_error = np.zeros((epochs,))

        for epoch in range(epochs):

            indices = np.arange(self.x_train.shape[1])
            np.random.shuffle(indices)

            for i in range(0, self.x_train.shape[1] - batch_size + 1, batch_size):
                excerpt = indices[i:i + batch_size]

                batch_x, batch_y = self.x_train[:, excerpt], self.y_train[:, excerpt]

                # compute output of forward pass
                self.forward_pass(batch_x)

                # back prop error
                w1_deriv, b1_deriv, w2_deriv, b2_deriv = self.backward_pass(batch_y)

                # adjust weights with simple SGD
                self.W1 -= lr * w1_deriv
                self.b1 -= lr * b1_deriv
                self.W2 -= lr * w2_deriv
                self.b2 -= lr * b2_deriv

            # compute error
            self.forward_pass(self.x_test)
            error = loss(self.ao, self.y_test)
            self.loss[epoch] = error

            t = 0
            for i, pred in enumerate(self.classification.T):
                if np.argmax(self.y_test.T[i]) == np.argmax(pred):
                    t += 1

            acc = t / self.classification.shape[1]
            self.test_error[epoch] = 1 - acc

            # print progress
            if (epoch + 1) % 50 == 0:
                print("Epoch {}/{} -> loss: {:.5f} -> val_acc: {:.5f}".format(epoch + 1, epochs, error, acc))

    def predict(self, x):
        self.forward_pass(x)
        return self.classification

    def visualize_prediction(self, expected, predicted):
        # plot loss progress
        plt.figure(2)
        plt.plot(self.loss)
        plt.xlabel("Epochs")
        plt.ylabel("Loss")
        plt.show()

        # plot loss progress
        plt.figure(3)
        plt.plot(self.test_error)
        plt.xlabel("Epochs")
        plt.ylabel("Test Error")
        plt.show()


nn = NeuralNet(hidden_dim=784, output_dim=10)
nn.train(epochs=500, lr=.05)

y_hat = nn.predict(nn.x_test)
t = 0
for i, pred in enumerate(y_hat.T):
    if np.argmax(nn.y_test.T[i]) == np.argmax(pred):
        t += 1

print("Accuracy:", t / y_hat.shape[1])
nn.visualize_prediction(nn.y_test.T, y_hat.T)

Show how the misclassification error (in percent) on the testing set evolves during learning:

Task 3: Gaussian Processes

3a) GP Regression

Full code is below:

import numpy as np
from matplotlib import pyplot as plt

def target(x):
    s = np.sin(x)
    return s + np.square(s)


def kernel(x, z):
    return np.exp(-(x - z) ** 2)


def compute_c(x, noise):
    return kernel(x, x) + noise


def predict(x, X, Y, C_inv, noise):
    k = np.array([kernel(x, val) for val in X])
    mean = k.T.dot(C_inv).dot(Y)
    std = np.sqrt(compute_c(x, noise) - k.T.dot(C_inv).dot(k))
    return mean, std


def plot_gp(X, mean, std, iteration):
    y1 = mean - 2 * std
    y2 = mean + 2 * std

    plt.plot(X, target(X), "-", color="red", label="$sin(x) + sin^2(x)$")
    plt.plot(X, mean, color="black", label="$mu$")
    plt.fill_between(X, y1, y2, facecolor='blue', interpolate=True, alpha=.5, label="$2sigma$")
    plt.title("$mu$ and $sigma$ for iteration={}".format(iteration))
    plt.xlabel("x")
    plt.ylabel("y")


def gpr():
    noise = 0.001
    step_size = 0.005
    X = np.arange(0, 2 * np.pi + step_size, step_size)
    iterations = 15

    std = np.array([np.sqrt(compute_c(x, noise)) for x in X])
    j = np.argmax(std)

    Xn = np.array([X[j]])
    Yn = np.array([target(Xn)])

    C = np.array(compute_c(X[j], noise)).reshape((1, 1))
    C_inv = np.linalg.solve(C, np.identity(C.shape[0]))

    for iteration in range(0, iterations):
        mean = np.zeros(X.shape[0])
        std = np.zeros(X.shape[0])

        for i, x in enumerate(X):
            mean[i], std[i] = predict(x, Xn, Yn, C_inv, noise)

        if iteration + 1 in [1, 2, 5, 10, 15]:
            plot_gp(X, mean, std, iteration + 1)
            plt.plot(Xn, Yn, "o", c="blue", label="sampled")
            plt.legend()
            plt.show()

        j = np.argmax(std)
        Xn = np.append(Xn, X[j])
        Yn = np.append(Yn, target(Xn[-1]))

        # update C matrix
        x_new = Xn[-1]
        k = np.array([kernel(x_new, val) for val in Xn])
        c = kernel(x_new, x_new) + noise

        dim = C.shape[0]
        C_new = np.empty((dim + 1, dim + 1))
        C_new[:-1, :-1] = C
        C_new[-1, -1] = c
        C_new[:, -1] = k
        C_new[-1:] = k.T

        C = C_new
        C_inv = np.linalg.solve(C, np.identity(C.shape[0]))

gpr()

作者：Rest探路者
出处：http://www.cnblogs.com/Java-Starter/
本文版权归作者和博客园共有，欢迎转载，但未经作者同意请保留此段声明，请在文章页面明显位置给出原文连接
Github：https://github.com/cjy513203427