Notes: CS231n Assignment 1

The CS231n homework assignments are excellent; here I record some notes on my solutions to assignment 1.

1. The first part is the kNN code. The trick here is the three ways of computing the distance matrix; the core idea is vectorization, which is extremely useful in Python and machine learning and speeds the computation up dramatically. (A small usage and timing sketch follows the class listing below.)

import numpy as np

class KNearestNeighbor(object):  # first, define a class that implements kNN
  """ a kNN classifier with L2 distance """

  def __init__(self):
    pass

  def train(self, X, y):
    """
    Train the classifier. For k-nearest neighbors this is just 
    memorizing the training data.

    Inputs:
    - X: A numpy array of shape (num_train, D) containing the training data
      consisting of num_train samples each of dimension D.
    - y: A numpy array of shape (N,) containing the training labels, where
         y[i] is the label for X[i].
    """
    self.X_train = X
    self.y_train = y
    
  def predict(self, X, k=1, num_loops=0):
    """
    Predict labels for test data using this classifier.

    Inputs:
    - X: A numpy array of shape (num_test, D) containing test data consisting
         of num_test samples each of dimension D.
    - k: The number of nearest neighbors that vote for the predicted labels.
    - num_loops: Determines which implementation to use to compute distances
      between training points and testing points.

    Returns:
    - y: A numpy array of shape (num_test,) containing predicted labels for the
      test data, where y[i] is the predicted label for the test point X[i].  
    """
    if num_loops == 0:
      dists = self.compute_distances_no_loops(X)
    elif num_loops == 1:
      dists = self.compute_distances_one_loop(X)
    elif num_loops == 2:
      dists = self.compute_distances_two_loops(X)
    else:
      raise ValueError('Invalid value %d for num_loops' % num_loops)

    return self.predict_labels(dists, k=k)

  def compute_distances_two_loops(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using a nested loop over both the training data and the 
    test data.

    Inputs:
    - X: A numpy array of shape (num_test, D) containing test data.

    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point.
    """
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in range(num_test):
      for j in range(num_train):
        dists[i][j] = np.sqrt(np.sum(np.square(self.X_train[j,:] - X[i,:])))
        #####################################################################
        # TODO:                                                             #
        # Compute the l2 distance between the ith test point and the jth    #
        # training point, and store the result in dists[i, j]. You should   #
        # not use a loop over dimension.                                    #
        #####################################################################
        #####################################################################
        #                       END OF YOUR CODE                            #
        #####################################################################
    return dists

  def compute_distances_one_loop(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using a single loop over the test data.

    Input / Output: Same as compute_distances_two_loops
    """
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in range(num_test):
      #######################################################################
      # TODO:                                                               #
      # Compute the l2 distance between the ith test point and all training #
      # points, and store the result in dists[i, :].                        #
      #######################################################################
      dists[i,:] = np.sqrt(np.sum(np.square(self.X_train-X[i,:]),axis = 1))  
      #######################################################################
      #                         END OF YOUR CODE                            #
      #######################################################################
    return dists

  def compute_distances_no_loops(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using no explicit loops.

    Input / Output: Same as compute_distances_two_loops
    """
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train)) 
    #########################################################################
    # TODO:                                                                 #
    # Compute the l2 distance between all test points and all training      #
    # points without using any explicit loops, and store the result in      #
    # dists.                                                                #
    #                                                                       #
    # You should implement this function using only basic array operations; #
    # in particular you should not use functions from scipy.                #
    #                                                                       #
    # HINT: Try to formulate the l2 distance using matrix multiplication    #
    #       and two broadcast sums.                                         #
    #########################################################################
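    # Using ||x - y||^2 = ||x||^2 - 2 x·y + ||y||^2, the whole squared-distance
    # matrix is one matrix product (the cross term) plus two broadcast sums.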
    dists = np.multiply(np.dot(X,self.X_train.T),-2)  
    sq1 = np.sum(np.square(X),axis=1,keepdims = True)  
    sq2 = np.sum(np.square(self.X_train),axis=1)  
    dists = np.add(dists,sq1)  
    dists = np.add(dists,sq2)  
    dists = np.sqrt(dists) 
    #########################################################################
    #                         END OF YOUR CODE                              #
    #########################################################################
    return dists

  def predict_labels(self, dists, k=1):
    """
    Given a matrix of distances between test points and training points,
    predict a label for each test point.

    Inputs:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      gives the distance betwen the ith test point and the jth training point.

    Returns:
    - y: A numpy array of shape (num_test,) containing predicted labels for the
      test data, where y[i] is the predicted label for the test point X[i].  
    """
    num_test = dists.shape[0]
    y_pred = np.zeros(num_test)
    for i in range(num_test):
      # A list of length k storing the labels of the k nearest neighbors to
      # the ith test point.
      closest_y = []  
      #########################################################################  
      # TODO:                                                                 #  
      # Use the distance matrix to find the k nearest neighbors of the ith    #  
      # test point, and use self.y_train to find the labels of these          #  
      # neighbors. Store these labels in closest_y.                           #  
      # Hint: Look up the function numpy.argsort.                             #  
      #########################################################################  
      closest_y = self.y_train[np.argsort(dists[i,:])[:k]]  
   
      #########################################################################  
      # TODO:                                                                 #  
      # Now that you have found the labels of the k nearest neighbors, you    #  
      # need to find the most common label in the list closest_y of labels.   #  
      # Store this label in y_pred[i]. Break ties by choosing the smaller     #  
      # label.                                                                #  
      #########################################################################  
      y_pred[i] = np.argmax(np.bincount(closest_y))  # most common label; ties go to the smaller label since argmax returns the first maximum
      #########################################################################
      #                           END OF YOUR CODE                            #
      #########################################################################

    return y_pred
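
As a quick sanity check, here is a minimal usage sketch (my own addition, not part of the assignment scaffold). It builds the classifier on small random data, confirms that the three distance implementations agree, and times them to show the benefit of vectorization; the shapes and variable names are made up for illustration.

import time
import numpy as np

# Hypothetical toy data; in the real assignment the CIFAR-10 pixels go here.
X_train = np.random.randn(500, 3072)
y_train = np.random.randint(0, 10, 500)
X_test = np.random.randn(50, 3072)

knn = KNearestNeighbor()
knn.train(X_train, y_train)

# All three implementations should produce (numerically) identical distance matrices.
d2 = knn.compute_distances_two_loops(X_test)
d1 = knn.compute_distances_one_loop(X_test)
d0 = knn.compute_distances_no_loops(X_test)
print(np.linalg.norm(d2 - d0), np.linalg.norm(d1 - d0))  # both should be ~0

# Rough timing: the fully vectorized version is usually orders of magnitude faster.
for f in (knn.compute_distances_two_loops,
          knn.compute_distances_one_loop,
          knn.compute_distances_no_loops):
    t0 = time.time()
    f(X_test)
    print(f.__name__, time.time() - t0)

y_pred = knn.predict(X_test, k=5, num_loops=0)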

2. Softmax

As before, we need to implement both a naive version and a vectorized version so that their speeds can be compared. (A small consistency-check sketch follows the code below.)

import numpy as np

def softmax_loss_naive(W, X, y, reg):    

    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)    # a zero matrix with the same shape as W
    dW_each = np.zeros_like(W)
    num_train, dim = X.shape
    num_class = W.shape[1]
    f = X.dot(W)    # N by C
    # Considering the Numeric Stability
    f_max = np.reshape(np.max(f, axis=1), (num_train, 1))   # subtract the row-wise max so the exponentials below stay numerically stable
    prob = np.exp(f - f_max) / np.sum(np.exp(f - f_max), axis=1, keepdims=True) # N by C
    y_trueClass = np.zeros_like(prob)
    y_trueClass[np.arange(num_train), y] = 1.0
    for i in range(num_train):
        for j in range(num_class):    
            loss += -(y_trueClass[i, j] * np.log(prob[i, j]))    # L = -(1/N) Σ_i Σ_j 1(j = y_i) log(exp(f_j) / Σ_k exp(f_k)) + 0.5 λ Σ W²
            dW_each[:, j] = -(y_trueClass[i, j] - prob[i, j]) * X[i, :]  # ∇_{W_j} L = -(1/N) Σ_i x_i^T (1(j = y_i) - p_{i,j}) + λ W_j, where p_{i,j} = exp(f_j) / Σ_k exp(f_k)
        dW += dW_each                  # accumulate the per-example gradients over all classes
    loss /= num_train
    loss += 0.5 * reg * np.sum(W * W)  # add the regularization term
    dW /= num_train
    dW += reg * W

    return loss, dW

def softmax_loss_vectorized(W, X, y, reg):    
    """    
    Softmax loss function, vectorized version.    

    Inputs and outputs are the same as softmax_loss_naive.    
    """    
    # Initialize the loss and gradient to zero.    
    loss = 0.0    
    dW = np.zeros_like(W)    # D by C    
    num_train, dim = X.shape

    f = X.dot(W)    # N by C
    # Considering the Numeric Stability
    f_max = np.reshape(np.max(f, axis=1), (num_train, 1))   # N by 1
    prob = np.exp(f - f_max) / np.sum(np.exp(f - f_max), axis=1, keepdims=True)    # N by C
    y_trueClass = np.zeros_like(prob)
    y_trueClass[range(num_train), y] = 1.0    # N by C
    loss += -np.sum(y_trueClass * np.log(prob)) / num_train + 0.5 * reg * np.sum(W * W)  # fully vectorized: no explicit loops needed
    dW += -np.dot(X.T, y_trueClass - prob) / num_train + reg * W

    return loss, dW
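
To convince myself the two implementations agree, a small comparison like the following can be run. This is a sketch of my own with made-up shapes; softmax_loss_naive and softmax_loss_vectorized are the functions defined above.

import numpy as np

# Hypothetical small problem: D = 100 features, C = 10 classes, N = 50 examples.
np.random.seed(0)
W = 0.001 * np.random.randn(100, 10)
X = np.random.randn(50, 100)
y = np.random.randint(0, 10, 50)

loss_naive, dW_naive = softmax_loss_naive(W, X, y, reg=1e-3)
loss_vec, dW_vec = softmax_loss_vectorized(W, X, y, reg=1e-3)

# Loss and gradient should agree up to floating-point error.
print('loss difference:', abs(loss_naive - loss_vec))
print('gradient difference:', np.linalg.norm(dW_naive - dW_vec))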

3. SVM

import numpy as np
def svm_loss_naive(W, X, y, reg):
    """
    Inputs:
    - W: A numpy array of shape (D, C) containing weights.
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c means 
         that X[i] has label c, where 0 <= c < C.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss as single float
    - gradient with respect to weights W; an array of same shape as W
    """
    dW = np.zeros(W.shape)   # initialize the gradient as zero
    # compute the loss and the gradient
    num_classes = W.shape[1]
    num_train = X.shape[0]
    loss = 0.0
    for i in range(num_train):    
        scores = X[i].dot(W)    
        correct_class_score = scores[y[i]]
        for j in range(num_classes):
            if j == y[i]:    # by the formula, the correct class is skipped
                continue
            margin = scores[j] - correct_class_score + 1   # note delta = 1
            if margin > 0:
                loss += margin
                dW[:, y[i]] += -X[i, :]     # ∇_{W_{y_i}} L_i = -x_i^T · Σ_{j≠y_i} 1(x_i W_j - x_i W_{y_i} + 1 > 0)
                dW[:, j] += X[i, :]         # ∇_{W_j} L_i = x_i^T · 1(x_i W_j - x_i W_{y_i} + 1 > 0),  j ≠ y_i
    # Right now the loss is a sum over all training examples, but we want it
    # to be an average instead so we divide by num_train.
    loss /= num_train
    dW /= num_train
    # Add regularization to the loss.
    loss += 0.5 * reg * np.sum(W * W)
    dW += reg * W
    return loss, dW

def svm_loss_vectorized(W, X, y, reg):
    """
    Structured SVM loss function, vectorized implementation.Inputs and outputs 
    are the same as svm_loss_naive.
    """
    loss = 0.0
    dW = np.zeros(W.shape)   # initialize the gradient as zero
    scores = X.dot(W)        # N by C
    num_train = X.shape[0]
    num_classes = W.shape[1]
    scores_correct = scores[np.arange(num_train), y]   # shape (N,)
    scores_correct = np.reshape(scores_correct, (num_train, 1))  # N by 1
    margins = scores - scores_correct + 1.0     # N by C
    margins[np.arange(num_train), y] = 0.0
    margins[margins <= 0] = 0.0
    loss += np.sum(margins) / num_train
    loss += 0.5 * reg * np.sum(W * W)
    # compute the gradient
    margins[margins > 0] = 1.0
    row_sum = np.sum(margins, axis=1)                  # shape (N,)
    margins[np.arange(num_train), y] = -row_sum        
    dW += np.dot(X.T, margins)/num_train + reg * W     # D by C

    return loss, dW
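
The SVM gradient is easy to get wrong, so a crude numerical check is useful. The assignment notebooks use their own gradient-check utility; the standalone sketch below (my own, with tiny made-up shapes so the slow numeric part stays fast) does the same job for svm_loss_vectorized.

import numpy as np

def numeric_gradient(f, W, h=1e-5):
    """Centered-difference estimate of df/dW at W (slow, for checking only)."""
    grad = np.zeros_like(W)
    it = np.nditer(W, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        old = W[idx]
        W[idx] = old + h
        fp = f(W)
        W[idx] = old - h
        fm = f(W)
        W[idx] = old
        grad[idx] = (fp - fm) / (2 * h)
        it.iternext()
    return grad

np.random.seed(1)
W = 0.001 * np.random.randn(20, 5)   # D = 20, C = 5
X = np.random.randn(10, 20)          # N = 10
y = np.random.randint(0, 5, 10)

loss, dW = svm_loss_vectorized(W, X, y, reg=0.0)
dW_num = numeric_gradient(lambda w: svm_loss_vectorized(w, X, y, reg=0.0)[0], W)
print('max relative error:',
      np.max(np.abs(dW - dW_num) / (np.abs(dW) + np.abs(dW_num) + 1e-8)))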

4. linear_classifier

  From a programming standpoint, the loss functions above are different strategies, or more precisely, several variants of a linear classifier, so we use a single LinearClassifier class to call them. (A small training sketch follows the code below.)

from linear_svm import *
from softmax import *

class LinearClassifier(object):    

    def __init__(self):        
        self.W = None    

    def train(self, X, y, learning_rate=1e-3, reg=1e-5, num_iters=100, 
                          batch_size=200, verbose=True):  # note the default hyper-parameters used here
        """        
        Train this linear classifier using stochastic gradient descent.   

        Inputs:       
        - X: A numpy array of shape (N, D) containing training data; there are N          
             training samples each of dimension D.        
        - y: A numpy array of shape (N,) containing training labels; y[i] = c          
             means that X[i] has label 0 <= c < C for C classes.        
        - learning_rate: (float) learning rate for optimization.        
        - reg: (float) regularization strength.        
        - num_iters: (integer) number of steps to take when optimizing
        - batch_size: (integer) number of training examples to use at each step.        
        - verbose: (boolean) If true, print progress during optimization.

        Outputs:         
        A list containing the value of the loss function at each training iteration.
        """
        num_train, dim = X.shape
        # assume y takes values 0...K-1 where K is number of classes
        num_classes = np.max(y) + 1  
        if self.W is None:
            # lazily initialize W
            self.W = 0.001 * np.random.randn(dim, num_classes)   # initialize W with small random values

        # Run stochastic gradient descent(Mini-Batch) to optimize W
        loss_history = []
        for it in range(num_iters):  # take a random mini-batch for each gradient-descent step
            X_batch = None
            y_batch = None
            # Sampling with replacement is faster than sampling without replacement.
            sample_index = np.random.choice(num_train, batch_size, replace=True)
            X_batch = X[sample_index, :]   # batch_size by D
            y_batch = y[sample_index]      # (batch_size,)
            # evaluate loss and gradient
            loss, grad = self.loss(X_batch, y_batch, reg)
            loss_history.append(loss)

            # perform parameter update
            self.W += -learning_rate * grad
            if verbose and it % 100 == 0:
                print('Iteration %d / %d: loss %f' % (it, num_iters, loss))

        return loss_history

    def predict(self, X):    
        """    
        Use the trained weights of this linear classifier to predict labels for   
        data points.    

        Inputs:    
        - X: A numpy array of shape (N, D) containing data; each row is a D-dimensional point.    

        Returns:    
        - y_pred: Predicted labels for the data in X. y_pred is a 1-dimensional     
                  array of length N, and each element is an integer giving the 
                  predicted class.  
        """
        y_pred = np.zeros(X.shape[0])    # (N,)
        y_pred = np.argmax(X.dot(self.W), axis=1)  # predict by taking the class with the highest score

        return y_pred

    def loss(self, X_batch, y_batch, reg):   
        """    
        Compute the loss function and its derivative.    
        Subclasses will override this.    

        Inputs:    
        - X_batch: A numpy array of shape (N, D) containing a minibatch of N 
                   data points; each point has dimension D.    
        - y_batch: A numpy array of shape (N,) containing labels for the minibatch.
        - reg: (float) regularization strength.   

        Returns: A tuple containing:    
        - loss as a single float    
        - gradient with respect to self.W; an array of the same shape as W   
        """    
        pass

class LinearSVM(LinearClassifier):   
    """ 
    A subclass that uses the Multiclass SVM loss function 
    """    
    def loss(self, X_batch, y_batch, reg):        
        return svm_loss_vectorized(self.W, X_batch, y_batch, reg)

class Softmax(LinearClassifier):   
    """ 
    A subclass that uses the Softmax + Cross-entropy loss function 
    """    
    def loss(self, X_batch, y_batch, reg):        
        return softmax_loss_vectorized(self.W, X_batch, y_batch, reg)
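
Putting the pieces together, training looks roughly like the sketch below. The data here is random and the learning rate / regularization are placeholder values of my own; in the assignment the CIFAR-10 feature matrices go in their place.

import numpy as np

# Hypothetical data: 3073 = 3072 pixels plus a bias dimension.
X_train = np.random.randn(1000, 3073)
y_train = np.random.randint(0, 10, 1000)
X_val = np.random.randn(200, 3073)
y_val = np.random.randint(0, 10, 200)

svm = LinearSVM()
loss_hist = svm.train(X_train, y_train, learning_rate=1e-7, reg=2.5e4,
                      num_iters=1500, batch_size=200, verbose=True)
print('validation accuracy:', np.mean(svm.predict(X_val) == y_val))

# The Softmax subclass is used in exactly the same way; only the loss routine differs.
softmax = Softmax()
softmax.train(X_train, y_train, learning_rate=1e-7, reg=2.5e4, num_iters=1500)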

5. A simple two-layer neural network

This is only a simple way to write a neural network; in the next assignment there will be a much more powerful network for us to build. (A minimal training sketch on toy data follows the class below.)

Backpropagation here can be understood with the help of the computational-graph diagram in the original post (the figure is not reproduced here).

import numpy as np
import matplotlib.pyplot as plt


class TwoLayerNet(object):
  """
  A two-layer fully-connected neural network. The net has an input dimension of
  N, a hidden layer dimension of H, and performs classification over C classes.
  We train the network with a softmax loss function and L2 regularization on the
  weight matrices. The network uses a ReLU nonlinearity after the first fully
  connected layer.

  In other words, the network has the following architecture:

  input - fully connected layer - ReLU - fully connected layer - softmax

  The outputs of the second fully-connected layer are the scores for each class.
  """
  def __init__(self, input_size, hidden_size, output_size, std=1e-4):
    """
    Initialize the model. Weights are initialized to small random values and
    biases are initialized to zero. Weights and biases are stored in the
    variable self.params, which is a dictionary with the following keys:

    W1: First layer weights; has shape (D, H)
    b1: First layer biases; has shape (H,)
    W2: Second layer weights; has shape (H, C)
    b2: Second layer biases; has shape (C,)

    Inputs:
    - input_size: The dimension D of the input data.
    - hidden_size: The number of neurons H in the hidden layer.
    - output_size: The number of classes C.
    """
    self.params = {}
    self.params['W1'] = std * np.random.randn(input_size, hidden_size)
    self.params['b1'] = np.zeros(hidden_size)
    self.params['W2'] = std * np.random.randn(hidden_size, output_size)
    self.params['b2'] = np.zeros(output_size)     # initialize the network parameters

  def loss(self, X, y=None, reg=0.0):
    """
    Compute the loss and gradients for a two layer fully connected neural
    network.

    Inputs:
    - X: Input data of shape (N, D). Each X[i] is a training sample.
    - y: Vector of training labels. y[i] is the label for X[i], and each y[i] is
      an integer in the range 0 <= y[i] < C. This parameter is optional; if it
      is not passed then we only return scores, and if it is passed then we
      instead return the loss and gradients.
    - reg: Regularization strength.

    Returns:
    If y is None, return a matrix scores of shape (N, C) where scores[i, c] is
    the score for class c on input X[i].

    If y is not None, instead return a tuple of:
    - loss: Loss (data loss and regularization loss) for this batch of training
      samples.
    - grads: Dictionary mapping parameter names to gradients of those parameters
      with respect to the loss function; has the same keys as self.params.
    """
    # Unpack variables from the params dictionary
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    N, D = X.shape

    # Compute the forward pass
    scores = None
    #############################################################################
    # TODO: Perform the forward pass, computing the class scores for the input. #
    # Store the result in the scores variable, which should be an array of      #
    # shape (N, C).                                                             #
    #############################################################################
    h1 = np.maximum(0, np.dot(X, W1) + b1)  # ReLU activation after the first fully-connected layer
    scores = np.dot(h1, W2) + b2
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################
    
    # If the targets are not given then jump out, we're done
    if y is None:
      return scores

    # Compute the loss
    loss = None
    #############################################################################
    # TODO: Finish the forward pass, and compute the loss. This should include  #
    # both the data loss and L2 regularization for W1 and W2. Store the result  #
    # in the variable loss, which should be a scalar. Use the Softmax           #
    # classifier loss. So that your results match ours, multiply the            #
    # regularization loss by 0.5                                                #
    #############################################################################
    scores_max = np.max(scores, axis=1, keepdims=True)  # (N,1)
    # Compute the class probabilities
    exp_scores = np.exp(scores - scores_max)  # (N,C)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)  # (N,C)
    # cross-entropy loss and L2-regularization
    correct_logprobs = -np.log(probs[range(N), y])  # (N,)
    data_loss = np.sum(correct_logprobs) / N
    reg_loss = 0.5 * reg * np.sum(W1 * W1) + 0.5 * reg * np.sum(W2 * W2)
    loss = data_loss + reg_loss     # total loss = data loss + regularization loss
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################

    # Backward pass: compute gradients
    grads = {}
    #############################################################################
    # TODO: Compute the backward pass, computing the derivatives of the weights #
    # and biases. Store the results in the grads dictionary. For example,       #
    # grads['W1'] should store the gradient on W1, and be a matrix of same size #
    #############################################################################
    dscores = probs  # (N,C) 
    dscores[range(N), y] -= 1  # gradient of the loss w.r.t. the scores (cf. the softmax gradient above)
    dscores /= N
    # Backprop into W2 and b2
    dW2 = np.dot(h1.T, dscores)  # (H, C); backprop through the second layer, the steps below are analogous
    db2 = np.sum(dscores, axis=0, keepdims=True)  # (1, C)
    # Backprop into hidden layer
    dh1 = np.dot(dscores, W2.T)  # (N,H)
    # Backprop into ReLU non-linearity
    dh1[h1 <= 0] = 0
    # Backprop into W1 and b1
    dW1 = np.dot(X.T, dh1)  # (D,H)
    db1 = np.sum(dh1, axis=0, keepdims=True)  # (1,H)
    # Add the regularization gradient contribution
    dW2 += reg * W2
    dW1 += reg * W1
    grads['W1'] = dW1
    grads['b1'] = db1
    grads['W2'] = dW2
    grads['b2'] = db2
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################

    return loss, grads

  def train(self, X, y, X_val, y_val,
            learning_rate=1e-3, learning_rate_decay=0.95,
            reg=1e-5, num_iters=100,
            batch_size=200, verbose=False):
    """
    Train this neural network using stochastic gradient descent.

    Inputs:
    - X: A numpy array of shape (N, D) giving training data.
    - y: A numpy array of shape (N,) giving training labels; y[i] = c means that
      X[i] has label c, where 0 <= c < C.
    - X_val: A numpy array of shape (N_val, D) giving validation data.
    - y_val: A numpy array of shape (N_val,) giving validation labels.
    - learning_rate: Scalar giving learning rate for optimization.
    - learning_rate_decay: Scalar giving factor used to decay the learning rate
      after each epoch.
    - reg: Scalar giving regularization strength.
    - num_iters: Number of steps to take when optimizing.
    - batch_size: Number of training examples to use per step.
    - verbose: boolean; if true print progress during optimization.
    """
    num_train = X.shape[0]
    iterations_per_epoch = max(num_train // batch_size, 1)

    # Use SGD to optimize the parameters in self.model
    loss_history = []
    train_acc_history = []
    val_acc_history = []

    for it in range(num_iters):
      X_batch = None
      y_batch = None

      #########################################################################
      # TODO: Create a random minibatch of training data and labels, storing  #
      # them in X_batch and y_batch respectively.                             #
      #########################################################################
      sample_index = np.random.choice(num_train, batch_size, replace=True)
      X_batch = X[sample_index, :]  # (batch_size,D)
      y_batch = y[sample_index]  # (batch_size,)

      #########################################################################
      #                             END OF YOUR CODE                          #
      #########################################################################

      # Compute loss and gradients using the current minibatch
      loss, grads = self.loss(X_batch, y=y_batch, reg=reg)
      loss_history.append(loss)

      #########################################################################
      # TODO: Use the gradients in the grads dictionary to update the         #
      # parameters of the network (stored in the dictionary self.params)      #
      # using stochastic gradient descent. You'll need to use the gradients   #
      # stored in the grads dictionary defined above.                         #
      #########################################################################
      # grads['b1'] and grads['b2'] come back with shape (1, H) / (1, C) because of
      # keepdims=True above; flatten them so they match self.params['b1'] / ['b2'].
      grads['b1'] = grads['b1'].reshape(-1)
      grads['b2'] = grads['b2'].reshape(-1)
      # Vanilla SGD update of all parameters.
      self.params['W1'] -= learning_rate * grads['W1']
      self.params['b1'] -= learning_rate * grads['b1']
      self.params['W2'] -= learning_rate * grads['W2']
      self.params['b2'] -= learning_rate * grads['b2']
      #########################################################################
      #                             END OF YOUR CODE                          #
      #########################################################################

      if verbose and it % 100 == 0:
        print('iteration %d / %d: loss %f' % (it, num_iters, loss))

      # Every epoch, check train and val accuracy and decay learning rate.
      if it % iterations_per_epoch == 0:
        # Check accuracy
        train_acc = (self.predict(X_batch) == y_batch).mean()
        val_acc = (self.predict(X_val) == y_val).mean()
        train_acc_history.append(train_acc)
        val_acc_history.append(val_acc)

        # Decay learning rate
        learning_rate *= learning_rate_decay

    return {
      'loss_history': loss_history,
      'train_acc_history': train_acc_history,
      'val_acc_history': val_acc_history,
    }

  def predict(self, X):
    """
    Use the trained weights of this two-layer network to predict labels for
    data points. For each data point we predict scores for each of the C
    classes, and assign each data point to the class with the highest score.

    Inputs:
    - X: A numpy array of shape (N, D) giving N D-dimensional data points to
      classify.

    Returns:
    - y_pred: A numpy array of shape (N,) giving predicted labels for each of
      the elements of X. For all i, y_pred[i] = c means that X[i] is predicted
      to have class c, where 0 <= c < C.
    """
    y_pred = None

    ###########################################################################
    # TODO: Implement this function; it should be VERY simple!                #
    ###########################################################################
    h1 = np.maximum(0,(np.dot(X, self.params['W1']) + self.params['b1']))
    scores = np.dot(h1, self.params['W2']) + self.params['b2']
    y_pred = np.argmax(scores, axis=1)
    ###########################################################################
    #                              END OF YOUR CODE                           #
    ###########################################################################

    return y_pred
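
Finally, a minimal sketch of training the two-layer net on toy data. This is my own example with made-up sizes and hyper-parameters, just to show how the class is used; the assignment notebook wraps this in a proper harness with real data.

import numpy as np

input_size, hidden_size, num_classes = 32, 50, 3
net = TwoLayerNet(input_size, hidden_size, num_classes, std=1e-1)

# Hypothetical toy dataset.
np.random.seed(0)
X = np.random.randn(200, input_size)
y = np.random.randint(0, num_classes, 200)
X_val = np.random.randn(50, input_size)
y_val = np.random.randint(0, num_classes, 50)

stats = net.train(X, y, X_val, y_val,
                  learning_rate=1e-2, reg=1e-5,
                  num_iters=500, batch_size=100, verbose=True)
print('final training loss:', stats['loss_history'][-1])
print('validation accuracy:', np.mean(net.predict(X_val) == y_val))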

Summary

I did this assignment quite a while ago, and the code is largely adapted from solutions found online; my main goal was to understand it and get it running. The biggest obstacle was that I was not yet familiar with the Python ecosystem, and NumPy in particular takes some time to learn.

Overall, assignment 1 is not very difficult; the key is turning the formulas we learned into code, and I found finishing it quite rewarding.

Original post (in Chinese): https://www.cnblogs.com/daihengchen/p/5754383.html