Multiple Linear Regression


About Multiple Linear Regression

Simple linear regression: assumes each sample has only a single feature;
Multiple linear regression: handles samples with many features.


$ \hat{y}^{(i)} = \theta_{0} + \theta_{1}X_1^{(i)} + \theta_{2}X_2^{(i)} + ... + \theta_{n}X_n^{(i)} $


Solving for the parameters

Goal: find $ \theta_{0}, \theta_{1}, \theta_{2}, ... , \theta_{n} $ such that the objective (loss) function $ \sum_{i=1}^m ( y^{(i)} - \hat{y}^{(i)} )^2 $ is as small as possible.

Collect the $ \theta $'s above into a single vector, transposed into a column vector:
$ \theta = ( \theta_{0}, \theta_{1}, \theta_{2}, ... , \theta_{n} )^T $

Introduce for $ \theta_{0} $ a dummy multiplier $ X_0^{(i)} \equiv 1 $, which gives: $ \hat{y}^{(i)} = \theta_{0}X_0^{(i)} + \theta_{1}X_1^{(i)} + \theta_{2}X_2^{(i)} + ... + \theta_{n}X_n^{(i)} $

$ X^{(i)} = ( X_0^{(i)}, X_1^{(i)}, X_2^{(i)}, ... , X_n^{(i)} ) $

$ \hat{y}^{(i)} = X^{(i)} \cdot \theta $


Extending this to all samples:

\[ \left( \begin{array}{ccccc} 1 & X_1^{(1)} & X_2^{(1)} & ... & X_n^{(1)} \\ 1 & X_1^{(2)} & X_2^{(2)} & ... & X_n^{(2)} \\ ... \\ 1 & X_1^{(m)} & X_2^{(m)} & ... & X_n^{(m)} \end{array} \right) \]

Compared with the earlier $X$, this matrix has one extra column of 1s; to distinguish it, we write it as $ X_b $.


\[ \theta = \left( \begin{array}{c} \theta_0 \\ \theta_1 \\ \theta_2 \\ ... \\ \theta_n \end{array} \right) \]

$ \theta_0 $ is called the intercept and represents an offset;
$ \theta_1 $ through $ \theta_n $ are called the coefficients; each one corresponds to one feature of the original samples and describes how much that feature contributes to the final prediction.


$ \hat{y} = X_b \cdot \theta $

So the objective function can also be written as:
$ (y - X_b \cdot \theta)^T (y - X_b \cdot \theta) $

$ \theta = ( X_b^T X_b )^{-1} X_b^T y $

This expression is called the Normal Equation solution of multiple linear regression.
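
Where does it come from? Filling in the step between the objective and the closed form (a standard least-squares derivation, assuming $ X_b^T X_b $ is invertible): take the gradient of the objective with respect to $ \theta $, set it to zero, and solve.

\[ \nabla_{\theta} \, (y - X_b \theta)^T (y - X_b \theta) = -2 X_b^T (y - X_b \theta) = 0 \]

\[ \Rightarrow X_b^T X_b \, \theta = X_b^T y \Rightarrow \theta = ( X_b^T X_b )^{-1} X_b^T y \]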

Advantage: no normalization (feature scaling) of the data is needed;
the estimated $ \theta $ is the result of direct mathematical operations on the raw data, and these operations involve no issue of units: each $ \theta $ is simply the coefficient in front of an x in the linear equation, so it has no dimensionality problem.

Disadvantage: the time complexity is high, roughly O(n^3); even with optimized algorithms it is still around O(n^2.4).
Whether there are many samples or many features, this method is slow, so other methods (such as gradient descent) are usually used instead.
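
To make the formula concrete, here is a minimal sketch on synthetic data (the data and variable names are made up for this illustration). In practice np.linalg.lstsq, or np.linalg.solve on the normal equations, is preferred over explicitly inverting $ X_b^T X_b $, for both speed and numerical stability.

import numpy as np

# synthetic data: y = 4 + 3*x1 - 2*x2 + noise  (coefficients chosen only for illustration)
np.random.seed(666)
m = 1000
X = np.random.random(size=(m, 2))
y = 4. + 3. * X[:, 0] - 2. * X[:, 1] + np.random.normal(0., 0.1, size=m)

X_b = np.hstack([np.ones((m, 1)), X])  # add the column of 1s for theta_0

# normal equation, written exactly as in the formula above
theta_inv = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)

# the numerically preferred route: least squares without an explicit inverse
theta_lstsq, *_ = np.linalg.lstsq(X_b, y, rcond=None)

print(theta_inv)    # roughly [4, 3, -2]
print(theta_lstsq)  # essentially the same result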


Packaging the algorithm as a class

import numpy as np
from .metrics import r2_score

class LinearRegression:

    def __init__(self):
        """初始化Linear Regression模型"""
        self.coef_ = None
        self.intercept_ = None
        self._theta = None
    
    def fit_normal(self, X_train, y_train):
        """根据训练数据集X_train, y_train训练Linear Regression模型"""
        assert X_train.shape[0] == y_train.shape[0], 
            "the size of X_train must be equal to the size of y_train"
    
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
    
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
    
        return self
    
    def fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """根据训练数据集X_train, y_train, 使用梯度下降法训练Linear Regression模型"""
        assert X_train.shape[0] == y_train.shape[0], 
            "the size of X_train must be equal to the size of y_train"
    
        def J(theta, X_b, y):
            try:
                return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
            except:
                return float('inf')
    
        def dJ(theta, X_b, y):
            # res = np.empty(len(theta))
            # res[0] = np.sum(X_b.dot(theta) - y)
            # for i in range(1, len(theta)):
            #     res[i] = (X_b.dot(theta) - y).dot(X_b[:, i])
            # return res * 2 / len(X_b)
            return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(X_b)
    
        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    
            theta = initial_theta
            cur_iter = 0
    
            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
                    break
    
                cur_iter += 1
    
            return theta
    
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
    
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
    
        return self
    
    def fit_sgd(self, X_train, y_train, n_iters=5, t0=5, t1=50):
        """根据训练数据集X_train, y_train, 使用梯度下降法训练Linear Regression模型"""
        assert X_train.shape[0] == y_train.shape[0], 
            "the size of X_train must be equal to the size of y_train"
        assert n_iters >= 1
    
        def dJ_sgd(theta, X_b_i, y_i):
            return X_b_i * (X_b_i.dot(theta) - y_i) * 2.
    
        def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):
    
            def learning_rate(t):
                return t0 / (t + t1)
    
            theta = initial_theta
            m = len(X_b)
    
            for cur_iter in range(n_iters):
                indexes = np.random.permutation(m)
                X_b_new = X_b[indexes]
                y_new = y[indexes]
                for i in range(m):
                    gradient = dJ_sgd(theta, X_b_new[i], y_new[i])
                    theta = theta - learning_rate(cur_iter * m + i) * gradient
    
            return theta
    
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.random.randn(X_b.shape[1])
        self._theta = sgd(X_b, y_train, initial_theta, n_iters, t0, t1)
    
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
    
        return self
    
    def predict(self, X_predict):
        """给定待预测数据集X_predict,返回表示X_predict的结果向量"""
        assert self.intercept_ is not None and self.coef_ is not None, 
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), 
            "the feature number of X_predict must be equal to X_train"
    
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)
    
    def score(self, X_test, y_test):
        """根据测试数据集 X_test 和 y_test 确定当前模型的准确度"""
    
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)
    
    def __repr__(self):
        return "LinearRegression()"
 

Using sklearn on the Boston housing regression problem

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
 
boston = datasets.load_boston()
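# note: load_boston was removed in scikit-learn 1.2; this walkthrough assumes an older version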
 
boston.keys()
# dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])
 
boston.DESCR
# there are 13 features
 
boston.feature_names
# array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
  
X = boston.data
y = boston.target
 
X = X[y < 50.0]
y = y[y < 50.0]
 
from sklearn.model_selection import train_test_split
 
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)
 
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
 
reg.fit(X_train, y_train)
# LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
 
reg.coef_
'''
    array([-1.22096140e-01,  3.24934065e-02, -4.51945652e-02,  6.32525034e-02,
           -1.17444911e+01,  3.61793376e+00, -2.00394486e-02, -1.21059188e+00,
            2.47235697e-01, -1.31042159e-02, -8.35556922e-01,  8.18076684e-03,
           -3.81616606e-01])
'''
 
reg.intercept_
# 32.73189970831903

 
reg.score(X_test, y_test)
# 0.7640047258028569
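
Since each coefficient describes how a feature contributes to the prediction (as noted earlier), sorting the features by their fitted coefficients gives a rough, indicative ordering of their influence. A small sketch reusing the reg and boston objects above (the features are not on a common scale, so this is only indicative):

np.argsort(reg.coef_)
boston.feature_names[np.argsort(reg.coef_)]
# from the coefficients printed above, NOX sits at the most negative end and RM at the most positive end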

Using kNN on the same regression problem

from sklearn.neighbors import KNeighborsRegressor
knn_reg = KNeighborsRegressor()
knn_reg.fit(X_train, y_train)
# KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=5, p=2, weights='uniform')
 
knn_reg.score(X_test, y_test)
# 0.5812004118756823
 
# use grid search to find the best hyperparameters
param_grid = [{'weights': ['uniform'],
               'n_neighbors': [i for i in range(1,11)]
              }, 
              {'weights': ['distance'],
               'n_neighbors': [i for i in range(1,11)],
               'p': [i for i in range(1,6)]
              }]
   
from sklearn.model_selection import GridSearchCV
# CV stands for Cross Validation.

grid_search = GridSearchCV(knn_reg, param_grid, n_jobs=-1, verbose=2)
 
# note: %time on its own line (as below) only times an empty statement,
# which is why the output reports microseconds; to time the search itself,
# write it as "%time grid_search.fit(X_train, y_train)"
%time
grid_search.fit(X_train, y_train)

'''
    CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
    Wall time: 4.77 µs
    Fitting 3 folds for each of 60 candidates, totalling 180 fits

 

    GridSearchCV(cv='warn', error_score='raise-deprecating',
                 estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                               metric='minkowski',
                                               metric_params=None, n_jobs=None,
                                               n_neighbors=5, p=2,
                                               weights='uniform'),
                 iid='warn', n_jobs=-1,
                 param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                              'weights': ['uniform']},
                             {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                              'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
                 pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
                 scoring=None, verbose=2)

'''
 
grid_search.best_params_
# {'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
 
grid_search.best_score_
# 0.6343537550346618
 
grid_search.best_estimator_
# KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=10, p=1, weights='distance')
 
grid_search.best_estimator_.score(X_test, y_test)
# Better than kNN's default result, but still worse than linear regression. Both score() methods return R^2 here, so the two numbers are directly comparable.
# 0.6717251762406973

Original article: https://www.cnblogs.com/fldev/p/14360141.html