15.衡量回归算法的标准

衡量回归算法的标准

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

波士顿房产数据

# Load the Boston housing dataset through scikit-learn's dataset helpers.
# NOTE(review): load_boston appears to be deprecated/removed in newer
# scikit-learn releases -- confirm the installed version still provides it.
boston = datasets.load_boston()

x = boston.data[:, 5] # use only the number-of-rooms feature (column index 5)

# Notebook-style display of the feature vector's shape.
x.shape

(506,)

# Target values that the regression below tries to predict.
y = boston.target
y.shape

(506,)

# Scatter plot of the selected feature against the target.
plt.scatter(x, y)

# Largest target value; the filtering step later in the file uses it as the cut-off.
np.max(y)

50.0

去除干扰数据

# Drop every sample whose target is not strictly below 50.0 (the maximum
# target value found earlier). The mask is built once from the unfiltered
# targets so that both arrays lose exactly the same rows.
below_cap = y < 50.0
x, y = x[below_cap], y[below_cap]

# Notebook-style display: both arrays should report the same reduced length.
x.shape, y.shape

((490,), (490,))

# Re-draw the scatter plot using the filtered x and y.
plt.scatter(x, y)

使用简单线性回归

# Split the data into a training set and a test set.

from sklearn.model_selection import train_test_split
# A fixed random_state (666) is passed so the split can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=666)

# Notebook-style display of the training-set shapes.
x_train.shape, y_train.shape

((367,), (367,))

# Fit the regression line y = a * x + b on the training set using the
# closed-form least-squares solution for a single feature.

x_mean = x_train.mean()
y_mean = y_train.mean()

# Centre both variables once; the slope is sum(dx * dy) / sum(dx * dx).
x_centered = x_train - x_mean
y_centered = y_train - y_mean

num = np.dot(x_centered, y_centered)
d = np.dot(x_centered, x_centered)

a = num / d                 # slope
b = y_mean - a * x_mean     # intercept

# Fitted values on the training inputs (used for plotting the line).
y_hat = a * x_train + b

# Notebook-style display of the training-set shapes.
x_train.shape, y_train.shape

((367,), (367,))

# Notebook-style display of the fitted values' shape.
y_hat.shape

(367,)

# Training points with the fitted regression line drawn over them.
plt.scatter(x_train, y_train)
plt.plot(x_train, y_hat)

衡量回归算法的标准

# Predict the test-set targets with the fitted line y = a * x + b.
y_predict = a * x_test + b

# MSE: mean of the squared differences between predictions and true values.

test_residuals = y_predict - y_test
mse_test = np.sum(test_residuals ** 2) / len(y_test)
mse_test

28.215949368640807

# RMSE: the square root of the test-set MSE computed above.

from math import sqrt
rmse_test = sqrt(mse_test)
rmse_test

5.311868726600913

# MAE: mean of the absolute differences between predictions and true values.

abs_residuals = np.abs(y_predict - y_test)
mae_test = abs_residuals.sum() / len(y_test)
mae_test

3.9489046062737834

sklearn 中的 MSE 和 MAE

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


# MSE via scikit-learn; the recorded output equals the manual mse_test value.
mean_squared_error(y_test, y_predict)

28.215949368640807

# MAE via scikit-learn; the recorded output equals the manual mae_test value.
mean_absolute_error(y_test, y_predict)

3.9489046062737834

R Squared (R²)

# R squared computed by hand: 1 - MSE / Var(y_test).
test_mse = mean_squared_error(y_test, y_predict)
test_variance = np.var(y_test)
rsquare = 1 - test_mse / test_variance
rsquare

0.5682464825049472

# R squared via scikit-learn; the recorded output equals the manual rsquare value.
from sklearn.metrics import r2_score
r2_score(y_test, y_predict)

0.5682464825049472