# Calling stochastic gradient descent through sklearn (sklearn调用随机梯度下降法)

# Gradient descent written from first principles.
# Gradient descent on a one-dimensional function.
import numpy as np
import matplotlib.pyplot as plt
# 141 evenly spaced sample points covering [-1, 6].
x=np.linspace(-1,6,141)
# Quadratic loss curve; its minimum value -1 is reached at x = 2.5.
y=(x-2.5)**2-1
plt.plot(x,y)
plt.show()
def dJ(theta):
    """Return the derivative of the loss (theta - 2.5)**2 - 1 at theta."""
    offset = theta - 2.5
    return 2 * offset
def J(theta):
    """Return the loss (theta - 2.5)**2 - 1 evaluated at theta."""
    offset = theta - 2.5
    return offset**2 - 1
# Set the starting point and the learning rate for gradient descent.
theta=0.0  # initial value
eta=0.1    # learning rate
epsilon=1e-10 # stop once the loss changes by less than this between two steps
theta1=[theta]  # every theta visited, beginning with the starting point
# NOTE: this loop has no iteration cap; it ends only when the loss change
# falls below epsilon.
while True:
    gradient=dJ(theta)
    last_theta=theta
    theta=theta-eta*gradient
    theta1.append(theta)
    if (abs(J(theta)-J(last_theta)))<epsilon:
        break
print(theta)
print(J(theta))
# Draw the loss curve and overlay the visited points in red.
plt.plot(x,y)
plt.plot(np.array(theta1),J(np.array(theta1)),color="r",marker="+")
plt.show()
print(len(theta1))
# Gradient descent wrapped into a directly reusable function.
def gradient_descent(eta,theta_initial,epsilon=1e-8,n_iters=1e3):
    """Run 1-D gradient descent using the module-level ``dJ`` and ``J``.

    Args:
        eta: learning rate; each step moves theta by ``-eta * dJ(theta)``.
        theta_initial: starting value of theta.
        epsilon: stop once ``abs(J(theta) - J(last_theta))`` is below this.
        n_iters: upper bound on the number of update steps.

    Returns:
        The final theta. Earlier versions returned nothing, so callers that
        ignore the return value are unaffected.

    Side effects:
        Appends ``theta_initial`` and every updated theta to the module-level
        list ``theta_history``, which must exist before this is called.
    """
    theta=theta_initial
    theta_history.append(theta_initial)
    i_iters=0
    while i_iters<n_iters:
        gradient = dJ(theta)
        last_theta = theta
        theta = theta - gradient * eta
        theta_history.append(theta)
        i_iters += 1
        if (abs(J(theta) - J(last_theta))) < epsilon:
            break
    return theta
def plot_theta_history():
    """Plot the module-level curve (x, y) and overlay the points recorded
    in ``theta_history`` as red '+' markers joined by a line."""
    plt.plot(x, y)
    visited = np.array(theta_history)
    plt.plot(visited, J(visited), color="r", marker="+")
    plt.show()
eta=0.01   # learning rate; 0.01 is generally a fairly safe choice
theta_history=[]  # gradient_descent appends every visited theta to this list
gradient_descent(eta,0.0)
plot_theta_history()
print(len(theta_history))
# Gradient descent for multiple linear regression.
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(666)
# 100 feature values: np.random.random samples [0, 1), scaled here by 2.
x=2*np.random.random(size=100)
# Targets follow y = 3x + 4 plus np.random.normal noise.
y=x*3.0+4.0+np.random.normal(size=100)
# Column-vector view of x (one feature per sample).
X=x.reshape(-1,1)
print(x.shape)
print(y.shape)
plt.scatter(x,y)
plt.show()
print(len(X))
def J(theta,x_b,y):
    """Return the mean squared error of the linear model ``x_b.dot(theta)``.

    Args:
        theta: parameter vector.
        x_b: design matrix; ``len(x_b)`` is used as the sample count.
        y: target values.

    Returns:
        ``sum((y - x_b.dot(theta))**2) / len(x_b)``, or ``float("inf")`` when
        the computation raises an ordinary exception (for example mismatched
        shapes).
    """
    try:
        return np.sum((y-x_b.dot(theta))**2)/len(x_b)
    except Exception:
        # Only ordinary errors map to an infinite loss; KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare except.
        return float("inf")
def dJ(theta,x_b,y):
    """Return the gradient of the mean squared error, one component at a time.

    Component 0 is the plain sum of the residuals; every other component is
    the residual vector dotted with the matching column of ``x_b``. The
    result is scaled by ``2 / len(x_b)``.
    """
    residual = x_b.dot(theta) - y
    n_params = len(theta)
    grad = np.empty(n_params)
    grad[0] = np.sum(residual)
    for col in range(1, n_params):
        grad[col] = residual.dot(x_b[:, col])
    return grad * 2 / len(x_b)
# Vectorised form of the linear-regression gradient.
def dJ1(theta,x_b,y):
    """Return ``x_b.T.dot(x_b.dot(theta) - y) * 2 / len(x_b)``, the gradient
    of the mean squared error computed in one matrix product."""
    residual = x_b.dot(theta) - y
    projected = x_b.T.dot(residual)
    return projected * 2 / len(x_b)
def gradient_descent(x_b,y,theta_initial,eta,epsilon=1e-8,n_iters=1e4):
    """Minimise the loss ``J`` by batch gradient descent with gradient ``dJ1``.

    Starting from ``theta_initial``, each step subtracts ``eta`` times the
    gradient. The loop ends when the loss changes by less than ``epsilon``
    between two consecutive steps, or after ``n_iters`` steps. Returns the
    last theta computed.
    """
    current = theta_initial
    step = 0
    while step < n_iters:
        previous = current
        current = previous - dJ1(previous, x_b, y) * eta
        loss_change = J(current, x_b, y) - J(previous, x_b, y)
        if abs(loss_change) < epsilon:
            break
        step += 1
    return current
# Prepend a column of ones so that theta[0] acts as the intercept.
x_b =np.hstack([np.ones((len(X),1),dtype=float),X])
initial_theta=np.zeros(x_b.shape[1])  # start from the zero vector
# The learning rate must be assigned before the call that consumes it; it used
# to be assigned afterwards and only worked through a leftover global value.
eta=0.01 # learning rate
theta=gradient_descent(x_b,y,initial_theta,eta)
print(theta)
# Fit linear regression on a real dataset with the gradient-descent search.
# The data needs to be normalised: the features differ in order of magnitude,
# so a single eta acts very differently on each of them.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
# 1-2 Load the data.
# NOTE(review): datasets.load_boston was removed in scikit-learn 1.2, so this
# line needs an older release — confirm the pinned version.
boston=datasets.load_boston()
X=boston.data
y=boston.target
# Keep only the samples whose target is below 50.0.
x=X[y<50.0]
y=y[y<50.0]
# Split the data into a training set and a test set.
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=666)
X=x_train
y=y_train
# Prepend a column of ones so that theta[0] acts as the intercept.
x_b =np.hstack([np.ones((len(X),1),dtype=float),X])
initial_theta=np.zeros(x_b.shape[1])  # start from the zero vector
eta=0.000001 # learning rate
theta=gradient_descent(x_b,y,initial_theta,eta,n_iters=1e6)
print(theta)
x_b =np.hstack([np.ones((len(x_test),1),dtype=float),x_test])
y_pre=x_b.dot(theta)
from sklearn.metrics import r2_score
# r2_score expects (y_true, y_pred); R^2 is not symmetric, so the previous
# swapped order reported a different, incorrect score.
print(r2_score(y_test,y_pre))
# Standardise the features before running gradient descent.
from sklearn.preprocessing import StandardScaler
s=StandardScaler()
s.fit(x_train)  # the scaler is fitted on the training split only
x_train1=s.transform(x_train)
X=x_train1
y=y_train
# Prepend a column of ones so that theta[0] acts as the intercept.
x_b =np.hstack([np.ones((len(X),1),dtype=float),X])
initial_theta=np.zeros(x_b.shape[1])  # start from the zero vector
eta=0.001 # learning rate
theta=gradient_descent(x_b,y,initial_theta,eta,n_iters=1e6)
print(theta)
x_test=s.transform(x_test)  # the test split reuses the training-set scaler
x_b =np.hstack([np.ones((len(x_test),1),dtype=float),x_test])
# Coefficients times features plus intercept; the same prediction as
# x_b.dot(theta) up to floating-point rounding.
y_pre=x_test.dot(theta[1:])+theta[0]
from sklearn.metrics import r2_score
# r2_score expects (y_true, y_pred); R^2 is not symmetric, so the previous
# swapped order reported a different, incorrect score.
print(r2_score(y_test,y_pre))

# The idea behind stochastic gradient descent.
def J1(theta,x_b,y):
    """Return the mean squared error of the linear model ``x_b.dot(theta)``.

    Args:
        theta: parameter vector.
        x_b: design matrix; ``len(x_b)`` is used as the sample count.
        y: target values.

    Returns:
        ``sum((y - x_b.dot(theta))**2) / len(x_b)``, or ``float("inf")`` when
        the computation raises an ordinary exception (for example mismatched
        shapes).
    """
    try:
        return np.sum((y-x_b.dot(theta))**2)/len(x_b)
    except Exception:
        # Only ordinary errors map to an infinite loss; KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare except.
        return float("inf")
def dJ_SGD(theta,x_b_i,y_i):
    """Return the squared-error gradient for the given sample(s).

    Computes ``x_b_i.T.dot(x_b_i.dot(theta) - y_i) * 2`` with no division by
    a sample count. The SGD routines in this file pass a single row of the
    design matrix and the matching target.
    """
    residual = x_b_i.dot(theta) - y_i
    direction = x_b_i.T.dot(residual)
    return direction * 2
# Plain stochastic gradient descent.
def sgd(x_b,y,initial_theta,n_iters,t0=5,t1=50):
    """Run stochastic gradient descent, one randomly drawn sample per step.

    Args:
        x_b: design matrix; rows are indexed as ``x_b[i]``.
        y: targets, indexed as ``y[i]``.
        initial_theta: starting parameter vector.
        n_iters: number of single-sample update steps.
        t0: numerator of the learning-rate schedule (default 5, the value
            that used to be hard-coded).
        t1: offset in the schedule denominator (default 50, the value that
            used to be hard-coded).

    Returns:
        The parameter vector after ``n_iters`` updates.
    """
    def learning_rate(t):
        # Decaying step size: t0 / (t1 + t) shrinks as the step index grows.
        return t0/(t1+t)
    theta=initial_theta
    for cur_iter in range(n_iters):
        # Draw a sample index with np.random.randint (samples can repeat).
        rand_i=np.random.randint(len(x_b))
        gradient=dJ_SGD(theta,x_b[rand_i],y[rand_i])
        theta=theta-gradient*learning_rate(cur_iter)
    return theta
# Standard form of stochastic gradient descent.
def sgd1(x_b,y,initial_theta,n_iters,t0=5,t1=50):
    """Run stochastic gradient descent for ``n_iters`` full passes.

    Every pass reshuffles the samples with ``np.random.permutation`` and then
    applies one single-sample update per row. The step size for the k-th
    update overall (counted from 0) is ``t0 / (t1 + k)``. Returns the final
    parameter vector.
    """
    n_samples = len(x_b)
    current = initial_theta
    for epoch in range(n_iters):
        order = np.random.permutation(n_samples)
        shuffled_x = x_b[order]
        shuffled_y = y[order]
        for pos in range(n_samples):
            grad = dJ_SGD(current, shuffled_x[pos], shuffled_y[pos])
            rate = t0 / (t1 + (epoch * n_samples + pos))
            current = current - grad * rate
    return current
np.random.seed(666)
# 100000 feature values: np.random.random samples [0, 1), scaled here by 2.
x=2*np.random.random(size=100000)
# Targets follow y = 3x + 4 plus np.random.normal noise.
y=x*3.0+4.0+np.random.normal(size=100000)
X=x.reshape(-1,1)
# Prepend a column of ones so that theta[0] acts as the intercept.
x_b=np.hstack([np.ones((len(X),1),dtype=float),X])
initial_theta=np.zeros(x_b.shape[1])  # start from the zero vector
theta=sgd1(x_b,y,initial_theta,n_iters=5)
eta=0.01 # learning-rate value; it is not passed to the sgd1 call above
print(theta)

# Solving multiple linear regression with sklearn's SGD (stochastic gradient descent).
# NOTE(review): datasets.load_boston was removed in scikit-learn 1.2 — confirm
# the pinned version.
boston=datasets.load_boston()
X=boston.data
y=boston.target
# Keep only the samples whose target is below 50.0.
x=X[y<50.0]
y=y[y<50.0]
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=666)
from sklearn.preprocessing import StandardScaler # feature standardisation
s=StandardScaler()
s.fit(x_train)  # the scaler is fitted on the training split only
x_train=s.transform(x_train)
x_test=s.transform(x_test)
from sklearn.linear_model import SGDRegressor
# The name s is rebound here from the scaler to the regressor.
# NOTE(review): the original comment said this argument defines the number of
# passes over the data; per the scikit-learn docs n_iter_no_change is the
# early-stopping patience and max_iter caps the passes — confirm intent.
s=SGDRegressor(n_iter_no_change=10)
s.fit(x_train,y_train)
print(s.score(x_test,y_test))

# Using gradient descent to minimise the loss of multiple linear regression.
np.random.seed(666)
# 1000 samples with 10 features each.
x=np.random.random(size=(1000,10))
# Parameter vector used to generate y: the values 1.0 through 11.0, one per
# column of x_b below. (The name looks like a typo of "true_y".)
ture_y=np.arange(1,12,dtype=float)
# Prepend a column of ones to x.
x_b=np.hstack([np.ones((len(x),1)),x])
print(ture_y)
# Targets are the linear model output plus np.random.normal noise.
y=x_b.dot(ture_y)+np.random.normal(size=1000)
# 1. Train with gradient descent.
def J11(theta,x_b,y):
    """Return the mean squared error ``sum((y - x_b.dot(theta))**2) / len(x_b)``."""
    residual = y - x_b.dot(theta)
    squared_error = np.sum(residual**2)
    return squared_error / len(x_b)
# Ways to compute the partial derivatives of a multivariate function.
# 1-1 Closed-form (analytic) formula.
def DJmath(theta, x_b, y):
    """Return the analytic gradient ``x_b.T.dot(x_b.dot(theta) - y) * 2 / len(y)``."""
    residual = x_b.dot(theta) - y
    projected = x_b.T.dot(residual)
    return projected * 2 / len(y)
# 1-2 Approximation from the definition of the derivative (works for any function).
def DJdebug(theta, x_b, y,ep=0.0001):
    """Approximate the gradient of the loss ``J11`` by central differences.

    Each component f is estimated as
    ``(J11(theta + ep*e_f) - J11(theta - ep*e_f)) / (2*ep)``, where ``e_f``
    is the f-th unit vector.

    Args:
        theta: parameter vector; it is copied, never modified.
        x_b: design matrix passed through to ``J11``.
        y: targets passed through to ``J11``.
        ep: half-width of the finite-difference step.

    Returns:
        A float array with one gradient estimate per entry of ``theta``.
    """
    res = np.empty(len(theta))
    for f in range(len(theta)):
        theta_plus = theta.copy()
        theta_plus[f] = theta_plus[f] + ep
        theta_minus = theta.copy()
        theta_minus[f] = theta_minus[f] - ep
        # Differentiate J11, the loss gradient_descent1 minimises. The earlier
        # code used J1, which turns any error into inf and hides it as NaN.
        res[f] = (J11(theta_plus, x_b, y) - J11(theta_minus, x_b, y)) / (2 * ep)
    return res
def gradient_descent1(dj,x_b,y,eta,theta_initial,erro=1e-8, n=1e4):
    """Minimise the loss ``J11`` by gradient descent with a pluggable gradient.

    ``dj(theta, x_b, y)`` supplies the gradient. Each step subtracts ``eta``
    times that gradient; the loop ends when the loss changes by less than
    ``erro`` between two consecutive steps, or after ``n`` steps. Returns the
    last theta computed.
    """
    current = theta_initial
    step = 0
    while step < n:
        previous = current
        current = previous - dj(previous, x_b, y) * eta
        loss_change = J11(current, x_b, y) - J11(previous, x_b, y)
        if abs(loss_change) < erro:
            break
        step += 1
    return current
print(x_b)
theta0=np.zeros(x_b.shape[1])  # start from the zero vector
eta=0.1  # learning rate
# Fit once with the finite-difference gradient and once with the analytic one.
theta1=gradient_descent1(DJdebug,x_b,y,eta,theta0)
print(theta1)
theta2=gradient_descent1(DJmath,x_b,y,eta,theta0)
print(theta2)