import numpy as np

data_arr = []
label_arr = []
# The path is a raw string on purpose: in a normal literal the "\5" in this
# Windows path is an octal escape (chr(5)) and silently corrupts the file name.
# The `with` block guarantees the file handle is closed after reading.
with open(r'D:\mlInAction\data\5.Logistic\TestSet.txt', 'r') as f:
    for line in f:
        line_arr = line.strip().split()
        if not line_arr:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            continue
        # Prepend a constant 1.0 as X0 so the intercept is learned as weights[0].
        # The builtin float replaces np.float, which was only an alias for it and
        # has been removed from recent NumPy releases.
        data_arr.append([1.0, float(line_arr[0]), float(line_arr[1])])
        label_arr.append(int(line_arr[2]))

data_arr

[[1.0, -0.017612, 14.053064],
 [1.0, -1.395634, 4.662541],
 [1.0, -0.752157, 6.53862],
 [1.0, -1.322371, 7.152853],
 [1.0, 0.423363, 11.054677],
 [1.0, 0.406704, 7.067335],
 [1.0, 0.667394, 12.741452],
 [1.0, -2.46015, 6.866805],
 [1.0, 0.569411, 9.548755],
 [1.0, -0.026632, 10.427743],
 [1.0, 0.850433, 6.920334],
 [1.0, 1.347183, 13.1755],
 [1.0, 1.176813, 3.16702],
 [1.0, -1.781871, 9.097953],
 [1.0, -0.566606, 5.749003],
 [1.0, 0.931635, 1.589505],
 [1.0, -0.024205, 6.151823],
 [1.0, -0.036453, 2.690988],
 [1.0, -0.196949, 0.444165],
 [1.0, 1.014459, 5.754399],
 [1.0, 1.985298, 3.230619],
 [1.0, -1.693453, -0.55754],
 [1.0, -0.576525, 11.778922],
 [1.0, -0.346811, -1.67873],
 [1.0, -2.124484, 2.672471],
 [1.0, 1.217916, 9.597015],
 [1.0, -0.733928, 9.098687],
 [1.0, -3.642001, -1.618087],
 [1.0, 0.315985, 3.523953],
 [1.0, 1.416614, 9.619232],
 [1.0, -0.386323, 3.989286],
 [1.0, 0.556921, 8.294984],
 [1.0, 1.224863, 11.58736],
 [1.0, -1.347803, -2.406051],
 [1.0, 1.196604, 4.951851],
 [1.0, 0.275221, 9.543647],
 [1.0, 0.470575, 9.332488],
 [1.0, -1.889567, 9.542662],
 [1.0, -1.527893, 12.150579],
 [1.0, -1.185247, 11.309318],
 [1.0, -0.445678, 3.297303],
 [1.0, 1.042222, 6.105155],
 [1.0, -0.618787, 10.320986],
 [1.0, 1.152083, 0.548467],
 [1.0, 0.828534, 2.676045],
 [1.0, -1.237728, 10.549033],
 [1.0, -0.683565, -2.166125],
 [1.0, 0.229456, 5.921938],
 [1.0, -0.959885, 11.555336],
 [1.0, 0.492911, 10.993324],
 [1.0, 0.184992, 8.721488],
 [1.0, -0.355715, 10.325976],
 [1.0, -0.397822, 8.058397],
 [1.0, 0.824839, 13.730343],
 [1.0, 1.507278, 5.027866],
 [1.0, 0.099671, 6.835839],
 [1.0, -0.344008, 10.717485],
 [1.0, 1.785928, 7.718645],
 [1.0, -0.918801, 11.560217],
 [1.0, -0.364009, 4.7473],
 [1.0, -0.841722, 4.119083],
 [1.0, 0.490426, 1.960539],
 [1.0, -0.007194, 9.075792],
 [1.0, 0.356107, 12.447863],
 [1.0, 0.342578, 12.281162],
 [1.0, -0.810823, -1.466018],
 [1.0, 2.530777, 6.476801],
 [1.0, 1.296683, 11.607559],
 [1.0, 0.475487, 12.040035],
 [1.0, -0.783277, 11.009725],
 [1.0, 0.074798, 11.02365],
 [1.0, -1.337472, 0.468339],
 [1.0, -0.102781, 13.763651],
 [1.0, -0.147324, 2.874846],
 [1.0, 0.518389, 9.887035],
 [1.0, 1.015399, 7.571882],
 [1.0, -1.658086, -0.027255],
 [1.0, 1.319944, 2.171228],
 [1.0, 2.056216, 5.019981],
 [1.0, -0.851633, 4.375691],
 [1.0, -1.510047, 6.061992],
 [1.0, -1.076637, -3.181888],
 [1.0, 1.821096, 10.28399],
 [1.0, 3.01015, 8.401766],
 [1.0, -1.099458, 1.688274],
 [1.0, -0.834872, -1.733869],
 [1.0, -0.846637, 3.849075],
 [1.0, 1.400102, 12.628781],
 [1.0, 1.752842, 5.468166],
 [1.0, 0.078557, 0.059736],
 [1.0, 0.089392, -0.7153],
 [1.0, 1.825662, 12.693808],
 [1.0, 0.197445, 9.744638],
 [1.0, 0.126117, 0.922311],
 [1.0, -0.679797, 1.22053],
 [1.0, 0.677983, 2.556666],
 [1.0, 0.761349, 10.693862],
 [1.0, -2.168791, 0.143632],
 [1.0, 1.38861, 9.341997],
 [1.0, 0.317029, 14.739025]]

label_arr

[0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0]

def sigmoid(x):
    """
    Numerically stable logistic function 1 / (1 + exp(-x)).

    The direct formula evaluates np.exp(-x), which overflows for large
    negative x (e.g. x = -6000) and emits "RuntimeWarning: overflow
    encountered in exp". Here the value is computed as
    exp(-log(1 + exp(-x))), where np.logaddexp(0, -x) evaluates
    log(exp(0) + exp(-x)) without ever forming the huge intermediate.

    :param x: a scalar, ndarray or np.matrix of real values
    :return: the element-wise sigmoid of x; ufuncs are used throughout, so an
             array or matrix input yields an output of the same shape
    """
    return np.exp(-np.logaddexp(0, -x))

def grad_ascent(data_arr, class_labels, alpha=0.001, max_cycles=500):
    """
    Batch gradient ascent for logistic regression (maximum-likelihood fit).

    :param data_arr: 2-D array-like of shape (m, n): m samples, n features
                     (a nested list, ndarray or np.matrix all work)
    :param class_labels: the m class labels; given as a flat sequence or a
                         1 x m row, they are reshaped into an m x 1 column
                         so they line up with the predictions
    :param alpha: learning rate (step size of each update)
    :param max_cycles: number of full-batch iterations; there is no
                       convergence test, the loop always runs this many times
    :return: the fitted coefficients as an n x 1 np.matrix
    """
    # Plain ndarrays are used internally; np.mat is not available in NumPy 2.x.
    features = np.asarray(data_arr, dtype=float)
    labels = np.asarray(class_labels, dtype=float).reshape(-1, 1)
    # n -> number of features (columns)
    n = features.shape[1]
    # Start from an all-ones coefficient column, e.g. [[1], [1], [1]] for n == 3.
    weights = np.ones((n, 1))
    for _ in range(max_cycles):
        # (m x n) dot (n x 1) -> m x 1 column of predicted probabilities
        h = sigmoid(features.dot(weights))
        error = labels - h
        # Gradient of the log-likelihood with respect to the weights is X^T (y - h).
        weights = weights + alpha * features.T.dot(error)
    # Callers receive a matrix, matching the historical return type.
    return np.asmatrix(weights)

# Fit the regression coefficients with batch gradient ascent on the loaded data.
weights = grad_ascent(data_arr, label_arr)
# Bare expression: presumably here to echo the value as notebook cell output.
weights

matrix([[ 4.12414349],
        [ 0.48007329],
        [-0.6168482 ]])

import matplotlib.pyplot as plt


def plot_best_fit(data_mat, label_mat, weights):
    """
    Scatter-plot the samples by class and draw the fitted decision boundary.

    :param data_mat: 2-D array-like of samples; column 1 is used as the
                     horizontal coordinate and column 2 as the vertical one
    :param label_mat: sequence of labels, one per row of data_mat; rows whose
                      label equals 1 are drawn as black triangles, all other
                      rows as red squares
    :param weights: three coefficients (w0, w1, w2), indexable as weights[0..2]
    :return: None; the figure is shown with plt.show()
    """
    points = np.array(data_mat)
    row_total = np.shape(data_mat)[0]

    # Split the coordinates into the two classes.
    pos_x, pos_y = [], []
    neg_x, neg_y = [], []
    for row in range(row_total):
        is_positive = int(label_mat[row]) == 1
        xs, ys = (pos_x, pos_y) if is_positive else (neg_x, neg_y)
        xs.append(points[row, 1])
        ys.append(points[row, 2])

    # Decision boundary: the model is w0*x0 + w1*x1 + w2*x2 with x0 fixed to 1,
    # and the boundary is where that sum is zero. With x1 on the horizontal axis
    # and x2 on the vertical axis: w0 + w1*x + w2*y = 0  =>  y = (-w0 - w1*x) / w2.
    x = np.arange(-3.0, 3.0, 0.1)
    boundary = (-weights[0] - weights[1] * x) / weights[2]
    # The expression can come out 2-D (e.g. for matrix weights); plot needs 1-D.
    boundary = np.ravel(boundary)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(pos_x, pos_y, s=30, color='k', marker='^')
    ax.scatter(neg_x, neg_y, s=30, color='red', marker='s')
    ax.plot(x, boundary)
    plt.xlabel('x1')
    plt.ylabel('y1')
    plt.show()

# Plot the samples and the decision boundary given by the batch gradient-ascent weights.
plot_best_fit(data_arr, label_arr, weights)

def stoc_grad_ascent0(data_mat, class_labels):
    """
    Stochastic gradient ascent: one pass over the data, updating the
    coefficients from a single sample at a time, in row order.

    :param data_mat: 2-D sample features (an ndarray is expected), one row per sample
    :param class_labels: the class label of each row of data_mat
    :return: 1-D ndarray with the regression coefficients after the pass
    """
    row_count, col_count = np.shape(data_mat)
    step_size = 0.01
    weights = np.ones(col_count)
    for row in range(row_count):
        sample = data_mat[row]
        # Weighted sum w1*x1 + ... + wn*xn for this one sample: a single number,
        # not a vector, so the prediction below is a single number as well.
        activation = sum(sample * weights)
        residual = class_labels[row] - sigmoid(activation)
        # Move the weights along this sample's gradient contribution.
        weights = weights + step_size * residual * sample
    return weights

def stoc_grad_ascent1(data_mat, class_labels, num_iter=150):
    """
    Improved stochastic gradient ascent: in every pass the samples are
    visited in random order without replacement, and the step size shrinks
    as the pass and step counters grow.

    :param data_mat: 2-D sample features (an ndarray is expected), one row per sample
    :param class_labels: the class label of each row of data_mat
    :param num_iter: number of passes over the whole data set
    :return: 1-D ndarray with the regression coefficients
    """
    sample_count, feature_count = np.shape(data_mat)
    weights = np.ones(feature_count)
    for epoch in range(num_iter):
        # Must be a real list (not a range) because entries are removed from it.
        remaining = list(range(sample_count))
        for step in range(sample_count):
            # Decays as epoch and step grow; the + 0.01 keeps it above zero.
            alpha = 4 / (1.0 + epoch + step) + 0.01
            # Draw a random position among the rows not yet used in this pass.
            pick = int(np.random.uniform(0, len(remaining)))
            # Taking the row out of the pool ensures it is used once per pass.
            row = remaining.pop(pick)
            prediction = sigmoid(np.sum(data_mat[row] * weights))
            residual = class_labels[row] - prediction
            weights = weights + alpha * residual * data_mat[row]
    return weights

# Fit the coefficients with the randomized stochastic variant; the inputs are
# converted to ndarrays first because the function indexes rows and multiplies
# them element-wise with the weight vector.
weights1 = stoc_grad_ascent1(np.array(data_arr), np.array(label_arr))
# Bare expression: presumably here to echo the value as notebook cell output.
weights1

array([13.97204276,  1.24615972, -1.9102347 ])

# Plot the samples and the decision boundary given by the stochastic gradient-ascent weights.
plot_best_fit(data_arr, label_arr, weights1)

05机器学习实战之Logistic 回归

Logistic 回归概述

须知概念

Sigmoid 函数

回归概念

二值型输出分类函数

基于最优化方法的回归系数确定

梯度上升法

梯度上升法的思想

Logistic 回归原理

Logistic 回归工作原理

Logistic 回归开发流程

Logistic 回归算法特点

附加方向导数与梯度

Logistic 回归项目案例

项目概述

开发流程

注意

05机器学习实战之Logistic 回归

Logistic 回归 概述

须知概念

Sigmoid 函数

回归 概念

二值型输出分类函数

基于最优化方法的回归系数确定

梯度上升法

梯度上升法的思想

Logistic 回归 原理

Logistic 回归 工作原理

Logistic 回归 开发流程

Logistic 回归 算法特点

附加 方向导数与梯度

Logistic 回归 项目案例

项目概述

开发流程

注意

Logistic 回归概述

回归概念

Logistic 回归原理

Logistic 回归工作原理

Logistic 回归开发流程

Logistic 回归算法特点

附加方向导数与梯度

Logistic 回归项目案例