SDN大作业

1.小组成员

组名：bzzb银河战舰
031702627 李至恒（组长）
031702623 蔡嘉懿
031702625 杨蓝宇
031702632 林华伟
031702634 徐祖豪

2.应用背景

本次实验属于流量分类课题中的恶意流量识别小专题（主要针对DDoS攻击）。
SDN环境下的DDoS攻击检测，检测主机是否遭到了DDoS攻击。

3.环境搭建

以下代码完整部分均已上传至github
拓扑图

from mininet.topo import Topo
class MyTopo(Topo):
    """Custom topology with six switches and twelve hosts.

    s1 and s2 are linked to each other. s1 connects the edge switches s11
    and s12, s2 connects the edge switches s21 and s22, and every edge
    switch serves three hosts (h1-h3, h4-h6, h7-h9, h10-h12).
    """

    def __init__(self):

        # initialize topology
        Topo.__init__(self)

        # add hosts and switches
        h1 = self.addHost('h1', ip='10.1.1.1')
        h2 = self.addHost('h2', ip='10.1.1.2')
        h3 = self.addHost('h3', ip='10.1.1.3')
        h4 = self.addHost('h4', ip='10.1.2.1')
        h5 = self.addHost('h5', ip='10.1.2.2')
        h6 = self.addHost('h6', ip='10.1.2.3')
        h7 = self.addHost('h7', ip='10.10.10.1')
        h8 = self.addHost('h8', ip='10.10.10.2')
        h9 = self.addHost('h9', ip='10.10.10.3')
        h10 = self.addHost('h10', ip='10.10.20.1')
        h11 = self.addHost('h11', ip='10.10.20.2')
        h12 = self.addHost('h12', ip='10.10.20.3')
        s1 = self.addSwitch('s1')
        s11 = self.addSwitch('s11')
        s12 = self.addSwitch('s12')
        s2 = self.addSwitch('s2')
        s21 = self.addSwitch('s21')
        s22 = self.addSwitch('s22')

        # add links; the two trailing integers are passed as the third and
        # fourth positional arguments of addLink (presumably the port numbers
        # used on the first and second node -- confirm against the Mininet API)
        self.addLink(s1, s2, 3, 1)
        self.addLink(s1, s11, 1, 3)
        self.addLink(s1, s12, 2, 3)
        self.addLink(s11, h1, 1, 1)
        self.addLink(s11, h2, 2, 1)
        self.addLink(s11, h3, 4, 1)
        self.addLink(s12, h4, 1, 1)
        self.addLink(s12, h5, 2, 1)
        self.addLink(s12, h6, 4, 1)
        self.addLink(s2, s21, 2, 3)
        self.addLink(s2, s22, 3, 3)
        self.addLink(s21, h7, 1, 1)
        self.addLink(s21, h8, 2, 1)
        self.addLink(s21, h9, 4, 1)
        self.addLink(s22, h10, 1, 1)
        self.addLink(s22, h11, 2, 1)
        self.addLink(s22, h12, 4, 1)
        
# Registry mapping the topology name 'mytopo' to a zero-argument factory that
# builds a MyTopo instance (presumably looked up by Mininet's
# `--custom <file> --topo mytopo` option -- confirm against the launch command).
topos = {'mytopo': (lambda: MyTopo())}

拓扑代码

def cal_PPS(flow_set, interval=2.0):
    """Return the packet rate (packets per second) of one capture window.

    Args:
        flow_set: Sized collection holding one record per captured packet.
        interval: Length of the capture window in seconds. Defaults to 2,
            the divisor that was previously hard-coded.

    Returns:
        float: ``len(flow_set) / interval``.

    Raises:
        ValueError: If ``interval`` is not positive.
    """
    if interval <= 0:
        raise ValueError("interval must be positive")
    # Divide by a float so that an odd packet count is not floored by integer
    # division when the script runs under Python 2.
    return len(flow_set) / float(interval)

def cal_FER(flow_set, fnum, interval=2.0):
    """Return the flow-entry generation rate of one capture window.

    Fields 3 and 4 of every record are added to ``flows``, a set that is
    defined outside this function and therefore keeps growing across calls.

    Args:
        flow_set: Iterable of packet records indexable at positions 3 and 4
            (presumably source and destination address -- confirm against
            the capture format).
        fnum: Size of ``flows`` as returned by the previous call.
        interval: Length of the capture window in seconds. Defaults to 2,
            the divisor that was previously hard-coded.

    Returns:
        tuple: ``(FER, total, new_num)`` where ``new_num`` is the growth of
        ``flows`` during this call, ``FER`` is ``new_num / interval`` and
        ``total`` is the new size of ``flows``.

    Raises:
        ValueError: If ``interval`` is not positive.
    """
    if interval <= 0:
        raise ValueError("interval must be positive")
    for record in flow_set:
        # NOTE(review): the two fields are stored as separate set elements,
        # not as a (field 3, field 4) pair as cal_SFP does, so this counts
        # distinct values rather than distinct pairs. Left unchanged because
        # downstream feature values depend on it -- confirm the intent.
        flows.update({record[3], record[4]})
    total = len(flows)  # total number of entries seen so far
    new_num = total - fnum  # entries added during this window
    # Divide by a float so an odd count is not floored under Python 2.
    FER = new_num / float(interval)
    return FER, total, new_num

def cal_APPF(flow_set):
    """Return the average number of packets per flow entry in one window.

    The entry set is built from this window only. The previous version built
    the same local set but then divided by the size of the global ``flows``
    set, so the local work was dead code and the result depended on state
    accumulated by other calls. Feature values therefore change with this
    fix; a model trained on the old values may need retraining.

    Args:
        flow_set: Sized iterable of packet records indexable at positions 3
            and 4 (presumably source and destination address -- confirm
            against the capture format).

    Returns:
        float: ``len(flow_set)`` divided by the number of distinct entries,
        or ``0.0`` when the window is empty.
    """
    window_flows = set()
    for record in flow_set:
        # NOTE(review): fields 3 and 4 are added as separate elements (as in
        # cal_FER), not as a pair as in cal_SFP -- confirm the intent.
        window_flows.update({record[3], record[4]})
    if not window_flows:
        # An empty window has no entries; avoid dividing by zero.
        return 0.0
    return len(flow_set) / float(len(window_flows))

def cal_SFP(flow_set):
    """Return the share of one-directional flow entries in one window.

    The distinct (field 3, field 4) pairs of the window are collected. The
    result is the number of field-3 values that never occur as a field-4
    value, divided by the number of distinct pairs.

    Args:
        flow_set: Iterable of packet records indexable at positions 3 and 4
            (presumably source and destination address -- confirm against
            the capture format).

    Returns:
        float: The ratio described above, or ``0.0`` for an empty window
        (previously a ``ZeroDivisionError``).
    """
    pairs = {(record[3], record[4]) for record in flow_set}
    if not pairs:
        # Nothing captured: report 0 instead of dividing by zero.
        return 0.0
    sources = {src for src, _ in pairs}
    destinations = {dst for _, dst in pairs}
    one_way = sources - destinations
    return float(len(one_way)) / float(len(pairs))

def cal_PS(flow_set):
    """Return the entropy terms of the TCP and UDP shares of one window.

    Each record's field 2 is matched on its first 20 characters against the
    protocol prefixes "eth:ethertype:ip:tcp" and "eth:ethertype:ip:udp".
    With ``p`` being a protocol's packet count divided by the total number
    of records, the result is ``-sum(p * log(p))`` over the protocols that
    occur at least once. Records of other protocols count towards the total
    only.

    Args:
        flow_set: Sized iterable of packet records whose field 2 is the
            protocol string.

    Returns:
        The value described above, or the integer ``0`` when neither
        protocol occurs.
    """
    prefixes = ("eth:ethertype:ip:tcp", "eth:ethertype:ip:udp")
    counts = [0.0] * len(prefixes)
    for record in flow_set:
        head = record[2][0:20]
        for slot, prefix in enumerate(prefixes):
            if head == prefix:
                counts[slot] += 1
                break
    total = len(flow_set)
    terms = [(c / total) * math.log(c / total) for c in counts if c != 0]
    if not terms:
        return 0
    return -sum(terms)

def cal_h_sIP_dIP(flow_set):
    """Compute three conditional entropies over the packets of one window.

    Fields 3, 4 and 6 of every record are used; the local names treat them
    as source IP, destination IP and destination port (confirm against the
    capture format). Natural logarithms are used throughout.

    Args:
        flow_set: Sized iterable of packet records indexable at positions 3,
            4 and 6. It is iterated several times.

    Returns:
        tuple: ``(h, sIP, dIP)`` where ``h`` is H(srcIP|dstIP), ``sIP`` is
        H(srcIP|dstPort) and ``dIP`` is H(dstPort|dstIP). All three are 0
        when ``flow_set`` is empty.
    """
    # Debug trace printed on every call.
    print("cal_h_sIP_dIP")
    h = 0
    sIP = 0
    dIP = 0
    temp_flows = set()
    srcIP = set()
    dstIP = set()
    dstPort = set()

    # Collect the distinct (field 3, field 4, field 6) triples of the window.
    for i in flow_set:
        #   print(i)
        temp = {(i[3], i[4], i[6])}
        temp_flows.update(temp)
    # Split the triples into the distinct values of each field.
    for i in temp_flows:
        srcIP.add(i[0])
        dstIP.add(i[1])
        dstPort.add(i[2])
    # Lists give every distinct value a stable index for the count tables.
    srcList = list(srcIP)
    dstList = list(dstIP)
    PortList = list(dstPort)

    # Compute H(srcIP|dstIP)
    # A[j]: packets whose destination is dstList[j]
    # B[i][j]: packets from srcList[i] to dstList[j]
    A = [[] for i in range(len(dstList))]
    B = [[[] for i in range(len(dstList))] for i in range(len(srcList))]
    # Init: replace the placeholder lists with zero counters
    for i in range(len(dstList)):
        A[i] = 0
    for i in range(len(srcList)):
        for j in range(len(dstList)):
            B[i][j] = 0
    # print(srcList)
    #  print(dstList)
    flag = 0
    for k in flow_set:  # every packet
        for i in range(len(srcList)):
            for j in range(len(dstList)):
                # print(k[3],k[4])
                if srcList[i] == k[3] and dstList[j] == k[4]:
                    B[i][j] += 1
                    A[j] += 1
                    # flag lets the outer loop stop too once the cell is found
                    flag = 1
                    break
            if flag == 1:
                flag = 0
                break
    # Weighted sum over destinations of the per-destination source entropy.
    for j in range(len(dstList)):
        tempj = (float(A[j]) / float(len(flow_set)))
        tempi = 0
        for i in range(len(srcList)):
            # Zero counts are skipped, so log(0) is never evaluated.
            if B[i][j] != 0 and A[j] != 0:
                tempi += ((float(B[i][j]) / float(A[j]))) * (math.log(float(B[i][j]) / float(A[j])))
        h += tempj * tempi
    h = -h

    # Compute H(dstPort|dstIP)
    # C[i][j]: packets to dstList[j] with destination port PortList[i];
    # the per-destination totals A[j] from the first section are reused.
    C = [[[] for i in range(len(dstList))] for i in range(len(PortList))]
    # Init
    for i in range(len(PortList)):
        for j in range(len(dstList)):
            C[i][j] = 0

    for k in flow_set:  # every packet
        for i in range(len(PortList)):
            for j in range(len(dstList)):
                if PortList[i] == k[6] and dstList[j] == k[4]:
                    C[i][j] += 1
    for j in range(len(dstList)):
        tempj = (float(A[j]) / float(len(flow_set)))
        tempi = 0
        for i in range(len(PortList)):
            if C[i][j] != 0 and A[j] != 0:
                tempi += (float(C[i][j]) / float(A[j])) * (math.log(float(C[i][j]) / float(A[j])))
        dIP += tempj * tempi
    dIP = -dIP

    # Compute H(srcIP|dstPort)
    # D[j]: packets whose destination port is PortList[j]
    # E[i][j]: packets from srcList[i] with destination port PortList[j]
    D = [[] for i in range(len(PortList))]
    E = [[[] for i in range(len(PortList))] for i in range(len(srcList))]
    # Init
    for i in range(len(PortList)):
        D[i] = 0
    for i in range(len(srcList)):
        for j in range(len(PortList)):
            E[i][j] = 0
    for k in flow_set:  # every packet
        for i in range(len(srcList)):
            for j in range(len(PortList)):
                if srcList[i] == k[3] and PortList[j] == k[6]:
                    E[i][j] += 1
                    D[j] += 1
    for j in range(len(PortList)):
        tempj = float(D[j]) / float(len(flow_set))
        tempi = 0
        for i in range(len(srcList)):
            if E[i][j] != 0 and D[j] != 0:
                tempi += (float(E[i][j]) / float(D[j])) * (math.log(float(E[i][j]) / float(D[j])))
        sIP += tempj * tempi
    sIP = -sIP

    # Order of the result: H(srcIP|dstIP), H(srcIP|dstPort), H(dstPort|dstIP)
    return h, sIP, dIP

读取数据集（数据集来自UNB ISCX 2012 intrusion detection evaluation dataset）

def zc_read_csv(m, path="./data.csv"):
    """Load feature rows and their two label columns from a CSV file.

    The file must have exactly ten columns: the first eight become the
    feature matrix and columns 8 and 9 become the label matrix.

    Args:
        m: Maximum number of rows to load (an integer). ``None`` or a
            non-positive value loads every row, as does a value larger than
            the file.
        path: CSV file path or file-like object handed to ``pd.read_csv``.
            Defaults to the previously hard-coded "./data.csv".

    Returns:
        tuple: ``(x, y)`` of float arrays with shapes ``(rows, 8)`` and
        ``(rows, 2)``.

    Raises:
        ValueError: If the file does not have exactly ten columns.
    """
    zc_dataframe = pd.read_csv(path, sep=",")
    if m is not None and m > 0:
        zc_dataframe = zc_dataframe.iloc[:m]
    # Convert the whole table at once instead of concatenating row by row,
    # which copied the growing arrays on every iteration.
    data = zc_dataframe.values.astype(float)
    if data.shape[1] != 10:
        raise ValueError(
            "expected 10 columns in the data set, found %d" % data.shape[1]
        )
    x = data[:, :-2]
    y = data[:, 8:10]
    return (x, y)
	
def RNN(X, weights, biases):
    """Build the LSTM graph and return the output-layer tensor.

    ``n_steps``, ``n_hidden_units`` and ``batch_size`` are read from names
    defined outside this function.

    Args:
        X: Input tensor multiplied directly with ``weights['in']``
            (presumably of shape (batch * n_steps, n_inputs) -- confirm
            against the caller).
        weights: Mapping with the entries 'in' and 'out'.
        biases: Mapping with the entries 'in' and 'out'.

    Returns:
        The result of the output layer; no activation is applied here.
    """
    # Input projection X.W + b. The previous version computed X.(W + b),
    # which folds the bias into the weight matrix and scales it by the input.
    # NOTE: checkpoints trained with the old expression must be retrained.
    X_in = tf.matmul(X, weights['in']) + biases['in']
    X_in = tf.reshape(X_in, [-1, n_steps, n_hidden_units])
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden_units, forget_bias=1.0, state_is_tuple=True)
    _init_state = lstm_cell.zero_state(batch_size, dtype=tf.float32)
    outputs, states = tf.nn.dynamic_rnn(lstm_cell, X_in, initial_state=_init_state, time_major=False)
    # With state_is_tuple=True the final state is a (c, h) pair; the output
    # layer is fed element 1 (presumably the hidden state h -- see the
    # TensorFlow LSTMStateTuple documentation).
    results = tf.matmul(states[1], weights['out']) + biases['out']
    return results

机器学习

4.演示视频（视频时长6分钟，可以跳跃观看）

https://www.bilibili.com/video/av82387501/

5.实验思路说明

前期准备：网络上下载数据集 ==> 数据集初步处理 ==> 特征向量提取 ==>生成训练集和测试集 ==>构建LSTM模型 ==>训练模型 ==> 保存模型
实验过程（视频）：POX控制器开启 ==> 拓扑搭建 ==> 全网互通 ==> 攻击模拟 ==> 运行脚本（脚本内容放置在github上） ==> 返回结果
脚本流程：捕获这2秒内网络环境中的数据包 ==>生成初步处理数据集 ==>特征向量提取 ==>载入模型 ==>模型给出判断

如果需要的话：
循环取值训练，模型在训练集（来自网络）上的准确率曲线

随机取值训练，模型在训练集（来自网络）上的准确率曲线

（但是因为测试集和环境中模型表现很差，最终是重新自己针对实验制作了数据集训练了模型，可以完美符合这个小实验的要求......很遗憾因为知识不足无法更好地改进模型以匹配真实网络环境中的攻击）

6.github链接

https://github.com/SilentSamsara/DDos-attack-detection/

7.分工和比例

学号	姓名	分工	比例
031702627	李至恒	模型搭建和模型训练	20%
031702623	蔡嘉懿	数据集的获取和特征向量提取的部分函数	20%
031702625	杨蓝宇	部分模型工作和运行脚本和演示工作	20%
031702632	林华伟	拓扑环境搭建和pox控制器学习	20%
031702634	徐祖豪	特征向量提取代码主编	20%

因为这次实验大家一起在活动室坐了2天，难以量化数据，故取平均值

8.实验感想

林华伟：这次实验由于时间比较赶，和同学一起在活动室肝了两天，期间大家一起分工并讨论问题，我在这几天学习了如何使用pox控制器连接拓扑，如何用hping3模拟DDoS攻击，如何编写shell脚本执行tcpdump自动抓包并将pcap文件转换为csv文件等等，这段时间通过实践学到了很多东西，受益匪浅。
蔡嘉懿：这次大作业我们做的是深度学习检测DDoS攻击，选择课题的难度较大，所以所有组员一起合作分工做了很久才完成。通过这个大作业感受到了研究课题的难度和学习的艰辛，但同时感受到更多的是收获的知识以及团队协同的精神。大家都很努力的在完成这次作业，看到最后的成果，真的是满足在心中！
徐祖豪：SDN作为一门创新创业的课程的确是有其新颖性，通过这一学期的学习，从一开始的一窍不通到如今的掌握了很多新的概念，学习的过程还是很顺利的。但是mininet作为核心工具却只能运行在linux环境下，在这样一个新的环境下编写程序运行程序花了很多时间去试错，但也正是在不停的实践和试错中才会有更深刻的理解。然后就是大作业的选题是报告时的恶意流量检测，题目很新颖，但是大家都没有机器学习中的分类问题的开发经验，在数据集的选择上一头雾水，对于训练集数据的采集也不够具有代表性，训练出来的模型的正确率不是很理想，刚好最近又有考试，也不能all in，的确有遗憾，下次努力吧。
杨蓝宇：大作业有点复杂，搞得我内心极其复杂，不是代码有多难写，而是Ubuntu和内部的条件太难懂了，但是我们团队的人没有放弃，尝试了很多次Ubuntu的脚本结合python，有了一些起色，进行了很多次试验，最后成功检测，感觉付出没有白费。
李至恒：这次实验真的让人感觉头很大，之前从来都没有做过机器学习，从头开始去看，印象最深的是第一天坐在活动室一整天都搭不出个模型，基础实在太差了。模型做完以后本来以为基本完事了，结果训练的时候又出现了问题，准确率不管怎么样都上不去，而且原因太多了，排查成本也很大（训练集问题啊，模型问题啊），最后是一切从简，既然网络中的DDoS攻击检测因为数据集的提取或者模型的选择有问题无法很好拟合，那我们就针对这次实验训练模型（说白了就是测试集当训练集的做法，哈哈），所以最终的模型的训练集用的是自己提取的训练集......但是不能说没有意义，我们的模型无法契合现实，但是借用的思想未必是错的，想想问题还是在于知识的匮乏，没法很好地调整和选择模型。我们在一起学习了很多，我们从数据集获取，处理，导入模型，训练模型，部署模型，拓扑搭建，攻击模拟，结果展示，和队友们一步一步走过来，完成了整个流程，我个人觉得还是很有意义的，感谢队友们一路一起走来，想说的话真的太多太多。