python3-爬取cnnvd漏洞信息-代理+多线程

2018年1月26日 16:04:06 更新代码。

excel结果:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# by 默不知然
# 2017年12月18日 10:01:30

import requests
from urllib import parse
from bs4 import BeautifulSoup
import xlwt
import zlib
import re
import time
import xlsxwriter
import sys
import datetime
import random
import threadpool

'''
运行方法:
python spider_cnnvd.py 2017-10-01 2017-10-31 178
第一个为开始时间,第二个为结束时间,第三个为总页数。
'''

#获取代理,并写入列表agent_lists
def agent_list(url):
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'}
    r = requests.get(url,headers=header)
    agent_info = BeautifulSoup(r.content,'lxml').find(id="ip_list").find_all('tr')[1:]
    for i in range(len(agent_info)):
        info = agent_info[i].find_all('td')
        agents = {info[5].string : 'http://' + info[1].string}
        agent_lists.append(agents)


#获得漏洞详情链接列表
def vulnerabilities_url_list(url,start_time,end_time):
        header = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
                'Content-Type': 'application/x-www-form-urlencoded'
                }
        data ={
                'qstartdate':'2018-02-01',
                'qenddate':'2018-02-01'
                }
        data['qstartdate'] = start_time
        data['qenddate'] = end_time
        proxy = random.sample(agent_lists,1)[0]
        vulnerabilities_url_html = requests.post(url,headers=header,proxies=proxy,data=data)
        vulnerabilities_url_html = vulnerabilities_url_html.content.decode()
    
    #提取漏洞详情链接
        response = r'href="(.+?)" target="_blank" class="a_title2"'
        vulnerabilities_link_list = re.compile(response).findall(vulnerabilities_url_html)
    
    #添加http前序
        i = 0
        for link in vulnerabilities_link_list:
                vulnerabilities_lists.append('http://cnnvd.org.cn'+vulnerabilities_link_list[i])
                i+=1
                print("已完成爬行第%d个漏洞链接"%i)

#漏洞信息爬取函数
def vulnerabilities_data(url):
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'}
    #proxy = random.sample(agent_lists,1)[0]
    vulnerabilities_data_html = requests.get(url,headers=header)            #,proxies=proxy)
    vulnerabilities_data_html = vulnerabilities_data_html.content.decode()

    #global vulnerabilities_result_list
    vulnerabilities_result_list_eve=[]    #抓取信息列表命名
    
    #添加漏洞信息详情
    vulnerabilities_detainled_soup1 = BeautifulSoup(vulnerabilities_data_html,'html.parser')
    vulnerabilities_detainled_data = vulnerabilities_detainled_soup1.find('div',attrs={'class':'detail_xq w770'})    ##定义 漏洞信息详情 块的soup
    vulnerabilities_detainled_data = vulnerabilities_detainled_data.encode().decode()
    vulnerabilities_detainled_soup = BeautifulSoup(vulnerabilities_detainled_data,'html.parser')    #二次匹配    

    vulnerabilities_detainled_data_list = vulnerabilities_detainled_soup.find_all('li')    #标签a信息汇总    
    
    try:
        vulnerabilities_name = vulnerabilities_detainled_soup.h2.string    #漏洞名称
    except:
        vulnerabilities_name = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_name)
    
    try:
        vulnerabilities_cnnvd_num = vulnerabilities_detainled_soup.span.string    #cnnvd编号
        vulnerabilities_cnnvd_num = re.findall(r":([sS]*)",vulnerabilities_cnnvd_num)[0]
    except:
        vulnerabilities_cnnvd_num = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_cnnvd_num)
    
    try:                            #漏洞等级
        vulnerabilities_rank = vulnerabilities_detainled_soup.a.decode()
        vulnerabilities_rank = re.search(u'([u4e00-u9fa5]+)',vulnerabilities_rank).group(0)
    except:
        vulnerabilities_rank = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_rank)

    vulnerabilities_cve_html = vulnerabilities_detainled_data_list[2].encode().decode()    #漏洞cve编号
    vulnerabilities_cve_soup = BeautifulSoup(vulnerabilities_cve_html,'html.parser')
    try:
        vulnerabilities_cve = vulnerabilities_cve_soup.a.string
        vulnerabilities_cve = vulnerabilities_cve.replace("
","").replace("	","").replace("
","").replace(" ","")
    except:
        vulnerabilities_cve = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_cve)
    
    vulnerabilities_type_html = vulnerabilities_detainled_data_list[3].encode().decode()    #漏洞类型
    vulnerabilities_type_soup = BeautifulSoup(vulnerabilities_type_html,'html.parser')
    try:
        vulnerabilities_type = vulnerabilities_type_soup.a.string
        vulnerabilities_type = vulnerabilities_type.replace("
","").replace("	","").replace("
","").replace(" ","")
    except:
        vulnerabilities_type = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_type)
    
    vulnerabilities_time_html = vulnerabilities_detainled_data_list[4].encode().decode()    #发布时间
    vulnerabilities_time_soup = BeautifulSoup(vulnerabilities_time_html,'html.parser')
    try:    
        vulnerabilities_time = vulnerabilities_time_soup.a.string
        vulnerabilities_time = vulnerabilities_time.replace("
","").replace("	","").replace("
","")
    except:
        vulnerabilities_time = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_time)

    vulnerabilities_attack_html = vulnerabilities_detainled_data_list[5].encode().decode()    #威胁类型
    vulnerabilities_attack_soup = BeautifulSoup(vulnerabilities_attack_html,'html.parser')
    try:    
        vulnerabilities_attack = vulnerabilities_attack_soup.a.string
        vulnerabilities_attack = vulnerabilities_attack.replace("
","").replace("	","").replace("
","")
    except:
        vulnerabilities_attack = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_attack)

    vulnerabilities_update_html = vulnerabilities_detainled_data_list[6].encode().decode()    #更新时间
    vulnerabilities_update_soup = BeautifulSoup(vulnerabilities_update_html,'html.parser')
    try:
        vulnerabilities_update = vulnerabilities_update_soup.a.string
        vulnerabilities_update = vulnerabilities_update.replace("
","").replace("	","").replace("
","")
    except:
        vulnerabilities_update = ''    
    vulnerabilities_result_list_eve.append(vulnerabilities_update)

    vulnerabilities_firm_html = vulnerabilities_detainled_data_list[7].encode().decode()    #厂商
    vulnerabilities_firm_soup = BeautifulSoup(vulnerabilities_firm_html,'html.parser')
    try:
        vulnerabilities_firm = vulnerabilities_firm_soup.a.string
        vulnerabilities_firm = vulnerabilities_firm.replace("
","").replace("	","").replace("
","")
    except:
        vulnerabilities_firm = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_firm)

    vulnerabilities_source_html = vulnerabilities_detainled_data_list[8].encode().decode()    #漏洞来源
    vulnerabilities_source_soup = BeautifulSoup(vulnerabilities_source_html,'html.parser')
    try:
        vulnerabilities_source = vulnerabilities_source_soup.a.string
        vulnerabilities_source = vulnerabilities_source.replace("
","").replace("	","").replace("
","")
    except:
        vulnerabilities_source = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_source)
    

    #添加漏洞简介详情
    vulnerabilities_title_html = vulnerabilities_detainled_soup1.find('div',attrs={'class':'d_ldjj'})    #定义 漏洞简介 块的soup
    vulnerabilities_title_html = vulnerabilities_title_html.encode().decode()
    vulnerabilities_title_soup2 = BeautifulSoup(vulnerabilities_title_html,'html.parser')

    try:
        vulnerabilities_titles1 = vulnerabilities_title_soup2.find_all(name='p')[0].string
        vulnerabilities_titles2 = vulnerabilities_title_soup2.find_all(name='p')[1].string
        vulnerabilities_titles = vulnerabilities_titles1 + vulnerabilities_titles2
        vulnerabilities_titles = vulnerabilities_titles.replace(' ','').replace('	','').replace('
','').replace('
','')
    except:
        vulnerabilities_titles = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_titles)


    #漏洞公告
    vulnerabilities_notice_html = vulnerabilities_detainled_soup1.find('div',attrs={'class':'d_ldjj m_t_20'})    #定义 漏洞公告 块的soup
    vulnerabilities_notice_html = vulnerabilities_notice_html.encode().decode()
    vulnerabilities_notice_soup2 = BeautifulSoup(vulnerabilities_notice_html,'html.parser')
    
    try:
        vulnerabilities_notice1 = vulnerabilities_notice_soup2.find_all(name='p')[0].string
        vulnerabilities_notice2 = vulnerabilities_notice_soup2.find_all(name='p')[1].string
        vulnerabilities_notice = vulnerabilities_notice1+vulnerabilities_notice2
        vulnerabilities_notice = vulnerabilities_notice.replace('
','').replace('
','').replace('	','')
    except:
        vulnerabilities_notice = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_notice)


    #参考网址
    vulnerabilities_reference_html = vulnerabilities_detainled_soup1.find_all('div',attrs={'class':'d_ldjj m_t_20'})[1]    #定义 参考网址 块的soup
    vulnerabilities_reference_html = vulnerabilities_reference_html.encode().decode()
    vulnerabilities_reference_soup2 = BeautifulSoup(vulnerabilities_reference_html,'html.parser')

    try:
        vulnerabilities_reference = vulnerabilities_reference_soup2.find_all(name='p')[1].string
        vulnerabilities_reference = vulnerabilities_reference.replace('
','').replace('
','').replace('	','').replace('链接:','')
    except:
        vulnerabilities_reference = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_reference)
    

    #受影响实体
    vulnerabilities_effect_html = vulnerabilities_detainled_soup1.find_all('div',attrs={'class':'d_ldjj m_t_20'})[2]    #定义 受影响实体 块的soup
    vulnerabilities_effect_html = vulnerabilities_effect_html.encode().decode()
    vulnerabilities_effect_soup2 = BeautifulSoup(vulnerabilities_effect_html,'html.parser')
    try:
        vulnerabilities_effect = vulnerabilities_effect_soup2.find_all(name='p')[0].string
        vulnerabilities_effect = vulnerabilities_effect.replace('
','').replace('
','').replace('	','').replace(' ','')
    except:
        try:
            vulnerabilities_effect = vulnerabilities_effect_soup2.find_all(name='a')[0].string
            vulnerabilities_effect = vulnerabilities_effect.replace('
','').replace('
','').replace('	','').replace(' ','')        
        except:
            vulnerabilities_effect = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_effect)



    #补丁
    vulnerabilities_patch_html = vulnerabilities_detainled_soup1.find_all('div',attrs={'class':'d_ldjj m_t_20'})[3]    #定义 补丁 块的soup
    vulnerabilities_patch_html = vulnerabilities_patch_html.encode().decode()
    vulnerabilities_patch_soup2 = BeautifulSoup(vulnerabilities_patch_html,'html.parser')
    

    try:
        vulnerabilities_patch = vulnerabilities_patch_soup2.find_all(name='p')[0].string
        vulnerabilities_patch = vulnerabilities_patch.replace('
','').replace('
','').replace('	','').replace(' ','')
    except:
        vulnerabilities_patch = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_patch)
    for i in vulnerabilities_result_list_eve:
        vulnerabilities_result_list.append(i)
    print (re.findall(r'CNNVD-[s+S+]+',url)[0])
    

#漏洞信息写入excel
def vulnerabilities_excel(excel):
    workbook = xlsxwriter.Workbook('spider_cnnvd.xlsx')
    worksheet = workbook.add_worksheet()

    row = 0
    col = 0
    worksheet.write(row,0,'漏洞名称')
    worksheet.write(row,1,'CNNVD编号')
    worksheet.write(row,2,'危害等级')
    worksheet.write(row,3,'CVE编号')
    worksheet.write(row,4,'漏洞类型')
    worksheet.write(row,5,'发布时间')
    worksheet.write(row,6,'攻击途径')
    worksheet.write(row,7,'更新时间')
    worksheet.write(row,8,'厂商')
    worksheet.write(row,9,'漏洞来源')
    worksheet.write(row,10,'漏洞描述')
    worksheet.write(row,11,'解决方案')
    worksheet.write(row,12,'参考链接')
    worksheet.write(row,13,'受影响实体')
    worksheet.write(row,14,'补丁')

    row = 1
    n = 0
    while n < len(excel):
        worksheet.write(row,col,excel[n])
        worksheet.write(row,col+1,excel[n+1])
        worksheet.write(row,col+2,excel[n+2])
        worksheet.write(row,col+3,excel[n+3])
        worksheet.write(row,col+4,excel[n+4])
        worksheet.write(row,col+5,excel[n+5])
        worksheet.write(row,col+6,excel[n+6])
        worksheet.write(row,col+7,excel[n+7])
        worksheet.write(row,col+8,excel[n+8])
        worksheet.write(row,col+9,excel[n+9])
        worksheet.write(row,col+10,excel[n+10])
        worksheet.write(row,col+11,excel[n+11])
        worksheet.write(row,col+12,excel[n+12])
        worksheet.write(row,col+13,excel[n+13])
        worksheet.write(row,col+14,excel[n+14])
        row += 1
        n += 15
        
    workbook.close()def main():
agent_lists=[]
vulnerabilities_lists=[]
vulnerabilities_result_list = []
#获取代理 for i in range(1,2): url='http://www.xicidaili.com/nn/'+str(i) agent_list(url) #调用漏洞列表函数并获得漏洞链接列表 begin = datetime.datetime.now() page_count = sys.argv[3] j = 1 page_count = int(page_count) start_time = sys.argv[1] end_time = sys.argv[2] while j<=page_count: try: holes_url = 'http://cnnvd.org.cn/web/vulnerability/queryLds.tag?pageno=%d&repairLd='%j vulnerabilities_url_list(holes_url,start_time,end_time) print("已完成爬行第%d页"%j) print(' ') j+=1 except: print('爬取失败,更换代理重新爬取。') time.sleep(10) global bi bi = 0 pool = threadpool.ThreadPool(5) requests = threadpool.makeRequests(vulnerabilities_data,vulnerabilities_lists) [pool.putRequest(req) for req in requests] pool.wait() #漏洞信息写入excel vulnerabilities_excel(vulnerabilities_result_list) #爬行结束 end = datetime.datetime.now() total_time = end - begin print ('漏洞信息爬取结束') print ('爬行漏洞数量: ',len(vulnerabilities_lists)) print ('爬行时间: ',total_time) if __name__ == '__main__': main()
原文地址:https://www.cnblogs.com/kaiho/p/8056317.html