从网页上抓取Windows补丁信息然后整理输出(Python)

PowerShell实现:http://www.cnblogs.com/IvanChen/p/4488246.html

今天通过Python实现:

# coding=utf-8
import re
import requests
import csv
import sys
from lxml import etree

# Python 2 only: make implicit str/unicode conversions use UTF-8 so that
# non-ASCII text scraped from the pages can be handed to the csv module.
# `reload` is not a builtin on Python 3 and `sys.setdefaultencoding` does not
# exist there (the default encoding is already UTF-8), so the hack is skipped
# instead of crashing with a NameError.
if sys.version_info[0] < 3:
    reload(sys)  # brings back sys.setdefaultencoding, removed at startup
    sys.setdefaultencoding('utf8')

# Fetch the security bulletin summary page and collect the link texts found
# in the second column of its main table. Each entry of `mslist` is later
# appended to the base URL to address one bulletin page.
summaryurl = 'https://technet.microsoft.com/en-us/library/security/mt637763.aspx'
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops early on an HTTP error instead of parsing an error
# page and silently ending up with an empty bulletin list.
summaryresponse = requests.get(summaryurl, timeout=30)
summaryresponse.raise_for_status()
summarycontent = summaryresponse.content
selector = etree.HTML(summarycontent)
mslist = selector.xpath('//*[@id="mainBody"]/table/tr/td[2]/p/a/text()')

# Pre-compiled patterns used to pull individual fields out of the text scraped
# from each bulletin page.

# A word, whitespace, digits, a comma, whitespace, digits -- i.e. text shaped
# like "May 10, 2016". The `\s` escapes are required: a bare `s` would only
# match the literal letter "s".
pattern_published_date = re.compile(r"[a-zA-Z]*?\s[0-9]*?,\s[0-9]*")
# Trailing run of letters at the very end of the string.
pattern_severity = re.compile(r"[a-zA-Z]*$")
# First run of decimal digits. `\d+` is required: a bare `d+` would match the
# letter "d" instead of a number.
pattern_kbnumber = re.compile(r"\d+")
# Known vulnerability categories, matched case-insensitively.
pattern_vultype = re.compile(
    r"Information Disclosure|Remote Code Execution|Elevation of Privilege"
    r"|Security Feature Bypass|Cumulative Security Update|Denial of Service"
    r"|Tampering|Spoofing",
    re.I,
)

# Write one CSV row per entry of `mslist` into eggs.csv.
#
# The csv module wants a binary-mode file on Python 2 and a text-mode file
# opened with newline='' on Python 3. `open` replaces the Python 2-only `file`
# builtin so the same code works on both.
if sys.version_info[0] < 3:
    csvfile = open('eggs.csv', 'wb')
else:
    csvfile = open('eggs.csv', 'w', newline='', encoding='utf-8')

# The with-block closes the file even when a request or a parse step raises,
# so rows written so far are flushed to disk.
with csvfile:
    writer = csv.writer(csvfile, dialect="excel")
    writer.writerow(['Date', 'MSRC', 'KB', 'Severity', 'Version', 'Summary', 'Type'])

    for eachmsrc in mslist:
        msrcurl = "https://technet.microsoft.com/en-us/library/security/" + eachmsrc + ".aspx"
        # Bound the wait and fail with a clear HTTP error rather than an
        # IndexError further down when the page could not be retrieved.
        msrc_response = requests.get(msrcurl, timeout=30)
        msrc_response.raise_for_status()
        msrc_content = msrc_response.content
        msrc_selector = etree.HTML(msrc_content)

        # Raw text nodes for each field of the bulletin page.
        published_date = msrc_selector.xpath('//*[@id="pubInfo"]/p[1]/text()')
        kbnumber = msrc_selector.xpath('//*[@id="mainBody"]/h2/text()')
        severity = msrc_selector.xpath('//*[@id="content"]/div[2]/h1/text()')
        version = msrc_selector.xpath('//*[@id="pubInfo"]/p[2]/text()')
        summary = msrc_selector.xpath('//*[@id="mainBody"]/div[3]/div/p[1]/text()')
        vultype = msrc_selector.xpath('string(//*[@id="mainBody"]/div[3]/div)')

        ft_published_date = pattern_published_date.search(published_date[0]).group()
        ft_kbnumber = pattern_kbnumber.search(kbnumber[0]).group()
        # Trim surrounding CR/LF/space so the `$`-anchored severity pattern
        # sees the last word of the heading rather than trailing whitespace.
        ft_severity = pattern_severity.search(severity[0].strip('\r\n ')).group()
        ft_version = version[1]
        ft_summary = summary[0]

        # Look for the vulnerability type in the summary section first and
        # fall back to the tables of the later sections.
        ft_vultype = pattern_vultype.search(vultype)
        if not ft_vultype:
            vultype = msrc_selector.xpath('string(//*[@id="mainBody"]/div[position()>3]/div/table)')
            ft_vultype = pattern_vultype.search(vultype)

        # The fallback can miss as well; write an empty Type cell instead of
        # crashing on `None.group()`.
        writer.writerow([
            ft_published_date,
            eachmsrc,
            ft_kbnumber,
            ft_severity,
            ft_version,
            ft_summary,
            ft_vultype.group() if ft_vultype else '',
        ])
原文地址:https://www.cnblogs.com/IvanChen/p/5495796.html