XML scraping (Tenable plugin feed)

# Scrape the Tenable plugin XML feed (sorted by last update) and store any
# plugin entries not already present in the MongoDB collection
# "CrawlDataForIDbyNessus". Dedupe key: the plugin's nessus_id.
url_str = 'https://www.tenable.com/plugins/feeds?sort=updated'
# timeout so a stalled connection cannot hang the crawler forever
response = requests.get(url_str, timeout=30)
soup = BeautifulSoup(response.text, 'xml')

# Pre-compiled pattern that strips XML/HTML tags from a stringified fragment.
pattern = re.compile(r'<[^>]+>', re.S)

for soup_item in soup.find_all('item'):
    name = pattern.sub('', str(soup_item.find("title")))
    link = pattern.sub('', str(soup_item.find("link")))
    # The plugin id is the last path segment of the link URL.
    nessus_id = link.split("/")[-1]

    # <description> carries embedded HTML; parse it again to pull out the
    # three <span> sections: synopsis, description, solution.
    total_description = soup_item.find("description")
    desc_soup = BeautifulSoup(total_description.text, "lxml")
    spans = desc_soup.find_all("span")
    if len(spans) < 3:
        # Entry does not have the expected structure — skip instead of
        # crashing on an IndexError.
        continue

    up_dic = {
        "name": name,
        "nessus_id": nessus_id,
        "synopsis": pattern.sub('', str(spans[0])),
        "description": pattern.sub('', str(spans[1])),
        "solution": pattern.sub('', str(spans[2])),
    }

    # Insert only records we have not seen before (dedupe on nessus_id).
    ne_item = mdb.get_one("CrawlDataForIDbyNessus", {"nessus_id": nessus_id})
    if not ne_item:
        mdb.add("CrawlDataForIDbyNessus", up_dic)

  

Original article: https://www.cnblogs.com/weidaijie/p/14097431.html