# Top-conference hot-word crawler code (顶会热词爬取代码)

import pymysql
import requests
from lxml import etree


class Spider:
    """Crawl CVPR 2019 paper pages from openaccess.thecvf.com into MySQL.

    The spider reads the listing page at ``self.url``, collects the link of
    every paper detail page, then stores each paper's title, year, download
    link and abstract in the ``paper`` table of the ``words`` database.
    """

    # Site root used to turn the relative links found in pages into absolute URLs.
    BASE_URL = "http://openaccess.thecvf.com/"
    # Publication year written with every row; matches the CVPR2019 listing URL.
    YEAR = 2019
    # Seconds to wait on a single HTTP request so a stalled server cannot
    # hang the whole crawl.
    REQUEST_TIMEOUT = 30
    # Driver-side placeholders (%s) let pymysql escape the scraped text, so
    # quotes inside a title or abstract can neither break nor alter the statement.
    INSERT_SQL = (
        "insert into paper(title,year,link,abstract,keywords) "
        "values(%s,%s,%s,%s,%s)"
    )

    def __init__(self):
        """Set up the listing URL, request headers and the database connection."""
        self.url = "http://openaccess.thecvf.com/CVPR2019?day=2019-06-18"
        self.header = {
            "user-agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Mobile Safari/537.36"}
        # NOTE(review): database credentials are hard-coded here; consider
        # loading them from configuration or the environment instead.
        self.db = pymysql.connect(host='localhost', port=3306, user='root', passwd='yangjiang', db='words',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        # Absolute URLs of the paper detail pages, filled by getHtmlList().
        self.html_list = []

    @staticmethod
    def _first(nodes, description):
        """Return the first XPath match in *nodes*.

        Raises:
            ValueError: if *nodes* is empty; the message contains
                *description* so the log says which element was missing.
        """
        if not nodes:
            raise ValueError("page has no {}".format(description))
        return nodes[0]

    def getHtmlList(self):
        """Collect the absolute URL of every paper page linked from the listing.

        The URLs are appended to ``self.html_list``.
        """
        response = requests.get(self.url, headers=self.header,
                                timeout=self.REQUEST_TIMEOUT)
        listing = etree.HTML(response.text)
        hrefs = listing.xpath("//dt[@class='ptitle']/a/@href")
        self.html_list.extend(self.BASE_URL + href for href in hrefs)

    def getContent(self, url):
        """Fetch one paper page and insert its metadata into the ``paper`` table.

        This is best-effort: any failure (network error, HTTP error status,
        missing page element, database error) is printed and the paper is
        skipped so the rest of the crawl can continue.

        Args:
            url: absolute URL of a paper detail page.
        """
        try:
            response = requests.get(url, headers=self.header,
                                    timeout=self.REQUEST_TIMEOUT)
            # Fail early with a clear message instead of parsing an error page.
            response.raise_for_status()
            body = etree.HTML(response.text)
            title = self._first(
                body.xpath("//div[@id='papertitle']/text()"),
                "paper title ({})".format(url))
            abstract = self._first(
                body.xpath("//div[@id='abstract']/text()"),
                "abstract ({})".format(url))
            down_url = self._first(
                body.xpath("//div[@id='content']//a/@href"),
                "download link ({})".format(url)).replace("../../", self.BASE_URL)
            # The keywords column receives the title as well.
            self.cursor.execute(
                self.INSERT_SQL,
                (title, self.YEAR, down_url, abstract, title))
            self.db.commit()
            # Report success only once the row is actually committed.
            print(title + "插入成功!")
        except Exception as e:
            print(e)

    def run(self):
        """Gather all paper page URLs, then scrape and store each one."""
        self.getHtmlList()
        for url in self.html_list:
            self.getContent(url)

    def close(self):
        """Release the database cursor and connection."""
        self.cursor.close()
        self.db.close()


if __name__ == "__main__":
    # Script entry point: build the spider (which opens the database
    # connection) and run the full crawl.
    crawler = Spider()
    crawler.run()
# Original article (原文地址): https://www.cnblogs.com/yongyuandishen/p/14908651.html