大二下学期第二次个人作业第一阶段

今日爬取了剩下的论文:ICCV 和 ECVA。ICCV 与 CVPR 在同一个网页上,所以爬取、解析的方式与之前一样;今天主要是 ECVA 的爬取以及解析。

def get_tencent_data_ECVA(skip=400):
    """Crawl ECVA paper pages and insert one row per paper into ``paper_data``.

    The index page ``https://www.ecva.net/papers.php`` is fetched first and
    every ``<dt class="ptitle">`` entry is turned into a paper URL.  Each
    paper page is then fetched, its title / authors / abstract are read from
    the ``div`` elements with ids ``papertitle``, ``authors`` and
    ``abstract``, and the row is inserted and committed immediately.

    Args:
        skip: Number of leading papers (in index-page order) to skip because
            they were already inserted by an earlier, interrupted run.
            Defaults to 400, the value that was previously hard-coded.

    Processing stops at the first failure: the error is printed together
    with the running count so the crawl can be resumed from that position.
    The database connection is always closed before returning.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urljoin

    url_ECCV = 'https://www.ecva.net/papers.php'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    }
    # Without a timeout a stalled connection would block the crawl forever.
    timeout = 30

    res = requests.get(url_ECCV, headers=headers, timeout=timeout)
    # Name the parser explicitly so results do not depend on what is installed.
    soup = BeautifulSoup(res.text, "html.parser")

    # Collect the absolute URL of every paper listed on the index page.
    url_paper = []
    for entry in soup.find_all("dt", {"class": "ptitle"}):
        anchors = entry.select("a")
        if anchors and anchors[0].get("href"):
            url_paper.append(urljoin(url_ECCV, anchors[0]["href"]))

    sql = "insert into paper_data (title,authors,abstract,keyworld,url,yeardata,meet) values(%s,%s,%s,%s,%s,%s,%s)"
    # Running count of rows in the table, starting from the resume point.
    num = skip

    conn, cursor = get_conn()
    try:
        print("开始插入数据")
        for position, url in enumerate(url_paper, start=1):
            # Papers inserted by a previous run do not need to be parsed again.
            if position <= skip:
                continue

            res = requests.get(url, headers=headers, timeout=timeout)
            soup = BeautifulSoup(res.text, "html.parser")
            title = soup.find_all("div", {"id": "papertitle"})[0].text
            authors = soup.find_all("div", {"id": "authors"})[0].text.replace("\n", "")
            abstract = soup.find_all("div", {"id": "abstract"})[0].text
            keyworld = replace(title)

            # The year is taken from the path segment naming the proceedings.
            if "eccv_2020" in url.split("/"):
                yeardata = 2020
            else:
                yeardata = 2018
            meet = "ECVA"

            print(url)
            cursor.execute(sql, [title, authors, abstract, keyworld, url, yeardata, meet])
            num = num + 1
            # Commit per row so an interruption loses at most the current paper.
            conn.commit()
            print("数据插入成功", num)
    except Exception:
        print("插入失败!", num)
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
原文地址:https://www.cnblogs.com/fengchuiguobanxia/p/14730481.html