Python爬取CVPR2018论文

摘要:爬取CVPR2018论文的标题、摘要、关键字、文章链接。

一、数据库建表

 最好加id(自增主键),方便管理,注意abstract(摘要)的类型为text。(原因:varchar的长度通常设为255,论文摘要较长,可能超出长度限制而被截断或导致插入失败)

二、代码部分

import requests
from bs4 import BeautifulSoup
import pymysql

# Browser-like User-Agent header sent with every request made by this script.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
url = 'http://openaccess.thecvf.com/CVPR2018.py'
# Time out instead of hanging forever, and fail loudly on an HTTP error
# rather than parsing an error page as if it were the paper index.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
content = r.content.decode('utf-8')
soup = BeautifulSoup(content, 'html.parser')
# One <dt class="ptitle"> element per paper on the index page.
dts = soup.find_all('dt', class_='ptitle')
# Base URL that relative hrefs scraped from the pages are appended to.
hts = 'http://openaccess.thecvf.com/'
# Crawl each paper's detail page and collect one
# (title, abstract, link, keywords) row per paper.
alllist = []
for i, dt in enumerate(dts):
    print('这是第' + str(i))
    title = dt.a.text.strip()
    href = hts + dt.a['href']
    # A timeout keeps a single stalled detail page from hanging the whole crawl;
    # raise_for_status() stops us from scraping an HTTP error page.
    paper_resp = requests.get(href, headers=headers, timeout=30)
    paper_resp.raise_for_status()
    paper_soup = BeautifulSoup(paper_resp.content.decode('utf-8'), 'html.parser')
    divabstract = paper_soup.find(name='div', attrs={"id": "abstract"})
    # Store an empty abstract instead of crashing with AttributeError when
    # the page has no <div id="abstract">.
    abstract = divabstract.text.strip() if divabstract is not None else ''
    # NOTE(review): assumes the fifth <a> on the detail page is the paper link
    # and that its href starts with a 6-character relative prefix (presumably
    # "../../") that must be stripped — confirm against the page layout.
    alllink = paper_soup.select('a')
    link = hts + alllink[4]['href'][6:]
    # The "keywords" are simply the title's space-separated words, comma-joined.
    keywords = ','.join(title.split(' '))
    value = (title, abstract, link, keywords)
    alllist.append(value)
print(alllist)
# Tuple of row tuples, used later as the parameter sequence for the bulk insert.
tuplist = tuple(alllist)
# Persist the collected rows into the MySQL table `cvpr`.
# PyMySQL 1.0+ accepts only keyword arguments in connect(), so every
# connection parameter is named explicitly.
db = pymysql.connect(host="localhost", user="root", password="0000",
                     database="cvpr", charset='utf8')
try:
    cursor = db.cursor()
    # NOTE(review): an INSERT without a column list must supply a value for
    # every column; if the table also has an id column, name the four target
    # columns explicitly — confirm against the actual schema.
    sql_cvpr = "INSERT INTO cvpr values (%s,%s,%s,%s)"
    try:
        cursor.executemany(sql_cvpr, tuplist)
        db.commit()
    except pymysql.MySQLError as exc:
        # Report the underlying database error instead of hiding it, then
        # undo the partially applied batch.
        print('执行失败,进入回调3', exc)
        db.rollback()
finally:
    # Always release the connection, even if something unexpected is raised.
    db.close()
cvpr

三、爬取结果

原文地址:https://www.cnblogs.com/dd110343/p/12832779.html