2021-06-01

Top-conference hot-word crawler:

Two near-identical scripts follow: the first scrapes the CVPR 2021 open-access listing, the second WACV 2021. Each stores a paper's title, link, authors, abstract, year, and jieba-extracted keywords in MySQL.

# -*- coding: utf-8 -*-
# @Time : 2021/6/12 23:34
# @Author : 伏珊瑞
# @File : test2021.py
# @Software : PyCharm
import urllib.request  # import the submodule explicitly; "import urllib" alone does not expose urllib.request
import re

import jieba.analyse
import pymysql
from bs4 import BeautifulSoup
conn = pymysql.connect(host='localhost', user="root", passwd="123456", database="paper")
# get a cursor
cursor = conn.cursor()
Sql = "insert into pypaper1(name,herf,writer,Abstract,time,keywords) values(%s,%s,%s,%s,%s,%s)"
def main():  # entry point, called at the bottom of the file
    html = askURL("https://openaccess.thecvf.com/CVPR2021?day=all")
    getherf(html)
    conn.commit()
    cursor.close()
    conn.close()
def getAbstract(url):  # scrape one paper's abstract
    html = askURL(url)  # fetch the page's HTML as text
    data = ""
    bs = BeautifulSoup(html, "html.parser")
    # regex for the abstract; the blog rendering stripped the HTML tags out of this
    # pattern, so the div markup here is reconstructed
    findlink_herf = re.compile(r'<div id="abstract">\n(.*?)\n</div>')
    a = bs.find_all(id="abstract")
    for item in a:
        item = str(item)
        data = re.findall(findlink_herf, item)[0]
    return data
def getherf(html):  # scrape every paper's fields and write them to the database
    bs = BeautifulSoup(html, "html.parser")
    a = bs.find_all(class_="ptitle")
    b = bs.find_all("dd")  # each paper's entry on the page is a run of dd blocks
    findlink_herf = re.compile(r'<a href="(.*?)">')  # regex for the paper's link
    findlink_name = re.compile(r'<a href="(.*?)">(.*?)</a></dt>')  # regex for the paper's title
    findlink_writer = re.compile(r'">(.*?)</a>')  # regex for the authors
    TEMP = 1  # the dd blocks interleave authors with pdf links, so authors appear at every second index
    inta = 1  # progress counter
    for item in a:
        try:
            item = str(item)
            name = str(b[TEMP])
            link_href = re.findall(findlink_herf, item)[0]
            link_name = re.findall(findlink_name, item)[0]
            writer = re.findall(findlink_writer, name)
            link_writer = ""
            for s in writer:
                link_writer += s + "+"
            link_Abstract = getAbstract("https://openaccess.thecvf.com/" + link_href)  # the listing's links are relative, so prepend the site root
            keywords = ""
            p = 0
            for word in jieba.analyse.extract_tags(link_Abstract):  # join the extracted keywords into one string
                keywords += word + "+"
                p = p + 1
                if p == 5:  # keep at most five keywords per paper
                    break
            insert = cursor.execute(Sql, (link_name[1], link_href, link_writer, link_Abstract, "2021", keywords))  # write one row
            TEMP += 2
            print(inta)
            inta += 1
        except Exception as e:  # reprinting the half-built fields here could itself raise NameError, so report the error instead
            print(e)
def askURL(url):  # fetch a page's HTML as text
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=header)
    response = urllib.request.urlopen(request)
    html = response.read().decode("UTF-8")
    return html
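# A regex-free alternative for getAbstract: once find_all(id="abstract") has the div,
# BeautifulSoup can return its text directly instead of regex-matching str(item).
# A minimal sketch; getAbstract_bs is my name for it, not part of the original script.
def getAbstract_bs(url):
    bs = BeautifulSoup(askURL(url), "html.parser")
    div = bs.find(id="abstract")
    return div.get_text(strip=True) if div else ""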
main()  # run the crawler
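Both scripts assume their target table already exists in the paper database. The post never shows the DDL, so the column types below are my assumption; a minimal sketch matching the two INSERT statements:

import pymysql

# column names (including the "herf" spelling) match the INSERT statements;
# the types are guesses, since the original post does not include the schema
conn = pymysql.connect(host='localhost', user="root", passwd="123456", database="paper")
cursor = conn.cursor()
for table in ("pypaper1", "pypaper"):
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS " + table + " ("
        "name VARCHAR(512), herf VARCHAR(512), writer TEXT, "
        "Abstract TEXT, time VARCHAR(16), keywords VARCHAR(512))"
    )
conn.commit()
conn.close()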
The WACV2021 variant follows. It is the same scraper pointed at a different listing: it writes to table pypaper instead of pypaper1 and does not cap the keyword list at five.

# -*- coding: utf-8 -*-
# @Time : 2021/6/12 23:34
# @Author : 伏珊瑞
# @File : test2021.py
# @Software : PyCharm
import urllib.request
import re

import jieba.analyse
import pymysql
from bs4 import BeautifulSoup
conn = pymysql.connect(host='localhost', user="root", passwd="123456", database="paper")
# get a cursor
cursor = conn.cursor()
Sql = "insert into pypaper(name,herf,writer,Abstract,time,keywords) values(%s,%s,%s,%s,%s,%s)"
def main():
    html = askURL("https://openaccess.thecvf.com/WACV2021")
    getherf(html)
    conn.commit()
    cursor.close()
    conn.close()
    # getAbstract("https://openaccess.thecvf.com/content_WACV_2020/html/Sang_Inferring_Super-Resolution_Depth_from_a_Moving_Light-Source_Enhanced_RGB-D_Sensor_WACV_2020_paper.html")
def getAbstract(url):
    html = askURL(url)
    data = ""
    bs = BeautifulSoup(html, "html.parser")
    # reconstructed as in the CVPR version: the blog stripped the div markup from this pattern
    findlink_herf = re.compile(r'<div id="abstract">\n(.*?)\n</div>')
    a = bs.find_all(id="abstract")
    for item in a:
        item = str(item)
        data = re.findall(findlink_herf, item)[0]
    return data
def getherf(html):
    bs = BeautifulSoup(html, "html.parser")
    a = bs.find_all(class_="ptitle")
    b = bs.find_all("dd")
    findlink_herf = re.compile(r'<a href="(.*?)">')
    findlink_name = re.compile(r'<a href="(.*?)">(.*?)</a></dt>')
    findlink_writer = re.compile(r'">(.*?)</a>')
    TEMP = 0
    inta = 1
    for item in a:
        try:
            item = str(item)
            name = str(b[TEMP])
            link_href = re.findall(findlink_herf, item)[0]
            link_name = re.findall(findlink_name, item)[0]
            writer = re.findall(findlink_writer, name)
            link_writer = ""
            for s in writer:
                link_writer += s + "+"
            link_Abstract = getAbstract("https://openaccess.thecvf.com/" + link_href)
            keywords = ""
            for word in jieba.analyse.extract_tags(link_Abstract):
                keywords += word + "+"
            insert = cursor.execute(Sql, (link_name[1], link_href, link_writer, link_Abstract, "2021", keywords))
            TEMP += 2
            print(inta)
            inta += 1
        except Exception as e:  # printing the half-built fields here could raise NameError, so report the error instead
            print(e)
def askURL(url):
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=header)
    response = urllib.request.urlopen(request)
    html = response.read().decode("UTF-8")
    return html
main()
# def getwriter(html):
#     bs = BeautifulSoup(html, "html.parser")
#     a = bs.find_all("dd")
#     findlink = re.compile(r'">(.*?)</a>')
#     TEMP=2;
#     for item in a:
#         if(TEMP%2==0):
#             item = str(item)
#             link = re.findall(findlink, item)
#             print(link)
#         TEMP+=1
# def getname(html):
#     bs = BeautifulSoup(html, "html.parser")
#     a = bs.find_all(class_="ptitle", )
#     findlink = re.compile(r'<a href="(.*?)">(.*?)</a></dt>')
#     list = bs.find_all(re.compile("dt"))
#     for item in a:
#         item = str(item)
#         link = re.findall(findlink, item)[0]
#         print(link[1])
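The "hot word" part of the title comes from the keywords column: each row stores jieba-extracted terms joined by "+". A minimal sketch of how those rows could be aggregated into conference-wide hot words; the counting step is my addition, not part of the original post:

from collections import Counter

import pymysql

conn = pymysql.connect(host='localhost', user="root", passwd="123456", database="paper")
cursor = conn.cursor()
cursor.execute("SELECT keywords FROM pypaper1")  # or pypaper for the WACV run
counter = Counter()
for (keywords,) in cursor.fetchall():
    # rows look like "word1+word2+...+": split on "+" and drop the empty tail
    counter.update(w for w in keywords.split("+") if w)
for word, count in counter.most_common(10):
    print(word, count)
conn.close()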
Original post: https://www.cnblogs.com/fuxw4971/p/14913378.html