软工超越日报-爬取顶会热词 5/3

今天来完成个人作业:爬取计算机视觉顶级会议 ICCV 2019 的论文标题和链接,存入 MySQL 数据库,并统计标题中的热词。

代码如下:

import operator
from nltk.corpus import stopwords
import pymysql as pymysql
from selenium import webdriver
from lxml import etree

# Rows destined for the database: one [title, url, year] list per paper.
data_s = []

# Day pages of the ICCV 2019 open-access listing to scrape.
day_urls = [
    "https://openaccess.thecvf.com/ICCV2019?day=2019-10-29",
    "https://openaccess.thecvf.com/ICCV2019?day=2019-10-30",
]
base_url = 'https://openaccess.thecvf.com/'

# Every scraped title, in page order; joined into `fen_ci` below.
titles = []

# Launch Chrome through webdriver to fetch the pages.
driver = webdriver.Chrome()
try:
    for day_url in day_urls:
        driver.get(day_url)
        html = driver.page_source

        # XPath queries (not regular expressions): the first <a> child of
        # each <dd> supplies the link, each <dt>/<a> supplies the title text.
        htmlc = etree.HTML(html)
        indexs = htmlc.xpath('//dl/dd/a[1]/@href')
        title = htmlc.xpath('//dl/dt/a/text()')
        print(len(title))

        # zip() pairs titles with links and stops at the shorter list, so a
        # length mismatch between the two queries cannot raise IndexError.
        for paper_title, href in zip(title, indexs):
            titles.append(paper_title)
            data_s.append([paper_title, base_url + href, 2019])
finally:
    # Always shut the browser down, even if a page fails to load or parse.
    driver.quit()

# Join with a space so the last word of one title does not fuse with the
# first word of the next when the text is later split into words.
fen_ci = " ".join(titles)



# Connect to the database.
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a config file that is kept out of version control.
conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", password="260702266", database="topclass", charset="utf8")
cursor = conn.cursor()

print(data_s)
# Number of rows reported as affected by the bulk insert (0 on failure).
count = 0
sql="insert into iccv(title,links,year) values (%s,%s,%s)"
try:
    # executemany returns the affected-row count, or None when data_s is empty.
    count = cursor.executemany(sql, data_s) or 0
    conn.commit()
except pymysql.MySQLError as err:
    # Best effort: undo the partial insert and report why it failed instead
    # of swallowing the error silently.
    conn.rollback()
    count = 0
    print("insert into iccv failed:", err)
finally:
    # Release the cursor and the connection whether or not the insert worked.
    cursor.close()
    conn.close()

# Word-frequency count over the lower-cased, whitespace-separated title text.
dic = {}
fen_cil = fen_ci.lower().split()
for word in fen_cil:
    # A missing word starts from 0, so first sight yields a count of 1.
    dic[word] = dic.get(word, 0) + 1

# (word, count) pairs ordered by count, highest first; the stable sort keeps
# equally frequent words in first-seen order.
swd = list(dic.items())
swd.sort(key=operator.itemgetter(1), reverse=True)
# print(swd)

实际效果:

原文地址:https://www.cnblogs.com/Sakuraba/p/14910415.html