Scraping papers with Python

A Python script that scrapes paper information (here, the CVPR 2018 open-access listing) and stores it in MySQL:

import requests
import pymysql
from jieba.analyse import extract_tags
from lxml import etree
import time

# Database connection (credentials as in the original post)
db = pymysql.connect(host="localhost", user="root", passwd="0424wyhhxx", database="test", charset='utf8')
cursor = db.cursor()


# Define the spider class
class Spider:
    def __init__(self):
        self.url = 'https://openaccess.thecvf.com/CVPR2018?day=2018-06-19'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 '
                          'Safari/537.36 '
        }
        r = requests.get(self.url, headers=self.headers)
        r.encoding = r.apparent_encoding
        self.html = r.text

    def lxml_find(self):
        '''Parse the listing page with lxml.'''
        number = 1
        start = time.time()  # timing start (the original post compared the speed of three parsers)
        selector = etree.HTML(self.html)  # build an lxml tree from the page source
        titles = selector.xpath('//dt[@class="ptitle"]/a/@href')  # list of relative paper links
        for each in titles[200:]:  # resume from the 201st entry
            title0 = each.strip()  # strip surrounding whitespace
            # the href starts with "content_cvpr_2018" (17 characters), so slice it off
            chaolianjie = "https://openaccess.thecvf.com/content_cvpr_2018" + title0[17:]
            req = requests.get(chaolianjie, headers=self.headers)
            req.encoding = req.apparent_encoding
            selector1 = etree.HTML(req.text)
            title = selector1.xpath('//div[@id="papertitle"]/text()')
            abst = selector1.xpath('//div[@id="abstract"]/text()')
            hre0 = selector1.xpath('//a/@href')
            # the sixth link on a paper page is the PDF; drop the leading "../.."
            # (positional indexing is fragile if the page layout ever changes)
            hre = "https://openaccess.thecvf.com" + hre0[5][5:]
            author = selector1.xpath('//dd/div[@id="authors"]/b/i/text()')
            # extract the top TF-IDF keyword from the abstract
            keyword = ''
            for keyword, weight in extract_tags(abst[0].strip(), topK=1, withWeight=True):
                print('%s %s' % (keyword, weight))
            # unwrap the xpath result lists before passing them as SQL parameters
            va = [title[0].strip(), hre, abst[0].strip(), author[0].strip(),
                  "2018-06-19", keyword]
            sql = "insert into cvpr1 (title,link,abstract,author,time,keyword) values (%s,%s,%s,%s,%s,%s)"
            cursor.execute(sql, va)
            db.commit()

            print("Crawled " + str(number) + " records")
            number = number + 1

        end = time.time()
        print('Total time:', end - start)


if __name__ == '__main__':
    spider = Spider()
    spider.lxml_find()
    cursor.close()
    db.close()
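The script assumes a `cvpr1` table already exists in the `test` database. A minimal sketch that creates a compatible table (the column types and lengths are my assumption, not taken from the original post):

import pymysql

# Assumed schema for the cvpr1 table used by the spider above;
# the column types are a guess, adjust as needed.
db = pymysql.connect(host="localhost", user="root", passwd="0424wyhhxx",
                     database="test", charset='utf8')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS cvpr1 (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(512),
        link VARCHAR(512),
        abstract TEXT,
        author VARCHAR(512),
        time VARCHAR(32),
        keyword VARCHAR(128)
    ) CHARACTER SET utf8
""")
db.commit()
cursor.close()
db.close()

Running this once before the spider avoids a missing-table error on the first insert.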

Original post: https://www.cnblogs.com/znjy/p/14884125.html