Python爬虫爬取ECCV Conference Papers(一)

从 ECVA 论文列表页(https://www.ecva.net/papers.php)爬取截至2020年的所有论文标题,并逐条打印。

代码:

 1 import re
 2 import requests
 3 from bs4 import BeautifulSoup
 4 import lxml
 5 import traceback
 6 import time
 7 import json
 8 from lxml import etree
 9 def get_paper():
10     #https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/267_ECCV_2020_paper.php
11     #https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/283_ECCV_2020_paper.php
12     #https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/343_ECCV_2020_paper.php
13     url='https://www.ecva.net/papers.php'
14     headers = {
15         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
16     }
17     response=requests.get(url,headers)
18     response.encoding='utf-8'
19     page_text=response.text
20     #输出页面html
21     # print(page_text)
22     soup = BeautifulSoup(page_text,'lxml')
23     all_dt=soup.find_all('dt',class_='ptitle')
24     for dt in all_dt:
25         single_dt=str(dt)
26         single_soup=BeautifulSoup(single_dt,'lxml')
27         title=single_soup.find('a').text
28         print(title)
29     return
# Run the scraper only when this file is executed directly as a script.
if __name__ == "__main__":
    get_paper()

 

 

原文地址:https://www.cnblogs.com/rainbow-1/p/14815110.html