爬虫之selenium

概述

selenium模块的作用是通过编写代码模拟人工对浏览器的操作,触发相关事件,从而获取网页信息。相对于使用requests模块,selenium模块对动态数据的爬取更为方便

安装selenium:pip install selenium -i https://pypi.douban.com/simple

使用:

1:使用内置的webdriver模块实例化一个浏览器对象diver(实例化浏览器对象时,需要传入一个浏览器驱动的路径),如实例化一个谷歌浏览器对象:diver = webdriver.Chrome(r'./chromedriver.exe')

2:使用实例化的对象中的方法进行模拟人工操作浏览器

常用内置方法:

打开网页:diver.get("要访问的url")

查询标签:diver.find_element_by_id("id值") # 根据标签id查找,可以修改为class,tagname等值,与js查找标签类似,找到标签返回一个obj对象

　　obj对象的方法:　 obj.click() (点击)

　　　　　　　　　　obj.send_keys(输入)

获取网页源码:diver.page_source

关闭浏览器:diver.close()(只关闭当前窗口)/diver.quit()(关闭所有窗口并结束驱动进程)

执行js代码:diver.execute_script("js代码") # 如将window.scrollTo(0,document.body.scrollHeight)作为js代码传入,浏览器会执行滚动到页面底部的动作

截图:diver.save_screenshot("图片保存路径和文件名")

前进:diver.forward()

后退:diver.back()

切换到iframe标签:diver.switch_to.frame("iframe标签")

实现鼠标按住不松手:先实例化动作链对象action = ActionChains(diver) # ActionChains从selenium.webdriver中导入

　　　　　　　　　　然后保持不松开:action.click_and_hold(标签对象) # 在标签对象上按下鼠标左键且不松开

　　　　　　　　　　移动标签:action.move_by_offset(x,y) #移动标签对象，如果是移动到另一个标签里,可以使用action.drag_and_drop(被移动标签对象, 目标标签对象)

　　　　　　　　　　执行上述动作:action.perform()

获取cookies值:diver.get_cookies()

使用无头浏览器

通过添加参数可以让selenium操作浏览器在后台运行,不会有界面显示

# Build an options object that makes Chrome start without a visible window.
# (Options is imported from selenium.webdriver.chrome.options.)
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Create the Chrome browser object with the headless options attached.
# The options are passed through the `options=` keyword: `chrome_options=` is
# the deprecated spelling of the same parameter and only triggers a warning.
diver = webdriver.Chrome(executable_path=r'./chromedriver.exe', options=chrome_options)

规避检测

#通过添加参数可以降低被网站服务器检测为自动化程序的风险

# Build an options object and exclude the "enable-automation" switch, which
# lowers the risk of the site flagging the browser as an automated program.
options = ChromeOptions()
options.add_experimental_option(
    'excludeSwitches',
    ['enable-automation'],
)

# Start Chrome with those options applied.
bro = webdriver.Chrome(
    executable_path='./chromedriver.exe',
    options=options,
)

示例

# 使用selenium爬取网易新闻里面["国内", "国际", "军事", "航空"]四个版块里面的新闻数据

 1 # 使用selenium爬取网易新闻里面["国内", "国际", "军事", "航空"]四个版块里面的新闻数据
 2 import requests,random
 3 from selenium import webdriver
 4 from selenium.webdriver import ChromeOptions
 5 from selenium.webdriver.chrome.options import Options
 6 from lxml import etree
 7 from multiprocessing.dummy import Pool
 8 
 9 # 设置不打开浏览器查询
10 chrome_options = Options()
11 chrome_options.add_argument("--headless")
12 chrome_options.add_argument("--disable-gpu")
13 
14 # 规避脚本检测
15 options = ChromeOptions()
16 options.add_experimental_option('excludeSwitches', ['enable-automation'])
17 # 生成谷歌浏览器对象
18 diver = webdriver.Chrome('chromedriver.exe',chrome_options=chrome_options,options=options)
19 # 链接目标url
20 diver.get("https://news.163.com")
21 # 获取网页代码
22 response_text = diver.page_source
23 
24 #使用lxml解析源代码
25 tree = etree.HTML(response_text)
26 guonei_url = tree.xpath('//li[@class="menu_guonei"]/a/@href')[0]
27 guoji_url = tree.xpath('//li[@class="menu_guoji"]/a/@href')[0]
28 war_url = tree.xpath('//li[@class="menu_war"]/a/@href')[0]
29 hangkong_url = tree.xpath('//li[@class="menu_hangkong"]/a/@href')[0]
30 diver.close()
31 
32 
def get_new(url):
    """Scroll a section page to the bottom and return its full HTML source.

    Opens ``url`` in a new Chrome instance, then keeps scrolling to the page
    bottom while the "load_more_tip" element's style is 'display: none;'.
    Whenever the "post_addmore" element's style is 'visibility: visible;' it
    is clicked. After the loop one final scroll is issued and the page source
    is returned.

    Args:
        url: Address of the section page to load.

    Returns:
        The page source as a string.
    """
    new_diver = webdriver.Chrome('chromedriver.exe',options=options,chrome_options=chrome_options)
    try:
        new_diver.get(url)
        js = 'window.scrollTo(0,document.body.scrollHeight)'
        check_bottom = new_diver.find_element_by_class_name("load_more_tip")
        # NOTE(review): this loop has no upper bound; it only ends once the
        # tip element's style stops being exactly 'display: none;'.
        while check_bottom.get_attribute('style') == 'display: none;':
            new_diver.execute_script(js)
            obj = new_diver.find_element_by_class_name("post_addmore")
            if obj.get_attribute('style') == 'visibility: visible;':
                obj.click()
        new_diver.execute_script(js)
        # Grab the page source after the last scroll
        return new_diver.page_source
    finally:
        # Always end the browser and its driver process, even when a lookup
        # or script call above raises.
        new_diver.quit()
52 
def mark_url(html_text):
    """Return the detail-page links found in a section page's HTML.

    Args:
        html_text: HTML source of a section page.

    Returns:
        The list of ``href`` values matched by the news-item XPath.
    """
    detail_link_xpath = '//div[@class="ndi_main"]/div/div/div/h3/a/@href'
    return etree.HTML(html_text).xpath(detail_link_xpath)
58 
def get_new_detail(title_url_list):
    """Fetch each news detail page and save title and body text to a local file.

    All articles of one call are written to a single UTF-8 ``.txt`` file whose
    name is a random four-digit number. One browser is reused for every URL
    and is shut down when the batch finishes or fails. Pages on which the
    title XPath matches nothing are skipped.

    Args:
        title_url_list: Iterable of detail-page URLs.
    """
    filename = str(random.randint(1000,9999)) + ".txt"
    with open(filename,"w",encoding="utf-8") as f:
        if not title_url_list:
            # Nothing to fetch: leave the empty file, do not start a browser
            return
        # One browser for the whole batch instead of one per URL
        detail_diver = webdriver.Chrome('chromedriver.exe',options=options,chrome_options=chrome_options)
        try:
            for title_url in title_url_list:
                detail_diver.get(title_url)
                detail_tree = etree.HTML(detail_diver.page_source)
                titles = detail_tree.xpath('//div[@id="epContentLeft"]/h1/text()')
                if not titles:
                    # Page layout differs; skip it rather than abort the batch
                    continue
                text = ''.join(detail_tree.xpath('//div[@id="endText"]/p/text()'))
                f.write(titles[0])
                f.write(text)
        finally:
            # Always end the browser and its driver process
            detail_diver.quit()
73 
# The four section page URLs to crawl
url_list = [guonei_url,guoji_url,war_url,hangkong_url]
# Run the three stages on a pool of four worker threads; the `with` block
# shuts the pool down once the last stage has returned.
with Pool(4) as pool:
    # Load every section page and collect its fully scrolled HTML
    data_list = pool.map(get_new,url_list)
    # Extract the detail-page URLs from each section page
    title_url_list = pool.map(mark_url,data_list)
    # Fetch and store the news details
    pool.map(get_new_detail,title_url_list)

View Code

# 使用线程池爬取梨视频app的视频

# 使用线程池爬取梨视频(10个视频)
import requests,re,random
from lxml import etree
from multiprocessing.dummy import Pool
# NOTE: this rebinds the name `requests` from the module to a Session object;
# the functions below call `requests.get` on that session.
requests = requests.Session()
url = 'https://www.pearvideo.com/category_4'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
    "connection":"close"
}

# Download the category page and parse it
response_text = requests.get(url=url,headers=headers).text
tree = etree.HTML(response_text)

# Collect the relative video-page links from both lists and make them absolute
video_url_list = [
    'https://www.pearvideo.com/' + href
    for href in tree.xpath('//ul[@id="listvideoListUl"]/li/div/a/@href | //ul[@id="categoryList"]/li/div/a/@href')
]

'''
srcUrl="https://video.pearvideo.com/mp4/third/20191023/cont-1615387-11549790-203859-hd.mp4",vdoUrl=srcUrl,

'''
def get_data_url(url):
    """Return the video file URL embedded in a video page.

    Downloads ``url`` and extracts the value of the first
    ``srcUrl="...",vdoUrl=srcUrl,`` fragment in the page text.

    Args:
        url: Address of the video page.

    Returns:
        The extracted ``srcUrl`` value as a string.

    Raises:
        IndexError: If the page contains no ``srcUrl`` fragment.
    """
    # A timeout keeps a stalled connection from blocking a worker forever
    response_text = requests.get(url=url,headers=headers,timeout=30).text
    match = re.search('srcUrl="(.*?)",vdoUrl=srcUrl,',response_text)
    if match is None:
        # Same exception type as the former `[0]` lookup, but naming the page
        raise IndexError("no srcUrl found in page: %s" % url)
    return match.group(1)
    
def get_data(data_url):
    """Download a video and save it under a random four-digit ``.mp4`` name.

    The response is streamed to disk in chunks instead of being held in
    memory as a whole. A non-success HTTP status raises before any file is
    created.

    Args:
        data_url: Direct URL of the video file.
    """
    filename = str(random.randint(1000,9999)) + ".mp4"
    with requests.get(url=data_url,headers=headers,stream=True,timeout=30) as response:
        # Do not save an error page as if it were a video
        response.raise_for_status()
        with open(filename,"wb") as f:
            for chunk in response.iter_content(chunk_size=65536):
                f.write(chunk)
    


# Resolve the video file URLs, then download them, on five worker threads;
# the `with` block shuts the pool down once both stages have returned.
with Pool(5) as pool:
    data_url_list = pool.map(get_data_url,video_url_list)
    pool.map(get_data,data_url_list)

View Code