Python爬虫_selenium

环境安装

下载安装selenium：pip install selenium
下载浏览器驱动程序：
- http://chromedriver.storage.googleapis.com/index.html
查看驱动和浏览器版本的映射关系：

http://blog.csdn.net/huilan_same/article/details/51896672

应用

from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By

# Demo: open Baidu in Chrome, type a query into the search box and submit it.
#
# NOTE(review): `executable_path` is the Selenium 3 style of pointing at the
# driver binary; recent Selenium 4 releases expect a Service object instead.
# Confirm against the installed Selenium version.
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
try:
    bro.get('https://www.baidu.com')
    sleep(2)  # crude fixed wait so the page has time to render

    # Locate the element with id "kw" and type the query into it.
    tag_input = bro.find_element(By.ID, 'kw')
    tag_input.send_keys('人民币')
    sleep(2)

    # Locate the element with id "su" and click it to submit the search.
    btn = bro.find_element(By.ID, 'su')
    btn.click()
    sleep(2)
finally:
    # Always close the browser, even if a lookup above raised; otherwise the
    # browser and driver processes are left running.
    bro.quit()

雪球网应用

from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By

# Demo: open xueqiu.com, scroll to the bottom several times, click a link
# located by XPath, then print the resulting page source.
#
# NOTE(review): `executable_path` is the Selenium 3 style of pointing at the
# driver binary; recent Selenium 4 releases expect a Service object instead.
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
try:
    bro.get('https://xueqiu.com/')
    sleep(5)

    # Run JavaScript that scrolls the window to the bottom of the document.
    # Repeated four times with a pause after each scroll.
    js = 'window.scrollTo(0,document.body.scrollHeight)'
    for _ in range(4):
        bro.execute_script(js)
        sleep(2)

    # Locate the "load more" link and click it.
    # The XPath is tied to the page's current DOM layout and may need updating.
    a_tag = bro.find_element(By.XPATH, '//*[@id="app"]/div[3]/div/div[1]/div[2]/div[2]/a')
    a_tag.click()
    sleep(5)

    # page_source is read after the scripted scrolling and click above.
    print(bro.page_source)
finally:
    # Always close the browser, even if a step above raised.
    bro.quit()

PhantomJS是一款无可视化界面的浏览器（免安装）。该项目已停止更新，不建议使用。

from time import sleep

from selenium import webdriver

# Demo: drive xueqiu.com with PhantomJS, taking a screenshot before and after
# scrolling to the bottom of the page, then print the page source.
#
# The driver path previously read 'phantomjs-2.1.1-windowsinphantomjs.exe':
# the backslashes of 'windows\bin\phantomjs.exe' had been lost. Adjust the
# path to wherever the PhantomJS archive was unpacked.
# NOTE(review): webdriver.PhantomJS only exists in older Selenium releases —
# confirm the installed version still provides it.
bro = webdriver.PhantomJS(executable_path=r'phantomjs-2.1.1-windows\bin\phantomjs.exe')
try:
    bro.get('https://xueqiu.com/')
    sleep(2)

    # Screenshot before scrolling.
    bro.save_screenshot('./1.png')

    # Run JavaScript that scrolls the window to the bottom of the document.
    # Repeated four times with a pause after each scroll.
    js = 'window.scrollTo(0,document.body.scrollHeight)'
    for _ in range(4):
        bro.execute_script(js)
        sleep(2)

    # Screenshot after scrolling.
    bro.save_screenshot('./2.png')

    # Optional step, left disabled as in the original: click the "load more" link.
    # a_tag = bro.find_element_by_xpath('//*[@id="app"]/div[3]/div/div[1]/div[2]/div[2]/a')
    # bro.save_screenshot('./2.png')
    # a_tag.click()
    sleep(2)

    # page_source is read after the scripted scrolling above.
    print(bro.page_source)
finally:
    # Always shut the driver down, even if a step above raised.
    bro.quit()

谷歌无头浏览器

from time import sleep

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Demo: run the Baidu search in headless Chrome, saving a screenshot of the
# landing page and printing the page source after the search is submitted.

# Options object carrying the command-line switches passed to Chrome:
# run without a visible window, and disable GPU usage.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# NOTE(review): `executable_path` is the Selenium 3 style of pointing at the
# driver binary; recent Selenium 4 releases expect a Service object instead.
bro = webdriver.Chrome(executable_path='./chromedriver.exe',options=chrome_options)
try:
    bro.get('https://www.baidu.com')
    sleep(2)
    bro.save_screenshot('1.png')

    # Locate the element with id "kw" and type the query into it.
    tag_input = bro.find_element(By.ID, 'kw')
    tag_input.send_keys('人民币')
    sleep(2)

    # Locate the element with id "su" and click it to submit the search.
    btn = bro.find_element(By.ID, 'su')
    btn.click()
    sleep(2)

    print(bro.page_source)
finally:
    # Always close the browser, even if a step above raised; a headless
    # browser left running is easy to miss because it has no window.
    bro.quit()

动作链

from time import sleep

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

# Demo: drag an element horizontally in small steps using an action chain.
#
# NOTE(review): `executable_path` is the Selenium 3 style of pointing at the
# driver binary; recent Selenium 4 releases expect a Service object instead.
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
try:
    url = 'https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
    bro.get(url=url)

    # An element that lives inside an <iframe> can only be located after
    # switching the driver's context into that frame.
    bro.switch_to.frame('iframeResult')
    source_tag = bro.find_element(By.ID, 'draggable')

    # Build an action chain and queue "press and hold the mouse button" on
    # the element; nothing is sent to the browser until perform() is called.
    action = ActionChains(bro)
    action.click_and_hold(source_tag)

    for _ in range(4):
        # Queue a 20px move to the right and execute the chain.
        action.move_by_offset(20,0).perform()
        sleep(1)

    # Release the held mouse button so the drag actually finishes; the
    # original script quit with the button still held down.
    action.release().perform()
finally:
    # Always close the browser, even if a step above raised.
    bro.quit()

selenium规避被检测识别

现在不少大型网站对selenium采取了检测机制。比如正常情况下，我们用浏览器访问淘宝等网站时，window.navigator.webdriver的值为undefined；而使用selenium访问时，该值为true。

只需要设置Chromedriver的启动参数即可解决问题。在启动Chromedriver之前，为Chrome开启实验性功能参数 excludeSwitches，它的值为['enable-automation']

from selenium.webdriver import Chrome, ChromeOptions

# Start Chrome with the 'enable-automation' switch excluded, via the
# experimental option 'excludeSwitches'.
excluded_switches = ['enable-automation']

option = ChromeOptions()
option.add_experimental_option('excludeSwitches', excluded_switches)

driver = Chrome(options=option)

Python爬虫_selenium

环境安装

应用

PhantomJs是一款无可视化界面的浏览器（免安装） 已停止更新 不建议使用

谷歌无头浏览器

动作链

selenium规避被检测识别

PhantomJs是一款无可视化界面的浏览器（免安装）已停止更新不建议使用