爬虫之selenium

安装

1 进入虚拟环境下安装 selenium :在cmd下输入 activate base 

2 pip install selenium

简介:

selenium 就是利用浏览器驱动来模拟浏览器访问并爬取页面。

优点:能有效地解决某些动态资源访问困难的问题

缺点:需要根据浏览器的具体版本选择下载浏览器驱动

谷歌浏览器驱动下载地址:http://chromedriver.storage.googleapis.com/index.html

下载的驱动程序必须和浏览器的版本统一,大家可以根据 http://blog.csdn.net/huilan_same/article/details/51896672 中提供的版本映射表进行对应

使用:

导入

from selenium import webdriver
from time import sleep

加载驱动:

# Start a Chrome session driven by a local chromedriver binary.
# The original literal had lost its path separators; they are restored here
# on the assumption that the driver lives under the user's Desktop folder.
# Point executable_path at wherever chromedriver.exe is on your machine.
bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Desktop\爬虫+数据\day04\chromedriver.exe')

方法:

# Locate a specific element with one of the find_* helpers (here: by its id attribute)
my_input = bro.find_element_by_id('kw')

# Read the source of the page currently shown in the browser
page_text = bro.page_source

bro.save_screenshot('./1.png') # Save a screenshot to ./1.png

###############################################
from selenium.webdriver.chrome.options import Options
# Build the Chrome options that make the browser run without a visible window.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Headless Chrome: same driver as before, plus the options built above.
# The original path literal had lost its separators; restored on the assumption
# that the driver lives under the user's Desktop folder — adjust to your machine.
bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Desktop\爬虫+数据\day04\chromedriver.exe', chrome_options=chrome_options)

# Scroll to the very bottom of the current page by running JavaScript in the browser
js = 'window.scrollTo(0,document.body.scrollHeight)'
bro.execute_script(js)


# Switch the driver's context into the named iframe before looking up elements
bro.switch_to.frame('login_frame')

# Click the element with id 'switcher_plogin' (looked up after the frame switch)
bro.find_element_by_id('switcher_plogin').click()

  

百度搜索迪丽热巴

from selenium import webdriver
from time import sleep
from lxml import etree

# Open Baidu in a visible Chrome window, submit a search query, and print the
# resulting page source.
# The original driver path had lost its backslashes; restored assuming the
# layout G:\papa\day04 — adjust to wherever chromedriver.exe lives.
bro = webdriver.Chrome(executable_path=r'G:\papa\day04\chromedriver.exe')
try:
    bro.get(url='https://www.baidu.com')
    sleep(1)
    # Type the query into the element with id "kw" (presumably the search box).
    my_input = bro.find_element_by_id("kw")
    my_input.send_keys("迪丽热巴")
    sleep(3)
    # Click the element with id "su" (presumably the search button).
    bro.find_element_by_id('su').click()
    sleep(3)  # pause so the result page can load before it is read

    page_text = bro.page_source
    print(page_text)
finally:
    # Always close the browser and driver process, even if a step above raised.
    bro.quit()

无界面访问迪丽热巴

# Same Baidu search as the Chrome example, but driven through PhantomJS so no
# browser window is shown.
# NOTE(review): PhantomJS support was deprecated in Selenium 3.8 and removed in
# Selenium 4 — confirm the installed Selenium still provides webdriver.PhantomJS,
# otherwise use headless Chrome instead.
# The original path had lost its backslashes (including the "\bin\" segment);
# restored assuming the standard phantomjs-2.1.1-windows archive layout.
bro = webdriver.PhantomJS(executable_path=r'G:\papa\day04\phantomjs-2.1.1-windows\bin\phantomjs.exe')
try:
    bro.get(url='https://www.baidu.com')
    sleep(1)
    # Type the query into the element with id "kw" (presumably the search box).
    my_input = bro.find_element_by_id("kw")
    my_input.send_keys("迪丽热巴")
    sleep(3)
    # Click the element with id "su" (presumably the search button).
    bro.find_element_by_id('su').click()
    sleep(3)  # pause so the result page can load before it is read

    page_text = bro.page_source
    print(page_text)
finally:
    # Always terminate the PhantomJS process, even if a step above raised.
    bro.quit()

谷歌无头访问

from selenium.webdriver.chrome.options import Options
# Same Baidu search again, this time with Chrome running headless.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Headless Chrome session.
# The original driver path had lost its backslashes; restored assuming the
# layout G:\papa\day04 — adjust to wherever chromedriver.exe lives.
bro = webdriver.Chrome(executable_path=r'G:\papa\day04\chromedriver.exe', chrome_options=chrome_options)
try:
    bro.get(url='https://www.baidu.com')
    sleep(1)
    # Type the query into the element with id "kw" (presumably the search box).
    my_input = bro.find_element_by_id("kw")
    my_input.send_keys("迪丽热巴")
    sleep(3)
    # Click the element with id "su" (presumably the search button).
    bro.find_element_by_id('su').click()
    sleep(3)  # pause so the result page can load before it is read

    page_text = bro.page_source
    print(page_text)
finally:
    # Always close the browser and driver process, even if a step above raised.
    bro.quit()

爬取豆瓣电影

# Open a Douban movie ranking page, scroll to the bottom three times, then
# print the page source.
# The original driver path had lost its backslashes; restored assuming the
# layout G:\papa\day04 — adjust to wherever chromedriver.exe lives.
bro = webdriver.Chrome(executable_path=r'G:\papa\day04\chromedriver.exe')
try:
    url = 'https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action='
    bro.get(url=url)
    sleep(2)
    js = 'window.scrollTo(0,document.body.scrollHeight)'
    for _ in range(3):
        # Jump to the current bottom of the document, then pause before the next pass.
        bro.execute_script(js)
        sleep(3)
    page_text = bro.page_source
    print(page_text)
finally:
    # Always close the browser and driver process, even if a step above raised.
    bro.quit()

爬取qq空间

# Log in to QQ Zone through its login iframe, scroll the page three times, and
# print the text of the matched <div> blocks.
# The original driver path had lost its backslashes; restored assuming the
# layout G:\papa\day04 — adjust to wherever chromedriver.exe lives.
bro = webdriver.Chrome(executable_path=r'G:\papa\day04\chromedriver.exe')
try:
    url = 'https://qzone.qq.com/'
    bro.get(url=url)
    sleep(1)
    # Switch into the login iframe before looking up the elements below.
    bro.switch_to.frame("login_frame")
    bro.find_element_by_id("switcher_plogin").click()
    sleep(1)

    # Fill in the credentials; replace the two placeholder strings with real values.
    name = bro.find_element_by_id('u')
    name.send_keys('用户名')
    sleep(1)
    pwd = bro.find_element_by_id('p')
    pwd.send_keys('密码')
    sleep(1)
    bro.find_element_by_id("login_button").click()
    sleep(1)

    # The loop bodies below had lost their indentation in the original text,
    # which made the script a syntax error; they are restored here.
    js = 'window.scrollTo(0,document.body.scrollHeight)'
    for _ in range(3):
        bro.execute_script(js)
        sleep(3)

    page_text = bro.page_source
    sleep(5)

    # Parse the captured HTML and pull the text out of each matching <div>.
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@class="f-info qz_info_cut"] | //div[@class="f-info"]')
    for div in div_list:
        text = div.xpath(".//text()")
        text = ''.join(text)
        print(text)
finally:
    # Always close the browser and driver process, even if a step above raised.
    bro.quit()

原文地址:https://www.cnblogs.com/wszxdzd/p/10252522.html