爬虫请求库之selenium

一、介绍

selenium最初是一个自动化测试工具,而爬虫中使用它主要是为了解决requests无法直接执行JavaScript代码的问题

selenium本质是通过驱动浏览器，完全模拟浏览器的操作，比如跳转、输入、点击、下拉等，来拿到网页渲染之后的结果，可支持多种浏览器

from selenium import webdriver
from selenium.webdriver.common.keys import Keys #键盘按键操作
import time
browser=webdriver.Chrome()

官网：http://selenium-python.readthedocs.io

二、安装

#安装：selenium+chromedriver
pip3 install selenium
下载chromedriver.exe放到python安装路径的scripts目录中即可，注意最新版本是2.38，并非2.9
国内镜像网站地址：http://npm.taobao.org/mirrors/chromedriver/2.38/
最新的版本去官网找:https://sites.google.com/a/chromium.org/chromedriver/downloads

三、基本使用

from selenium import webdriver
from selenium.webdriver.common.keys import Keys #键盘按键操作
import time

# Minimal end-to-end example: open Baidu, type a query, submit it, then close.
bro = webdriver.Chrome()
# The scheme separator must be '://' for the address to be a valid URL.
bro.get('https://www.baidu.com')
# Locate the search input box by its element id.
inp = bro.find_element_by_id('kw')
# Type the query text into the box.
inp.send_keys("美女")
inp.send_keys(Keys.ENTER)  # press Enter to submit the search
# Alternative: locate the search button (su) and click it instead.
time.sleep(3)  # pause for 3 seconds before closing the window
bro.close()

四、选择器

1.基本用法

# 1、find_element_by_id   根据id找
# 2、find_element_by_link_text     根据链接名字找到控件（a标签的文字）
# 3、find_element_by_partial_link_text   根据链接名字找到控件（a标签的文字）模糊查询
# 4、find_element_by_tag_name       根据标签名
# 5、find_element_by_class_name     根据类名
# 6、find_element_by_name           根据name属性的值
# 7、find_element_by_css_selector   根据css选择器
# 8、find_element_by_xpath          根据xpath选择

from selenium import webdriver
import time
# Walk through the Baidu login dialog, locating each control with a different selector.
bro = webdriver.Chrome()
bro.get('https://www.baidu.com')  # use Baidu as the example site
bro.implicitly_wait(10)  # implicit wait: element lookups wait up to 10 seconds for elements not yet loaded


dl_button = bro.find_element_by_link_text('登录')  # find the link whose text is "登录" (log in)
dl_button.click()

user_login = bro.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')  # find the username-login control by its id
user_login.click()  # click it
time.sleep(1)
input_name = bro.find_element_by_name('userName')  # find the username input by its name attribute
input_name.send_keys('5636288322qq.com')  # type the username

input_password = bro.find_element_by_id('TANGRAM__PSP_10__password')  # find the password input by its id
input_password.send_keys('xxxxxxxx')  # type the password

submit_button = bro.find_element_by_id('TANGRAM__PSP_10__submit')  # find the submit button by its id
time.sleep(1)
submit_button.click()  # click submit

2.显式等待与隐式等待

隐式等待:在查找所有元素时，如果尚未被加载，则等10秒
 browser.implicitly_wait(10)   表示等待所有

显式等待：显式地等待某个元素被加载
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Explicit wait: block for up to 10 seconds until the element with id
# 'content_left' is present in the page.
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.ID, 'content_left')))

3.爬取京东商品信息

from selenium import webdriver
from selenium.webdriver.common.keys import Keys #键盘按键操作
import time
# Start Chrome and open the JD home page.
bro=webdriver.Chrome()
bro.get("https://www.jd.com")
# Implicit wait: element lookups wait up to 10 seconds for elements not yet loaded.
bro.implicitly_wait(10)




def _print_goods_on_page(bro):
    """Print link, image, name, price and comment count of every item on the current page."""
    print("------------------------------------")
    for good in bro.find_elements_by_class_name('gl-item'):
        img = good.find_element_by_css_selector('.p-img a img')
        img_url = img.get_attribute('src')
        if not img_url:
            # No ``src`` value: fall back to the ``data-lazy-img`` attribute,
            # prefixing the scheme. If that attribute is missing too, report
            # None instead of failing on 'https:' + None.
            lazy_src = img.get_attribute('data-lazy-img')
            img_url = ('https:' + lazy_src) if lazy_src else None
        url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
        price = good.find_element_by_css_selector('.p-price i').text
        # Collapse multi-line product names onto a single line.
        name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '')
        commit = good.find_element_by_css_selector('.p-commit a').text  # comment count
        print('''
        商品链接：%s
        商品图片：%s
        商品名字：%s
        商品价格：%s
        商品评论数：%s

        ''' % (url, img_url, name, price, commit))


def get_goods(bro):
    """Print the goods on the current result page, then keep following "下一页" (next page).

    Args:
        bro: a browser driver already showing a search-result page.

    Raises:
        Whatever ``bro`` raises when an element lookup or click fails. This is
        the only way the function ends: it never returns normally, so callers
        are expected to catch the exception raised once no next-page link can
        be found.
    """
    # Iterate instead of recursing so a long result list cannot exhaust the
    # interpreter's recursion limit.
    while True:
        _print_goods_on_page(bro)
        next_page = bro.find_element_by_partial_link_text("下一页")
        time.sleep(1)
        next_page.click()
        time.sleep(1)
# Type the query into the search box (element id 'key') and submit it with Enter.
input_search=bro.find_element_by_id('key')
input_search.send_keys("一加手机")
input_search.send_keys(Keys.ENTER)

# The browser is now on the search-result page.
# get_goods never returns normally (it always tries to move on to the next
# page), so any exception it raises is treated as the end of the crawl.
try:
    get_goods(bro)
except Exception as e:
    print("结束")
finally:
    # Always close the browser window, whether or not the crawl failed.
    bro.close()

4.获取标签属性

#获取属性：
# tag.get_attribute('src')
#获取文本内容
# tag.text
#获取标签ID，位置，名称，大小（了解）
# print(tag.id)
# print(tag.location)
# print(tag.tag_name)
# print(tag.size)

5.模拟浏览器前进后退

 browser.back()
 time.sleep(10)
 browser.forward()

6.其它

# print(browser.get_cookies())  获取cookie
# browser.add_cookie({'name':'k1','value':'xxx'})  设置cookie（字典必须包含name和value两个键）
# print(browser.get_cookies())

cookies管理

 from selenium import webdriver
 import time

 # Open Baidu and run a JavaScript snippet inside the page.
 bro=webdriver.Chrome()
 bro.get("http://www.baidu.com")
 bro.execute_script('alert("hello world")') # show an alert dialog
 time.sleep(5)

运行js

# import time
# from selenium import webdriver
#
# browser=webdriver.Chrome()
# browser.get('https://www.baidu.com')
# browser.execute_script('window.open()')
#
# print(browser.window_handles) #获取所有的选项卡
# browser.switch_to_window(browser.window_handles[1])
# browser.get('https://www.taobao.com')
# time.sleep(3)
# browser.switch_to_window(browser.window_handles[0])
# browser.get('https://www.sina.com.cn')
# browser.close()

选项卡管理

7.xpath

doc='''
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html' a="xxx">Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
   <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
  </div>
 </body>
</html>
'''
from lxml import etree

# Parse the sample document; every example below rebinds ``a``, so only the
# result of the last query is printed at the end.
html=etree.HTML(doc)
# html=etree.parse('search.html',etree.HTMLParser())
# 1 All nodes
a=html.xpath('//*')    # match every element
# 2 A specific node (the result is a list)
# a=html.xpath('//head')
# 3 Children and descendants
a=html.xpath('//div/a')
a=html.xpath('//body/a') # no match: <a> is not a direct child of <body>
a=html.xpath('//body//a')
# 4 Parent node
# a=html.xpath('//body//a[@href="image1.html"]/..')
a=html.xpath('//body//a[1]/..')  # positions start at 1
# Equivalent form using the parent axis
a=html.xpath('//body//a[1]/parent::*')
# 5 Attribute matching
a=html.xpath('//body//a[@href="image1.html"]')

# 6 Text extraction
a=html.xpath('//body//a[@href="image1.html"]/text()')
a=html.xpath('//body//a/text()')

# 7 Attribute extraction
# a=html.xpath('//body//a/@href')
# # Note: positions are counted from 1, not 0
a=html.xpath('//body//a[2]/@href')
# 8 Matching an attribute that holds several values
#  When an <a> carries more than one class, an exact @class comparison fails; use contains()
# a=html.xpath('//body//a[@class="li"]')
a=html.xpath('//body//a[contains(@class,"li")]/text()')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 Matching on several attributes
a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')  # attributes are referenced with @
a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 10 Selecting by position
a=html.xpath('//a[2]/text()')
a=html.xpath('//a[2]/@href')
# The last one
a=html.xpath('//a[last()]/@href')
# Positions lower than 3
a=html.xpath('//a[position()<3]/@href')
# Third from last (last()-2); the second from last would be last()-1
a=html.xpath('//a[last()-2]/@href')
# 11 Axis selection
# ancestor: ancestor nodes
# '*' selects every ancestor
a=html.xpath('//a/ancestor::*')
# # Only the <div> ancestors
a=html.xpath('//a/ancestor::div')
# attribute: attribute values
a=html.xpath('//a[1]/attribute::*')
# child: direct children
a=html.xpath('//a[1]/child::*')
# descendant: all descendants
a=html.xpath('//a[6]/descendant::*')
# following: every node after the current node in document order (not only siblings)
a=html.xpath('//a[1]/following::*')
a=html.xpath('//a[1]/following::*[1]/@href')
# following-sibling: later nodes at the same level
a=html.xpath('//a[1]/following-sibling::*')  # later siblings of any tag
a=html.xpath('//a[1]/following-sibling::a')  # later siblings that are <a> elements
a=html.xpath('//a[1]/following-sibling::*[2]/text()')  # text of the second later sibling
a=html.xpath('//a[1]/following-sibling::*[2]/@href')

print(a)

View Code