Scraping request libraries: Selenium

I: Introduction

1: Overview

(1) Selenium started out as a testing tool.

(2) Scrapers use this module because requests cannot execute JavaScript, while Selenium drives a browser that can.

(3) Under the hood, Selenium drives a real browser engine and fully simulates user behavior, such as typing text and clicking.

(4) Because it operates the browser directly, we don't need to worry about request headers and the like.

2: Supported browsers

from selenium import webdriver

# pick one -- each line below launches a different browser (its driver must be installed)
browser=webdriver.Chrome()
browser=webdriver.Firefox()
browser=webdriver.PhantomJS()
browser=webdriver.Safari()
browser=webdriver.Edge()
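
If you don't need a visible window, Chrome can also run headless. A minimal sketch using ChromeOptions (--headless is a standard Chrome switch; note that newer Selenium releases prefer the keyword argument options= over chrome_options=):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')     # run Chrome without opening a window
options.add_argument('--disable-gpu')  # commonly recommended alongside headless
browser = webdriver.Chrome(chrome_options=options)

browser.get('https://www.baidu.com')
print(browser.title)
browser.close()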

II: Installation and usage

1: Installation

pip3 install selenium
Download chromedriver.exe and drop it into the Scripts directory of your Python installation.
Mirror for users in China: http://npm.taobao.org/mirrors/chromedriver/2.38/
The latest versions are on the official site: https://sites.google.com/a/chromium.org/chromedriver/downloads

# Note:
Selenium 3 defaults to the Firefox webdriver, and Firefox needs geckodriver.
Download link: https://github.com/mozilla/geckodriver/releases
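
If you'd rather not touch the Scripts directory, Selenium 3 also accepts an explicit driver path. A sketch (the path below is a placeholder; point it at wherever you saved chromedriver):

from selenium import webdriver

# placeholder path -- substitute your own chromedriver location
browser = webdriver.Chrome(executable_path=r'C:\tools\chromedriver.exe')
browser.get('https://www.baidu.com')
print(browser.title)
browser.close()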

2: Basic usage

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # how to locate elements: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to appear on the page
1. find_element_by_id                 locate by id
2. find_element_by_link_text          locate a link by its exact text (the text of an <a> tag)
3. find_element_by_partial_link_text  locate a link by a substring of its text (fuzzy match)
4. find_element_by_tag_name           locate by tag name
5. find_element_by_class_name         locate by class name
6. find_element_by_name               locate by the name attribute, e.g. name='xxx'
7. find_element_by_css_selector       locate by CSS selector
8. find_element_by_xpath              locate by XPath (see the sketch below)
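
A minimal sketch of a few of these locators against the Baidu homepage (the ids and link text used here, such as the search box id 'kw', are assumptions about Baidu's current markup):

from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')

input_tag = browser.find_element_by_id('kw')          # by id ('kw' is assumed to be the search box)
news_link = browser.find_element_by_link_text('新闻')  # by exact link text
inputs = browser.find_elements_by_tag_name('input')   # the find_elements_* variants return a list

print(input_tag.tag_name, len(inputs))
browser.close()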

3: Why explicit and implicit waits exist

(1) Most pages are built with Ajax + JS and take time to load, so by the time our code runs, some elements may not have been rendered yet.

(2) Setting a wait gives those elements time to load.

4: Explicit waits

(1) Set a maximum wait time.

(2) If the element being queried turns up within that time, execution moves on to the next line immediately.

(3) If it does not turn up in time, a TimeoutException is raised.

wait=WebDriverWait(browser,10)  # wait at most 10 s for the condition below
tag=wait.until(EC.presence_of_element_located((By.ID,'content_left')))  # until() returns the element once found

5: Implicit waits

(1) Set a maximum wait time.

(2) If the page finishes loading within that time, execution moves on to the next step.

(3) Otherwise it keeps waiting until the time runs out before moving on.

from selenium import webdriver

bro=webdriver.Chrome()
bro.get("http://www.baidu.com")
bro.implicitly_wait(10)   # applies to every element lookup; waits up to 10 s

6: Example: simulating a Baidu login

from selenium import webdriver
import time


def login(browser):
    login_button = browser.find_element_by_link_text('登录')  # find the login link

    login_button.click()  # click it

    time.sleep(1)

    login_type = browser.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')  # switch to username/password login
    login_type.click()


    input_username = browser.find_element_by_id('TANGRAM__PSP_10__userName')  # the username input box
    input_username.send_keys("123")  # type the username
    input_password = browser.find_element_by_id('TANGRAM__PSP_10__password')  # the password input box
    input_password.send_keys("123")  # type the password
    login_submit = browser.find_element_by_id('TANGRAM__PSP_10__submit')  # submit once both are filled in

    login_submit.click()  # click to log in

    cookie = browser.get_cookies()  # all cookies; use get_cookie(name) for a single one

    browser.close()   # close the current browser window



if __name__ == '__main__':
    browser = webdriver.Chrome()

    browser.implicitly_wait(10)  # implicit wait of 10 s

    browser.get('https://www.baidu.com')  # open Baidu; get() returns None, so there is nothing useful to assign

    login(browser)

III: XPath

doc='''
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html' a="xxx">Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
   <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
  </div>
 </body>
</html>
'''
from lxml import etree

html=etree.HTML(doc)
# html=etree.parse('search.html',etree.HTMLParser())
# 1 all nodes
a=html.xpath('//*')    # matches every tag
# 2 a specific node (the result is a list)
# a=html.xpath('//head')
# 3 children and descendants
a=html.xpath('//div/a')
a=html.xpath('//body/a') # no results: a is not a direct child of body
a=html.xpath('//body//a')
# 4 parent node
# a=html.xpath('//body//a[@href="image1.html"]/..')
a=html.xpath('//body//a[1]/..')  # indexing starts at 1
# the same thing via an axis
a=html.xpath('//body//a[1]/parent::*')
# 5 attribute matching
a=html.xpath('//body//a[@href="image1.html"]')

# 6 getting text
a=html.xpath('//body//a[@href="image1.html"]/text()')
a=html.xpath('//body//a/text()')

# 7 getting attributes
# a=html.xpath('//body//a/@href')
# note: positions start at 1, not 0
a=html.xpath('//body//a[2]/@href')
# 8 matching one value of a multi-valued attribute
# when an a tag has several classes an exact match no longer works; use contains
# a=html.xpath('//body//a[@class="li"]')
a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 matching on several attributes
a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
# 10 selecting by position
a=html.xpath('//a[2]/text()')
a=html.xpath('//a[2]/@href')
# the last one
a=html.xpath('//a[last()]/@href')
# positions before 3
a=html.xpath('//a[position()<3]/@href')
# second to last
a=html.xpath('//a[last()-1]/@href')
# 11 axes
# ancestor: ancestor nodes
# * selects every ancestor
a=html.xpath('//a/ancestor::*')
# only the div ancestors
a=html.xpath('//a/ancestor::div')
# attribute: attribute values
a=html.xpath('//a[1]/attribute::*')
# child: direct children
a=html.xpath('//a[1]/child::*')
# descendant: all descendants
a=html.xpath('//a[6]/descendant::*')
# following: every node after the current one in document order
a=html.xpath('//a[1]/following::*')
a=html.xpath('//a[1]/following::*[1]/@href')
# following-sibling: later siblings at the same level
a=html.xpath('//a[1]/following-sibling::*')
a=html.xpath('//a[1]/following-sibling::a')
a=html.xpath('//a[1]/following-sibling::*[2]/text()')
a=html.xpath('//a[1]/following-sibling::*[2]/@href')

print(a)
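The same expressions work inside Selenium itself through find_element_by_xpath / find_elements_by_xpath; a minimal sketch:

from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')

# collect the href of every link on the page with one XPath expression
links = browser.find_elements_by_xpath('//a[@href]')
for link in links[:5]:
    print(link.get_attribute('href'))

browser.close()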

IV: Getting element attributes

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # how to locate elements: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to appear on the page

browser=webdriver.Chrome()

browser.get('https://www.amazon.cn/')

wait=WebDriverWait(browser,10)
wait.until(EC.presence_of_element_located((By.ID,'cc-lm-tcgShowImgContainer')))

tag=browser.find_element(By.CSS_SELECTOR,'#cc-lm-tcgShowImgContainer img')

# get a tag's attribute
print(tag.get_attribute('src'))

# element id, location, tag name and size (good to know)
print(tag.id)
print(tag.location)
print(tag.tag_name)
print(tag.size)

browser.close()

V: Interacting with elements

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # how to locate elements: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to appear on the page

browser=webdriver.Chrome()
browser.get('https://www.amazon.cn/')
wait=WebDriverWait(browser,10)



input_tag=wait.until(EC.presence_of_element_located((By.ID,'twotabsearchtextbox')))
input_tag.send_keys('iphone 8')
button=browser.find_element_by_css_selector('#nav-search > form > div.nav-right > div > input')
button.click()

import time
time.sleep(3)

input_tag=browser.find_element_by_id('twotabsearchtextbox')
input_tag.clear() # clear the input box
input_tag.send_keys('iphone7plus')
button=browser.find_element_by_css_selector('#nav-search > form > div.nav-right > div > input')
button.click()
Element interaction
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # how to locate elements: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to appear on the page
import time

driver = webdriver.Chrome()
driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
wait=WebDriverWait(driver,3)
# driver.implicitly_wait(3)  # or use an implicit wait instead

try:
    driver.switch_to.frame('iframeResult')  # switch into the iframeResult frame
    sourse=driver.find_element_by_id('draggable')
    target=driver.find_element_by_id('droppable')

    # Option 1: queue the whole drag in one action chain and run it serially
    # actions=ActionChains(driver)  # get an action-chain object
    # actions.drag_and_drop(sourse,target)  # queue the action for serial execution
    # actions.perform()

    # Option 2: a fresh action chain per step, moving a small offset each time
    ActionChains(driver).click_and_hold(sourse).perform()
    distance=target.location['x']-sourse.location['x']

    track=0
    while track < distance:
        ActionChains(driver).move_by_offset(xoffset=2,yoffset=0).perform()
        track+=2

    ActionChains(driver).release().perform()

    time.sleep(10)

finally:
    driver.close()

Action chains (the same dragging technique drives slider captchas)
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # how to locate elements: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to appear on the page



try:
    browser=webdriver.Chrome()
    browser.get('https://www.baidu.com')
    browser.execute_script('alert("hello world")') # pop up an alert via your own JS
finally:
    browser.close()

When an interaction is too awkward to drive through the element API, you can always write the JavaScript yourself (a universal fallback).
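
A common case is scrolling, which the element API doesn't expose directly; window.scrollTo through execute_script covers it. A minimal sketch (the target site is just an example):

from selenium import webdriver
import time

browser = webdriver.Chrome()
try:
    browser.get('https://www.jd.com')
    # scroll to the bottom so lazy-loaded content gets rendered
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(2)  # give the page a moment to render what the scroll revealed
finally:
    browser.close()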

VI: Other operations

# cookies
from selenium import webdriver

browser=webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
print(browser.get_cookies())
browser.add_cookie({'name': 'k1', 'value': 'xxx'})  # add_cookie expects 'name' and 'value' keys
print(browser.get_cookies())

# browser.delete_all_cookies()
Cookies
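
For moving back and forward through history, browser.back() and browser.forward() walk the session history (both are standard WebDriver calls); a minimal sketch:

import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.get('https://www.taobao.com')
browser.get('https://www.sina.com.cn')

browser.back()     # back to taobao
time.sleep(3)
browser.forward()  # forward to sina again
browser.close()

Navigating back and forward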
import time
from selenium import webdriver

browser=webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')  # open a new tab

print(browser.window_handles)  # handles for every open tab
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(10)
browser.switch_to.window(browser.window_handles[0])
browser.get('https://www.sina.com.cn')
browser.close()
Tab management
from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException

try:
    browser=webdriver.Chrome()
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    browser.switch_to.frame('iframssseResult')  # deliberately wrong frame name, to trigger NoSuchFrameException

except TimeoutException as e:
    print(e)
except NoSuchFrameException as e:
    print(e)
finally:
    browser.close()
Exception handling

VII: Scraping examples

from selenium import webdriver
from selenium.webdriver.common.keys import Keys  # keyboard keys
import time
import os, requests, hashlib



def get_good(bro):
    goods_list = bro.find_elements_by_class_name('gl-item')

    for good in goods_list:

        good_detail_url = good.find_element_by_css_selector('.p-img a').get_attribute('href')  # product detail URL

        good_img_url = good.find_element_by_css_selector('.p-img a img').get_attribute('src')  # product image
        if not good_img_url:  # src is lazy-loaded and only filled in once the item scrolls into view
            good_img_url = 'https:' + good.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img')
        good_price = good.find_element_by_css_selector('.p-price i').text
        good_brief = good.find_element_by_css_selector('.p-name a ').get_attribute('title')
        good_comment = good.find_element_by_css_selector('.p-commit strong  a').text

        response = requests.get(good_img_url)
        good_name = good_brief.split(' ')[0][:5]  # first word, truncated, as a readable file-name prefix

        md5 = hashlib.md5()
        md5.update(good_img_url.encode('utf-8'))
        file_name = '%s%s.jpg' % (good_name, md5.hexdigest())
        photo_path = 'photo'
        if not os.path.exists(photo_path):
            os.mkdir(photo_path)

        file_path = os.path.join(photo_path, file_name)

        with open(file_path, 'wb') as f:
            print("%s下载之中:" % good_brief)
            for line in response.iter_content():
                f.write(line)

    next_page = bro.find_element_by_css_selector(".page .p-num .pn-next em")  # click next page and keep collecting

    time.sleep(1)

    next_page.click()

    time.sleep(1)
    get_good(bro)  # recurse to handle the next page


if __name__ == '__main__':

    name = input('Product name>>:')
    bro = webdriver.Chrome()
    bro.get("https://www.jd.com")
    bro.implicitly_wait(10)
    search_input = bro.find_element_by_id('key')

    search_input.send_keys(name)
    search_input.send_keys(Keys.ENTER)

    try:
        print('fetching products')
        get_good(bro)
    except Exception as e:
        print("done")  # the recursion ends when the next-page button can no longer be found
    finally:
        bro.close()
Scraping JD products
from selenium import webdriver
from selenium.webdriver.common.keys import Keys  # keyboard keys
import time
import pymysql


def get_goods(bro):
    goods_lists = bro.find_elements_by_class_name('gl-item')

    return goods_lists


def get_goods_info(goods_lists):

    for good in goods_lists:

        good_detail_url = good.find_element_by_css_selector('.p-img a').get_attribute('href')  # product detail URL

        good_img_url = good.find_element_by_css_selector('.p-img a img').get_attribute('src')  # product image
        if not good_img_url:  # src is lazy-loaded and only filled in once the item scrolls into view
            good_img_url = 'https:' + good.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img')
        good_price = good.find_element_by_css_selector('.p-price i').text
        good_brief = good.find_element_by_css_selector('.p-name a ').get_attribute('title')
        good_comment = good.find_element_by_css_selector('.p-commit strong  a').text

        # yield rather than return: a return here would stop after the first product on the page
        yield good_detail_url, good_img_url, good_price, good_comment, good_brief


def write_database(good_detail_url, good_img_url, good_price, good_comment, good_brief):
    db = pymysql.connect(host="localhost", user="root",
                         password="123", db="syl", port=3306)

    cur = db.cursor()

    # parameterized query: the driver does the quoting, instead of hand-rolled repr() formatting
    sql_insert = """insert into goods(good_detail,good_image,good_price,good_comment,good_brief)
                    values(%s,%s,%s,%s,%s)"""

    try:
        cur.execute(sql_insert, (good_detail_url, good_img_url, good_price, good_comment, good_brief))

        # commit
        db.commit()

    except Exception as e:
        # roll back on error
        print(e)
        db.rollback()
    finally:

        db.close()


def next_get_good(bro):
    next_page = bro.find_element_by_css_selector(".page .p-num .pn-next em")  # click next page and keep collecting

    time.sleep(1)

    next_page.click()

    time.sleep(1)

    bro.implicitly_wait(10)

    main(bro)


def main(bro):

    goods_lists = get_goods(bro)

    for good_info in get_goods_info(goods_lists):
        good_detail_url, good_img_url, good_price, good_comment, good_brief = good_info

        write_database(good_detail_url, good_img_url, good_price, good_comment, good_brief)

    next_get_good(bro)

if __name__ == '__main__':

    name = input('Product name>>:')

    bro = webdriver.Chrome()

    bro.implicitly_wait(10)
    bro.get("https://www.jd.com")

    search_input = bro.find_element_by_id('key')

    search_input.send_keys(name)
    search_input.send_keys(Keys.ENTER)
    try:
        main(bro)
    except Exception as e:
        print(e)
        print("结束")
    finally:
        bro.close()
Scraping JD products into a database
from selenium import webdriver
import requests

import os
import hashlib

path = 'photo'


def get_url(base_url):
    browser.get(base_url)

    browser.implicitly_wait(10)


def get_image_url():
    images_list = browser.find_elements_by_css_selector('.goods-item .figure-img img')

    return images_list  # return the list directly; yielding the whole list only complicates the caller


def get_image(images):
    image = images.get_attribute('src')

    image_title = images.get_attribute('alt')

    return image, image_title


def download_image(image, image_title):
    if not os.path.exists(path):  # create the download directory if it does not exist
        os.mkdir(path)

    md5 = hashlib.md5()
    md5.update(image_title.encode('utf-8'))

    file_name = '%s%s.jpg' % (image_title, md5.hexdigest())  # hash suffix avoids file-name collisions

    file_path = os.path.join(path, file_name)  # build the file path

    response = requests.get(image)  # fetch the image bytes
    with open(file_path, 'wb') as f:
        print("%s下载之中:" % image_title)
        for line in response.iter_content():
            f.write(line)


def main():
    for i in range(page_num):
        base_url = 'https://www.plmm.com.cn/tags-199-%s.html' % i
        get_url(base_url)
        for images in get_image_url():
            image_detail, image_title = get_image(images)

            download_image(image_detail, image_title)


if __name__ == '__main__':

    request_url = 'https://www.plmm.com.cn/tags-199-0.html'
    browser = webdriver.Chrome()
    browser.implicitly_wait(10)
    browser.get(request_url)

    page = browser.find_elements_by_class_name('page-num')   # the pagination links

    page_num = len(page) + 1   # len() gives the page count; +1 because the home page has no page-num link of its own

    try:
        main()
    except Exception as e:
        print(e)
    finally:
        print('scraping finished')
        browser.close()
Scraping images from plmm.com.cn
Original article: https://www.cnblogs.com/SR-Program/p/11944669.html