Scraping request libraries: Selenium

I: Introduction

1: Overview

(1) Selenium started out as a testing tool.

(2) Scrapers use this module because requests cannot execute JavaScript, while Selenium drives a browser that can.

(3) Under the hood, Selenium drives a real browser engine and fully simulates user behavior, such as typing text and clicking.

(4) Because it operates the browser directly, we don't need to worry about request headers and the like.

2: Supported browsers

from selenium import webdriver

# pick one -- each line below launches a different browser (its driver must be installed)
browser=webdriver.Chrome()
browser=webdriver.Firefox()
browser=webdriver.PhantomJS()
browser=webdriver.Safari()
browser=webdriver.Edge()
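
If you don't need a visible window, Chrome can also run headless. A minimal sketch using ChromeOptions (--headless is a standard Chrome switch; note that newer Selenium releases prefer the keyword argument options= over chrome_options=):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')     # run Chrome without opening a window
options.add_argument('--disable-gpu')  # commonly recommended alongside headless
browser = webdriver.Chrome(chrome_options=options)

browser.get('https://www.baidu.com')
print(browser.title)
browser.close()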

II: Installation and usage

1: Installation

pip3 install selenium
Download chromedriver.exe and drop it into the Scripts directory of your Python installation.
Mirror for users in China: http://npm.taobao.org/mirrors/chromedriver/2.38/
The latest versions are on the official site: https://sites.google.com/a/chromium.org/chromedriver/downloads

# Note:
Selenium 3 defaults to the Firefox webdriver, and Firefox needs geckodriver.
Download link: https://github.com/mozilla/geckodriver/releases
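
If you'd rather not touch the Scripts directory, Selenium 3 also accepts an explicit driver path. A sketch (the path below is a placeholder; point it at wherever you saved chromedriver):

from selenium import webdriver

# placeholder path -- substitute your own chromedriver location
browser = webdriver.Chrome(executable_path=r'C:\tools\chromedriver.exe')
browser.get('https://www.baidu.com')
print(browser.title)
browser.close()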

2: Basic usage

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # how to locate elements: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to appear on the page
1. find_element_by_id                 locate by id
2. find_element_by_link_text          locate a link by its exact text (the text of an <a> tag)
3. find_element_by_partial_link_text  locate a link by a substring of its text (fuzzy match)
4. find_element_by_tag_name           locate by tag name
5. find_element_by_class_name         locate by class name
6. find_element_by_name               locate by the name attribute, e.g. name='xxx'
7. find_element_by_css_selector       locate by CSS selector
8. find_element_by_xpath              locate by XPath (see the sketch below)
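
A minimal sketch of a few of these locators against the Baidu homepage (the ids and link text used here, such as the search box id 'kw', are assumptions about Baidu's current markup):

from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')

input_tag = browser.find_element_by_id('kw')          # by id ('kw' is assumed to be the search box)
news_link = browser.find_element_by_link_text('新闻')  # by exact link text
inputs = browser.find_elements_by_tag_name('input')   # the find_elements_* variants return a list

print(input_tag.tag_name, len(inputs))
browser.close()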

3: Why explicit and implicit waits exist

(1) Most pages are built with Ajax + JS and take time to load, so by the time our code runs, some elements may not have been rendered yet.

(2) Setting a wait gives those elements time to load.

4: Explicit waits

(1) Set a maximum wait time.

(2) If the element being queried turns up within that time, execution moves on to the next line immediately.

(3) If it does not turn up in time, a TimeoutException is raised.

wait=WebDriverWait(browser,10)  # wait at most 10 s for the condition below
tag=wait.until(EC.presence_of_element_located((By.ID,'content_left')))  # until() returns the element once found

5: Implicit waits

(1) Set a maximum wait time.

(2) If the page finishes loading within that time, execution moves on to the next step.

(3) Otherwise it keeps waiting until the time runs out before moving on.

from selenium import webdriver

bro=webdriver.Chrome()
bro.get("http://www.baidu.com")
bro.implicitly_wait(10)   # applies to every element lookup; waits up to 10 s

6: Example: simulating a Baidu login

from selenium import webdriver
import time


def login(browser):
    login_button = browser.find_element_by_link_text('登录')  # find the login link

    login_button.click()  # click it

    time.sleep(1)

    login_type = browser.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')  # switch to username/password login
    login_type.click()


    input_username = browser.find_element_by_id('TANGRAM__PSP_10__userName')  # the username input box
    input_username.send_keys("123")  # type the username
    input_password = browser.find_element_by_id('TANGRAM__PSP_10__password')  # the password input box
    input_password.send_keys("123")  # type the password
    login_submit = browser.find_element_by_id('TANGRAM__PSP_10__submit')  # submit once both are filled in

    login_submit.click()  # click to log in

    cookie = browser.get_cookies()  # all cookies; use get_cookie(name) for a single one

    browser.close()   # close the current browser window



if __name__ == '__main__':
    browser = webdriver.Chrome()

    browser.implicitly_wait(10)  # implicit wait of 10 s

    browser.get('https://www.baidu.com')  # open Baidu; get() returns None, so there is nothing useful to assign

    login(browser)

III: XPath

doc='''
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html' a="xxx">Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
   <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
  </div>
 </body>
</html>
'''
from lxml import etree

html=etree.HTML(doc)
# html=etree.parse('search.html',etree.HTMLParser())
# 1 all nodes
a=html.xpath('//*')    # matches every tag
# 2 a specific node (the result is a list)
# a=html.xpath('//head')
# 3 children and descendants
a=html.xpath('//div/a')
a=html.xpath('//body/a') # no results: a is not a direct child of body
a=html.xpath('//body//a')
# 4 parent node
# a=html.xpath('//body//a[@href="image1.html"]/..')
a=html.xpath('//body//a[1]/..')  # indexing starts at 1
# the same thing via an axis
a=html.xpath('//body//a[1]/parent::*')
# 5 attribute matching
a=html.xpath('//body//a[@href="image1.html"]')

# 6 getting text
a=html.xpath('//body//a[@href="image1.html"]/text()')
a=html.xpath('//body//a/text()')

# 7 getting attributes
# a=html.xpath('//body//a/@href')
# note: positions start at 1, not 0
a=html.xpath('//body//a[2]/@href')
# 8 matching one value of a multi-valued attribute
# when an a tag has several classes an exact match no longer works; use contains
# a=html.xpath('//body//a[@class="li"]')
a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 matching on several attributes
a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
# 10 selecting by position
a=html.xpath('//a[2]/text()')
a=html.xpath('//a[2]/@href')
# the last one
a=html.xpath('//a[last()]/@href')
# positions before 3
a=html.xpath('//a[position()<3]/@href')
# second to last
a=html.xpath('//a[last()-1]/@href')
# 11 axes
# ancestor: ancestor nodes
# * selects every ancestor
a=html.xpath('//a/ancestor::*')
# only the div ancestors
a=html.xpath('//a/ancestor::div')
# attribute: attribute values
a=html.xpath('//a[1]/attribute::*')
# child: direct children
a=html.xpath('//a[1]/child::*')
# descendant: all descendants
a=html.xpath('//a[6]/descendant::*')
# following: every node after the current one in document order
a=html.xpath('//a[1]/following::*')
a=html.xpath('//a[1]/following::*[1]/@href')
# following-sibling: later siblings at the same level
a=html.xpath('//a[1]/following-sibling::*')
a=html.xpath('//a[1]/following-sibling::a')
a=html.xpath('//a[1]/following-sibling::*[2]/text()')
a=html.xpath('//a[1]/following-sibling::*[2]/@href')

print(a)
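The same expressions work inside Selenium itself through find_element_by_xpath / find_elements_by_xpath; a minimal sketch:

from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')

# collect the href of every link on the page with one XPath expression
links = browser.find_elements_by_xpath('//a[@href]')
for link in links[:5]:
    print(link.get_attribute('href'))

browser.close()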

IV: Getting element attributes

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # how to locate elements: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to appear on the page

browser=webdriver.Chrome()

browser.get('https://www.amazon.cn/')

wait=WebDriverWait(browser,10)
wait.until(EC.presence_of_element_located((By.ID,'cc-lm-tcgShowImgContainer')))

tag=browser.find_element(By.CSS_SELECTOR,'#cc-lm-tcgShowImgContainer img')

# get a tag's attribute
print(tag.get_attribute('src'))

# element id, location, tag name and size (good to know)
print(tag.id)
print(tag.location)
print(tag.tag_name)
print(tag.size)

browser.close()

V: Interacting with elements

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # how to locate elements: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to appear on the page

browser=webdriver.Chrome()
browser.get('https://www.amazon.cn/')
wait=WebDriverWait(browser,10)



input_tag=wait.until(EC.presence_of_element_located((By.ID,'twotabsearchtextbox')))
input_tag.send_keys('iphone 8')
button=browser.find_element_by_css_selector('#nav-search > form > div.nav-right > div > input')
button.click()

import time
time.sleep(3)

input_tag=browser.find_element_by_id('twotabsearchtextbox')
input_tag.clear() # clear the input box
input_tag.send_keys('iphone7plus')
button=browser.find_element_by_css_selector('#nav-search > form > div.nav-right > div > input')
button.click()
Element interaction
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # how to locate elements: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to appear on the page
import time

driver = webdriver.Chrome()
driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
wait=WebDriverWait(driver,3)
# driver.implicitly_wait(3)  # or use an implicit wait instead

try:
    driver.switch_to.frame('iframeResult')  # switch into the iframeResult frame
    sourse=driver.find_element_by_id('draggable')
    target=driver.find_element_by_id('droppable')

    # Option 1: queue the whole drag in one action chain and run it serially
    # actions=ActionChains(driver)  # get an action-chain object
    # actions.drag_and_drop(sourse,target)  # queue the action for serial execution
    # actions.perform()

    # Option 2: a fresh action chain per step, moving a small offset each time
    ActionChains(driver).click_and_hold(sourse).perform()
    distance=target.location['x']-sourse.location['x']

    track=0
    while track < distance:
        ActionChains(driver).move_by_offset(xoffset=2,yoffset=0).perform()
        track+=2

    ActionChains(driver).release().perform()

    time.sleep(10)

finally:
    driver.close()

Action chains (the same dragging technique drives slider captchas)
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # how to locate elements: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to appear on the page



try:
    browser=webdriver.Chrome()
    browser.get('https://www.baidu.com')
    browser.execute_script('alert("hello world")') # pop up an alert via your own JS
finally:
    browser.close()

When an interaction is too awkward to drive through the element API, you can always write the JavaScript yourself (a universal fallback).
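
A common case is scrolling, which the element API doesn't expose directly; window.scrollTo through execute_script covers it. A minimal sketch (the target site is just an example):

from selenium import webdriver
import time

browser = webdriver.Chrome()
try:
    browser.get('https://www.jd.com')
    # scroll to the bottom so lazy-loaded content gets rendered
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(2)  # give the page a moment to render what the scroll revealed
finally:
    browser.close()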

VI: Other operations

# cookies
from selenium import webdriver

browser=webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
print(browser.get_cookies())
browser.add_cookie({'name': 'k1', 'value': 'xxx'})  # add_cookie expects 'name' and 'value' keys
print(browser.get_cookies())

# browser.delete_all_cookies()
Cookies
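
For moving back and forward through history, browser.back() and browser.forward() walk the session history (both are standard WebDriver calls); a minimal sketch:

import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.get('https://www.taobao.com')
browser.get('https://www.sina.com.cn')

browser.back()     # back to taobao
time.sleep(3)
browser.forward()  # forward to sina again
browser.close()

Navigating back and forward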
import time
from selenium import webdriver

browser=webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')  # open a new tab

print(browser.window_handles)  # handles for every open tab
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(10)
browser.switch_to.window(browser.window_handles[0])
browser.get('https://www.sina.com.cn')
browser.close()
Tab management
from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException

try:
    browser=webdriver.Chrome()
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    browser.switch_to.frame('iframssseResult')  # deliberately wrong frame name, to trigger NoSuchFrameException

except TimeoutException as e:
    print(e)
except NoSuchFrameException as e:
    print(e)
finally:
    browser.close()
Exception handling

VII: Scraping examples

from selenium import webdriver
from selenium.webdriver.common.keys import Keys  # keyboard keys
import time
import os, requests, hashlib



def get_good(bro):
    goods_list = bro.find_elements_by_class_name('gl-item')

    for good in goods_list:

        good_detail_url = good.find_element_by_css_selector('.p-img a').get_attribute('href')  # product detail URL

        good_img_url = good.find_element_by_css_selector('.p-img a img').get_attribute('src')  # product image
        if not good_img_url:  # src is lazy-loaded and only filled in once the item scrolls into view
            good_img_url = 'https:' + good.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img')
        good_price = good.find_element_by_css_selector('.p-price i').text
        good_brief = good.find_element_by_css_selector('.p-name a ').get_attribute('title')
        good_comment = good.find_element_by_css_selector('.p-commit strong  a').text

        response = requests.get(good_img_url)
        good_name = good_brief.split(' ')[0][:5]  # first word, truncated, as a readable file-name prefix

        md5 = hashlib.md5()
        md5.update(good_img_url.encode('utf-8'))
        file_name = '%s%s.jpg' % (good_name, md5.hexdigest())
        photo_path = 'photo'
        if not os.path.exists(photo_path):
            os.mkdir(photo_path)

        file_path = os.path.join(photo_path, file_name)

        with open(file_path, 'wb') as f:
            print("%s下载之中:" % good_brief)
            for line in response.iter_content():
                f.write(line)

    next_page = bro.find_element_by_css_selector(".page .p-num .pn-next em")  # click next page and keep collecting

    time.sleep(1)

    next_page.click()

    time.sleep(1)
    get_good(bro)  # recurse to handle the next page


if __name__ == '__main__':

    name = input('Product name>>:')
    bro = webdriver.Chrome()
    bro.get("https://www.jd.com")
    bro.implicitly_wait(10)
    search_input = bro.find_element_by_id('key')

    search_input.send_keys(name)
    search_input.send_keys(Keys.ENTER)

    try:
        print('fetching products')
        get_good(bro)
    except Exception as e:
        print("done")  # the recursion ends when the next-page button can no longer be found
    finally:
        bro.close()
Scraping JD products
from selenium import webdriver
from selenium.webdriver.common.keys import Keys  # keyboard keys
import time
import pymysql


def get_goods(bro):
    goods_lists = bro.find_elements_by_class_name('gl-item')

    return goods_lists


def get_goods_info(goods_lists):

    for good in goods_lists:

        good_detail_url = good.find_element_by_css_selector('.p-img a').get_attribute('href')  # product detail URL

        good_img_url = good.find_element_by_css_selector('.p-img a img').get_attribute('src')  # product image
        if not good_img_url:  # src is lazy-loaded and only filled in once the item scrolls into view
            good_img_url = 'https:' + good.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img')
        good_price = good.find_element_by_css_selector('.p-price i').text
        good_brief = good.find_element_by_css_selector('.p-name a ').get_attribute('title')
        good_comment = good.find_element_by_css_selector('.p-commit strong  a').text

        # yield rather than return: a return here would stop after the first product on the page
        yield good_detail_url, good_img_url, good_price, good_comment, good_brief


def write_database(good_detail_url, good_img_url, good_price, good_comment, good_brief):
    db = pymysql.connect(host="localhost", user="root",
                         password="123", db="syl", port=3306)

    cur = db.cursor()

    # parameterized query: the driver does the quoting, instead of hand-rolled repr() formatting
    sql_insert = """insert into goods(good_detail,good_image,good_price,good_comment,good_brief)
                    values(%s,%s,%s,%s,%s)"""

    try:
        cur.execute(sql_insert, (good_detail_url, good_img_url, good_price, good_comment, good_brief))

        # commit
        db.commit()

    except Exception as e:
        # roll back on error
        print(e)
        db.rollback()
    finally:

        db.close()


def next_get_good(bro):
    next_page = bro.find_element_by_css_selector(".page .p-num .pn-next em")  # click next page and keep collecting

    time.sleep(1)

    next_page.click()

    time.sleep(1)

    bro.implicitly_wait(10)

    main(bro)


def main(bro):

    goods_lists = get_goods(bro)

    for good_info in get_goods_info(goods_lists):
        good_detail_url, good_img_url, good_price, good_comment, good_brief = good_info

        write_database(good_detail_url, good_img_url, good_price, good_comment, good_brief)

    next_get_good(bro)

if __name__ == '__main__':

    name = input('Product name>>:')

    bro = webdriver.Chrome()

    bro.implicitly_wait(10)
    bro.get("https://www.jd.com")

    search_input = bro.find_element_by_id('key')

    search_input.send_keys(name)
    search_input.send_keys(Keys.ENTER)
    try:
        main(bro)
    except Exception as e:
        print(e)
        print("结束")
    finally:
        bro.close()
Scraping JD products into a database
from selenium import webdriver
import requests

import os
import hashlib

path = 'photo'


def get_url(base_url):
    browser.get(base_url)

    browser.implicitly_wait(10)


def get_image_url():
    images_list = browser.find_elements_by_css_selector('.goods-item .figure-img img')

    return images_list  # return the list directly; yielding the whole list only complicates the caller


def get_image(images):
    image = images.get_attribute('src')

    image_title = images.get_attribute('alt')

    return image, image_title


def download_image(image, image_title):
    if not os.path.exists(path):  # create the download directory if it does not exist
        os.mkdir(path)

    md5 = hashlib.md5()
    md5.update(image_title.encode('utf-8'))

    file_name = '%s%s.jpg' % (image_title, md5.hexdigest())  # hash suffix avoids file-name collisions

    file_path = os.path.join(path, file_name)  # build the file path

    response = requests.get(image)  # fetch the image bytes
    with open(file_path, 'wb') as f:
        print("%s下载之中:" % image_title)
        for line in response.iter_content():
            f.write(line)


def main():
    for i in range(page_num):
        base_url = 'https://www.plmm.com.cn/tags-199-%s.html' % i
        get_url(base_url)
        for images in get_image_url():
            image_detail, image_title = get_image(images)

            download_image(image_detail, image_title)


if __name__ == '__main__':

    request_url = 'https://www.plmm.com.cn/tags-199-0.html'
    browser = webdriver.Chrome()
    browser.implicitly_wait(10)
    browser.get(request_url)

    page = browser.find_elements_by_class_name('page-num')   # the pagination links

    page_num = len(page) + 1   # len() gives the page count; +1 because the home page has no page-num link of its own

    try:
        main()
    except Exception as e:
        print(e)
    finally:
        print('scraping finished')
        browser.close()
Scraping images from plmm.com.cn
Original article: https://www.cnblogs.com/SR-Program/p/11944669.html