Using the selenium module in web crawling

1. Basic Concepts

1. The selenium module

  A module for browser-based automation

2. How it relates to crawling

  Conveniently captures dynamically loaded data (what you can see, you can get); a contrast sketch follows the install command below

  Enables automated (simulated) login

3. Environment setup

pip3 install selenium
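To make the "dynamically loaded data" point concrete, the sketch below contrasts a plain requests fetch with selenium's rendered page_source; JD.com is used only as an arbitrary JS-heavy example:

import requests
from selenium import webdriver

url = 'https://www.jd.com/'

# A plain HTTP request returns only the initial HTML the server sends
raw_html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text

# selenium returns the DOM after the browser has executed the page's JS,
# so dynamically loaded content is included
bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get(url)
rendered_html = bro.page_source
bro.quit()

print(len(raw_html), len(rendered_html))  # the rendered DOM is usually larger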

A quick demo

from selenium import webdriver
from time import sleep

# The argument is the path to your browser driver; the r'' prefix prevents escape-sequence interpretation
driver = webdriver.Chrome(r'chromedriver.exe')

# Open the Baidu homepage with get
driver.get("http://www.baidu.com")

# Find the '设置' (Settings) link on the page and click it
driver.find_elements_by_link_text('设置')[0].click()
sleep(2)

# After opening settings, find the '搜索设置' (Search Settings) option to set 50 results per page
driver.find_elements_by_link_text('搜索设置')[0].click()
sleep(2)

# Select "show 50 results per page"
m = driver.find_element_by_id('nr')
sleep(2)
m.find_element_by_xpath('.//option[3]').click()  # relative XPath; '//*[@id="nr"]/option[3]' is equivalent
sleep(2)

# Click "save settings"
driver.find_elements_by_class_name("prefpanelgo")[0].click()
sleep(2)

# Handle the popup alert: accept() confirms, dismiss() cancels
driver.switch_to.alert.accept()
sleep(2)

# Find Baidu's search box and type '美女'
driver.find_element_by_id('kw').send_keys('美女')
sleep(2)

# Click the search button
driver.find_element_by_id('su').click()
sleep(2)

# On the results page, find the '美女_百度图片' link and open it
driver.find_elements_by_link_text('美女_百度图片')[0].click()
sleep(3)

# Close the browser
driver.quit()

2. Basic Usage

  Download the driver program for your browser: http://chromedriver.storage.googleapis.com/index.html

  Driver/browser version mapping: https://blog.csdn.net/huilan_same/article/details/51896672
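As a side note, if you don't need a visible browser window while crawling, Chrome can run headless. A minimal sketch, assuming the same chromedriver.exe sits next to the script:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')     # run Chrome without opening a window
options.add_argument('--disable-gpu')  # commonly recommended on Windows for headless mode

bro = webdriver.Chrome(executable_path='chromedriver.exe', options=options)
bro.get('https://www.baidu.com')
print(bro.title)  # the page still loads and renders; only the window is hidden
bro.quit()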

1. Visit JD.com and search for '苹果'

from time import sleep
from selenium import webdriver

bro = webdriver.Chrome(executable_path="chromedriver.exe")

# Open the target URL
bro.get("https://www.jd.com/")
sleep(2)

# Locate the element
search_input = bro.find_element_by_id("key")

# Type the keyword into the search box
search_input.send_keys("苹果")

# Locate the search button
btn = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')

# Click the search button
btn.click()
sleep(2)

# Execute JS (scroll the page to the bottom)
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)

# Grab the page source
page_text = bro.page_source
print(page_text)

# Quit the browser
bro.quit()
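The sleep(2) calls above are a blunt instrument; selenium's explicit waits poll for a condition instead. A minimal sketch of the same search-box lookup using WebDriverWait (the 10-second timeout is an arbitrary choice):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

bro = webdriver.Chrome(executable_path="chromedriver.exe")
bro.get("https://www.jd.com/")

# Block until the element is present (up to 10 seconds) instead of sleeping blindly
search_input = WebDriverWait(bro, 10).until(
    EC.presence_of_element_located((By.ID, "key"))
)
search_input.send_keys("苹果")
bro.quit()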

2. Crawling dynamically loaded data with selenium

from time import sleep
from selenium import webdriver
from lxml import etree

bro = webdriver.Chrome(executable_path="chromedriver.exe")

bro.get("http://125.35.6.84:81/xk/")
sleep(2)

page_text = bro.page_source
page_text_list = [page_text]

for i in range(3):
    bro.find_element_by_id("pageIto_next").click()  # 点击下一页
    sleep(2)
    page_text_list.append(bro.page_source)

for page_text in page_text_list:
    tree = etree.HTML(page_text)
    tree_list = tree.xpath('//ul[@id="gzlist"]/li')
    for li in tree_list:
        title = li.xpath('./dl/@title')[0]  # company name
        num = li.xpath('./ol/@title')[0]    # license number
        print(title, num)

sleep(2)
bro.quit()
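If you want to persist the parsed fields rather than just print them, here is a minimal sketch that writes them to a CSV file, continuing from the page_text_list collected above (the data.csv filename is an arbitrary choice):

import csv
from lxml import etree

with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'num'])
    for page_text in page_text_list:
        tree = etree.HTML(page_text)
        for li in tree.xpath('//ul[@id="gzlist"]/li'):
            writer.writerow([li.xpath('./dl/@title')[0], li.xpath('./ol/@title')[0]])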

3. Action Chains

  A series of consecutive actions

  When locating elements, if the target element lives inside an iframe tag, you must first run a fixed step before locating it: bro.switch_to.frame('id')

from selenium import webdriver
from time import sleep
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

# The draggable div sits inside an iframe, so switch into it first
bro.switch_to.frame('iframeResult')
div_tag = bro.find_element_by_id('draggable')

# Dragging = click-and-hold + moving
action = ActionChains(bro)
action.click_and_hold(div_tag)

for i in range(5):
    # perform() executes the queued actions immediately
    action.move_by_offset(17, 5).perform()
    sleep(0.5)

action.release()
sleep(3)
bro.quit()
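One detail worth remembering: after working inside an iframe you must switch back before locating elements in the parent document again. A minimal sketch, continuing with the bro driver from the example above:

# Inside the iframe: locators are scoped to the iframe's document
bro.switch_to.frame('iframeResult')
div_tag = bro.find_element_by_id('draggable')

# Switch back to the top-level document; locators now see the outer page again
bro.switch_to.default_content()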

4. Simulating a 12306 login

Chaojiying (超级鹰) captcha-recognition client

# Cjy.py

import requests
from hashlib import md5

class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: captcha type; see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: image ID of the captcha to report as mis-recognized
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()
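A quick usage sketch of the client above; the credentials and soft_id are placeholders, and 9004 is the coordinate-type captcha code from the chaojiying price page:

# Placeholder credentials: substitute your own Chaojiying account and software ID
chaojiying = Chaojiying_Client('your_username', 'your_password', 'your_soft_id')
im = open('code.png', 'rb').read()
# PostPic returns a dict; the recognized result is under 'pic_str',
# e.g. '55,70|267,133' for coordinate-type captchas
print(chaojiying.PostPic(im, 9004))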

Simulated login

from selenium import webdriver
from time import sleep
from PIL import Image
from selenium.webdriver import ActionChains
from Cjy import Chaojiying_Client

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://kyfw.12306.cn/otn/login/init')
sleep(5)
bro.save_screenshot('main.png')

code_img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
location = code_img_tag.location
size = code_img_tag.size

# The screenshot region to crop (left, upper, right, lower)
rangle = (int(location['x']), int(location['y']),
          int(location['x'] + size['width']), int(location['y'] + size['height']))

i = Image.open('./main.png')
frame = i.crop(rangle)
frame.save('code.png')
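# Caveat: on a high-DPI / scaled display the full-page screenshot may not match
# the page's CSS pixel coordinates, which would skew this crop. Newer selenium
# builds can also screenshot the element directly, e.g.:
#   code_img_tag.screenshot('code.png')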


def get_text(imgPath, imgType):
    chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
    im = open(imgPath, 'rb').read()
    return chaojiying.PostPic(im, imgType)['pic_str']


# Result format: '55,70|267,133' -> [[55, 70], [267, 133]]
result = get_text('./code.png', 9004)
all_list = []
if '|' in result:
    list_1 = result.split('|')
    count_1 = len(list_1)
    for i in range(count_1):
        xy_list = []
        x = int(list_1[i].split(',')[0])
        y = int(list_1[i].split(',')[1])
        xy_list.append(x)
        xy_list.append(y)
        all_list.append(xy_list)
else:
    x = int(result.split(',')[0])
    y = int(result.split(',')[1])
    xy_list = []
    xy_list.append(x)
    xy_list.append(y)
    all_list.append(xy_list)
print(all_list)
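# An equivalent, more compact parse of result (same semantics as the branch above):
#   all_list = [[int(v) for v in group.split(',')] for group in result.split('|')]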
# Click each recognized coordinate, offset from the captcha image element
for a in all_list:
    x = a[0]
    y = a[1]
    ActionChains(bro).move_to_element_with_offset(code_img_tag, x, y).click().perform()
    sleep(1)

# Fill in the account fields (these credentials are placeholders)
bro.find_element_by_id('username').send_keys('123456')
sleep(1)
bro.find_element_by_id('password').send_keys('67890000000')
sleep(1)
bro.find_element_by_id('loginSub').click()

sleep(5)
bro.quit()

Crawling Pear Video (梨视频)

import requests
from lxml import etree
import re
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
url = 'https://www.pearvideo.com/category_1'
page_text = requests.get(url,headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//*[@id="listvideoListUl"]/li')
for li in li_list:
    # Build the detail-page URL and a .mp4 filename from the list item
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    title = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'
    detail_page_text = requests.get(detail_url, headers=headers).text
    # The video URL is embedded in the page's JS; pull it out with a regex
    ex = 'srcUrl="(.*?)",vdoUrl'
    video_url = re.findall(ex, detail_page_text, re.S)[0]
    # Download the binary video data and persist it
    video_data = requests.get(video_url, headers=headers).content
    with open(title, 'wb') as fp:
        fp.write(video_data)

5. Crawling mobile data

1. Fiddler is a packet-capture tool (a proxy server); similar tools include:

  - Charles (青花瓷)

  - mitmproxy

  - Configuration: enable Fiddler to capture HTTPS requests

  - Tools -> Options -> HTTPS -> install the certificate

2. HTTP: a form of data exchange between a client and a server

  - HTTPS: the secure version of the HTTP protocol

  - HTTPS encrypts traffic using certificate-based keys.

3. Steps

  1. Configure Fiddler's listening port

  2. Put the phone and the PC running Fiddler on the same network segment (enable Wi-Fi on the PC and connect the phone to it)

  3. On the phone, browse to Fiddler's ip+port, e.g. 192.168.14.110:50816, and tap the link on that page to download the certificate

  4. Install and trust the certificate on the phone

  5. Set the phone's network proxy: enable the proxy and point it at the PC's IP address and Fiddler's port (a quick way to verify the proxy from code is sketched below)
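Once the proxy is up, you can also point a desktop script at Fiddler to confirm that traffic flows through it. A minimal sketch, assuming Fiddler listens on its default 127.0.0.1:8888 (substitute your configured port):

import requests

# Route one request through the local Fiddler proxy; it should then show up
# in Fiddler's session list
proxies = {
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888',
}
resp = requests.get('http://httpbin.org/ip', proxies=proxies)
print(resp.text)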

To be continued

Original post: https://www.cnblogs.com/zangyue/p/12203214.html