4-爬虫-selenium

selenium介绍

基于浏览器自动化的一个模块

selenium和爬虫之间的关联

  • 1.便捷去捕获动态加载的数据
    • 页面的可见即可得
  • 2.便捷的实现模拟登录

selenium的优缺点:

  • 缺点:
    • 效率太慢
  • 优点:
    • 可见即可得

selenium的基本使用

下载:pip install selenium

事先准备好一个浏览器的驱动程序

下载谷歌驱动:http://chromedriver.storage.googleapis.com/index.html

from selenium import webdriver
from time import sleep

# Basic Selenium walkthrough: open jd.com, search for a keyword, scroll to the
# bottom of the result page and print the rendered page source.

#1. Create a browser object driven by the local chromedriver executable
bro = webdriver.Chrome(executable_path='./chromedriver')
#2. Send a request for the page
bro.get('https://www.jd.com')
#3. Locate tags: type the keyword into the search box, then click the search button
search_input = bro.find_element_by_xpath('//*[@id="key"]')
search_input.send_keys('macPro')
btn = bro.find_element_by_xpath('/html/body/div[1]/div[4]/div/div[2]/div/div[2]/button')
btn.click()
sleep(2)
#4. Execute JavaScript: scroll the window to the bottom of the document
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)
#5. Read the source of the page as currently rendered by the browser
page_text = bro.page_source
print(page_text)
sleep(3)
bro.quit()

获取动态加载数据

爬取前5页的企业名称

from selenium import webdriver
from time import sleep
from lxml import etree


# Scrape the first PAGE_COUNT result pages of the listing and print the
# company name found by the xpath on each page.
url = 'http://125.35.6.84:81/xk/'
PAGE_COUNT = 5  # total pages to collect, counting the landing page

bro = webdriver.Chrome(executable_path='./chromedriver')
try:
    bro.get(url)
    sleep(2)  # give the dynamically loaded list time to render
    page_text_list = [bro.page_source]
    # Page 1 is already captured above, so only PAGE_COUNT - 1 clicks on
    # "next page" are needed (clicking PAGE_COUNT times would collect 6 pages).
    for _ in range(PAGE_COUNT - 1):
        bro.find_element_by_xpath('//*[@id="pageIto_next"]').click()
        sleep(2)
        page_text_list.append(bro.page_source)
    sleep(1)
finally:
    # Always close the browser, even if a lookup or click above fails.
    bro.quit()

for page_no, page_text in enumerate(page_text_list, start=1):
    tree = etree.HTML(page_text)
    # NOTE(review): li[1] selects only the first entry of each page; widen the
    # xpath to .../li/dl/a/text() if every company on the page is wanted.
    names = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')
    if names:
        print(names[0])
    else:
        # The list had not rendered when the source was captured; report it
        # instead of aborting the remaining pages with an IndexError.
        print('page %d: no company name found' % page_no)

动作链操作

完成一系列连续的行为动作

from selenium.webdriver import ActionChains   # 导入动作链类


# Action-chain demo: hold the draggable element of the demo page and move the
# mouse in five steps.
bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
sleep(1)
# Note: if the target tag lives inside a sub-page rendered by an iframe, ordinary tag lookups raise an error
# Fix: switch into the iframe first, as done below
bro.switch_to.frame('iframeResult')   # the argument is the id of the iframe tag
div_tag = bro.find_element_by_id('draggable')

# Create an action-chain object bound to this browser
action = ActionChains(bro)
action.click_and_hold(div_tag) # click the given tag and keep the button held down
for i in range(5):
    action.move_by_offset(40,30).perform() # perform() makes the action chain run immediately
    sleep(0.5)
sleep(3)
bro.quit()


'''
注:
action.move_by_offset(10,20) # 以鼠标当前位置为起点,按偏移量(x,y)移动
action.move_to_element_with_offset(img_tag,x,y)   # 以标签的左上角为坐标原点(较新的 Selenium 4 版本改为以标签中心为原点)
'''

获取cookie

browser.get_cookies()

无头浏览器

没有可视化界面的浏览器

谷歌无头浏览器

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
 
# Build an options object that makes Chrome start without a visible window
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
 
# Create the browser object with the headless options applied
# NOTE(review): newer Selenium releases prefer the `options=` keyword over
# `chrome_options=` — confirm against the installed version.
browser = webdriver.Chrome(executable_path='./chromedriver',chrome_options=chrome_options)
 
# Load the page
url = 'http://www.baidu.com/'
browser.get(url)
time.sleep(3)
print(browser.page_source)

browser.save_screenshot('baidu.png')  # save a screenshot of the page
 
browser.quit()

12306模拟登录

url:https://kyfw.12306.cn/otn/login/init

超级鹰

#!/usr/bin/env python
# coding:utf-8

import requests
from hashlib import md5

class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service."""

    def __init__(self, username, password, soft_id):
        """Remember the account details and prepare the shared request data.

        username: account name
        password: plain-text password; only the hex MD5 digest of its UTF-8
                  bytes is stored on the instance
        soft_id: software ID issued in the Chaojiying user centre
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        # Form fields that accompany every request.
        self.base_params = dict(
            user=self.username,
            pass2=self.password,
            softid=self.soft_id,
        )
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the decoded JSON reply.

        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        form = {'codetype': codetype, **self.base_params}
        upload = {'userfile': ('ccc.jpg', im)}
        reply = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=form,
            files=upload,
            headers=self.headers,
        )
        return reply.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised image and return the decoded JSON reply.

        im_id: ID of the image being reported
        """
        form = {'id': im_id, **self.base_params}
        reply = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=form,
            headers=self.headers,
        )
        return reply.json()
def transdform_code_img(img_path,img_type):
    """Send a local captcha image to Chaojiying and return the recognised text.

    img_path: path of the image file on disk
    img_type: captcha type code forwarded to Chaojiying_Client.PostPic
    Returns the 'pic_str' field of the service's JSON reply.
    """
    # Replace the placeholders with a real account, password and the software
    # ID generated in the Chaojiying user centre.
    chaojiying = Chaojiying_Client('超级鹰账号', '密码', 'ID')
    # Read inside a context manager so the file handle is closed promptly.
    with open(img_path, 'rb') as img_file:
        im = img_file.read()
    return chaojiying.PostPic(im, img_type)['pic_str']

模拟登录

from PIL import Image  # 导入图片处理模块
#pip install Pillow(PIL 已停止维护,安装 Pillow 后仍以 from PIL import Image 导入)


# Simulated 12306 login: screenshot the page, crop out the captcha, have it
# recognised, click the returned coordinates, then submit the credentials.
bro = webdriver.Chrome(executable_path='./chromedriver')
try:
    url = 'https://kyfw.12306.cn/otn/login/init'
    bro.get(url)
    sleep(2)

    # The captcha must not be fetched with a separate request to its image
    # URL: that would return a different captcha from the one tied to this
    # login page. Screenshot the page and crop the captcha out instead.
    bro.save_screenshot('main.png')

    # The captcha image tag
    img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')

    location = img_tag.location  # x/y of the image's top-left corner
    size = img_tag.size  # width/height of the image

    # Crop box in Pillow's order: (left, upper, right, lower)
    rangle = (
        int(location['x']),
        int(location['y']),
        int(location['x'] + size['width']),
        int(location['y'] + size['height']),
    )

    # Crop the captcha region out of the screenshot
    with Image.open('./main.png') as screenshot:
        screenshot.crop(rangle).save('./code.png')

    # Recognise the captcha; the result holds the positions to click inside it
    result = transdform_code_img('./code.png',9004)
    print(result)
    # 'x1,y1|x2,y2|x3,y3'  ->  [[x1, y1], [x2, y2], [x3, y3]]
    # split('|') also handles a single 'x,y' result, so no special case is needed.
    all_list = []
    for pair in result.split('|'):
        x, y = pair.split(',')[:2]
        all_list.append([int(x), int(y)])

    # Click each returned point, with offsets measured from the captcha tag;
    # one click per coordinate pair.
    for x, y in all_list:
        ActionChains(bro).move_to_element_with_offset(img_tag,x,y).click().perform()
        sleep(1)

    # Type the account name and password
    sleep(2)
    bro.find_element_by_id('username').send_keys('123456')
    sleep(1)
    bro.find_element_by_id('password').send_keys('123456')
    sleep(1)

    # Click the login button
    bro.find_element_by_id('loginSub').click()
    sleep(3)
finally:
    # Close the browser even when one of the steps above raises.
    bro.quit()

 js反爬

PyExecJS工具类如何使用:

  • 大前提:需要在本机安装好nodejs的开发环境
  • 环境安装:pip install PyExecJS
  • 将想要执行的js相关代码,保存到一个js源文件中

- 1.将反混淆网站中的代码粘贴到jsCode.js文件中

- 2.在该js文件中添加一个自定义函数getPostParamCode,该函数是为了获取且返回post请求的动态加密参数:

function getPostParamCode(method, city, type, startTime, endTime){
    // Collect the query fields into one object and hand it, together with
    // the method name, to getParam(); its return value is passed back as is.
    var param = {
        city: city,
        type: type,
        startTime: startTime,
        endTime: endTime
    };
    return getParam(method, param);
}
import requests
# Request headers carrying a desktop-Chrome User-Agent string
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
}

- 3.在py源文件中基于PyExecJS执行步骤2中定义好的自定义函数,获取动态加密的请求参数,再用 requests 库携带该参数模拟 POST 请求:

#调用getParam将动态变化的请求参数
import execjs
 
#1. Get the JavaScript runtime selected by PyExecJS
node = execjs.get()

# Arguments for getPostParamCode
method = 'GETCITYWEATHER'
city = '北京'
data_type = 'HOUR'  # not named `type`, so the builtin is left intact
start_time = '2018-01-25 00:00:00'
end_time = '2018-01-25 23:00:00'

#2. Compile the JS source file
# NOTE(review): the steps above name the file jsCode.js — make sure this path
# points at the file that actually contains getPostParamCode.
file = 'test.js'
with open(file, encoding='utf-8') as js_file:
    ctx = node.compile(js_file.read())

#3. Call the compiled JS function. call() serialises the arguments itself, so
# no JS source has to be assembled by string formatting.
params = ctx.call('getPostParamCode', method, city, data_type, start_time, end_time)

url = 'https://www.aqistudy.cn/apinew/aqistudyapi.php'
param = {
    'd':params
}
# Send the encrypted parameter as POST form data; the response body is still encrypted
code_response_text = requests.post(url=url,data=param,headers=headers).text
print(code_response_text)

- 4.接下来我们再调用一下 JavaScript 中的 decodeData() 方法即可实现解密:

#将加密后的响应数据进行解密
import execjs
import requests

# Get the JavaScript runtime selected by PyExecJS
node = execjs.get()

# Arguments for getPostParamCode
method = 'GETCITYWEATHER'
city = '北京'
data_type = 'HOUR'  # not named `type`, so the builtin is left intact
start_time = '2018-01-25 00:00:00'
end_time = '2018-01-25 23:00:00'

# Compile the JS source file
# NOTE(review): the steps above name the file jsCode.js — make sure this path
# points at the file that contains getPostParamCode and decodeData.
file = 'test.js'
with open(file, encoding='utf-8') as js_file:
    ctx = node.compile(js_file.read())

# Build the encrypted request parameter
params = ctx.call('getPostParamCode', method, city, data_type, start_time, end_time)

# Send the POST request
url = 'https://www.aqistudy.cn/apinew/aqistudyapi.php'
response_text = requests.post(url, data={'d': params}).text

# Decrypt the response. The text is passed as a call() argument instead of
# being pasted into a JS string literal, which would break on quotes,
# backslashes or newlines in the response.
decrypted_data = ctx.call('decodeData', response_text)
print(decrypted_data)
原文地址:https://www.cnblogs.com/wgwg/p/13263224.html