Python crawler -- the selenium module: let the browser drive itself!

selenium

Basic operations

from selenium import webdriver
from time import sleep
# instantiate a browser object
bro = webdriver.Chrome(executable_path=r'C:pycahrm文件chromedriver.exe')
url = 'https://www.jd.com/'

# issue the request from the browser
bro.get(url)

# locate the element
search_input = bro.find_element_by_id('key')

# interact with the located element
search_input.send_keys('macPro')

btn = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
btn.click()
sleep(2)

# execute JavaScript code
jsCode = 'window.scrollTo(0,document.body.scrollHeight)'
bro.execute_script(jsCode)

sleep(3)
bro.quit()
selenium
- Concept: a module for browser automation.
- Installation:
    - pip install selenium
- How selenium relates to crawling:
    - simulating logins
    - conveniently capturing dynamically loaded data (the key point)
        the rendered page source is exposed via page_source (see the sketch after this list)
        - advantage: what you can see, you can get
        - drawback: low efficiency

- Using selenium
    - prepare the browser driver: http://chromedriver.storage.googleapis.com/index.html
- Action chains: ActionChains, a sequence of actions
    - Workflow:
        - instantiate an ActionChains object and bind it to the browser object
        - queue up the consecutive actions
        - perform() immediately executes the queued actions
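
As a quick illustration of the page_source point above, here is a minimal sketch (the target URL, reused from the basic example, and parsing with lxml's etree are my own additions, not part of the original post):

from selenium import webdriver
from lxml import etree
from time import sleep

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.jd.com/')
sleep(2) # crude wait so dynamically loaded content has time to render

# page_source returns the HTML after JavaScript has run,
# so dynamically loaded data is already present in it
tree = etree.HTML(bro.page_source)
print(tree.xpath('//title/text()'))

bro.quit()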

Slider / drag-and-drop operations

Method 1:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains #动作链
import time
bro = webdriver.Chrome(executable_path=r'C:pycahrm文件chromedriver.exe')


bro.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
bro.implicitly_wait(3)

bro.switch_to.frame('iframeResult')  # switch into the frame
frame_tag = bro.find_element_by_id('draggable') # without switching into the iframe, the element cannot be found

begin_tag = bro.find_element_by_id('draggable') # starting position of the slider
end_tag = bro.find_element_by_id('droppable') # target position of the slider

actions = ActionChains(bro) # get an action-chain object
actions.drag_and_drop(begin_tag,end_tag) # queue the drag-and-drop action into the chain for serial execution
actions.perform() # start executing
time.sleep(2)
bro.quit()

Method 2:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
import time
bro = webdriver.Chrome(executable_path=r'C:pycahrm文件chromedriver.exe')


bro.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
bro.implicitly_wait(3)

bro.switch_to.frame('iframeResult')
frame_tag = bro.find_element_by_id('draggable')

begin_tag = bro.find_element_by_id('draggable')
end_tag = bro.find_element_by_id('droppable')

ActionChains(bro).click_and_hold(begin_tag).perform() # click and hold the slider at its starting position
distance = end_tag.location['x'] - begin_tag.location['x'] # total distance to slide
track = 0 # distance moved so far
while track < distance:
    ActionChains(bro).move_by_offset(xoffset=50,yoffset=0).perform() # xoffset: move 50 pixels horizontally per step
    track += 50
ActionChains(bro).release().perform() # sliding finished, release the slider

time.sleep(2)
bro.quit()

Simulating a 12306 login with selenium

import requests
from hashlib import md5

class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password =  password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: captcha type; see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: image ID of the captcha being reported as incorrect
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()
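
The Chaojiying_Client class above lives in a separate module, CJY.py, which the login script below imports via from CJY import Chaojiying_Client.
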
from selenium import webdriver
from selenium.webdriver import ActionChains
from time import sleep
from PIL import Image # requires PIL or Pillow to be installed
from CJY import Chaojiying_Client

# helper function: send a captcha image to the coding platform and return the recognized result
def transformCode(imgPath,imgType):
    chaojiying = Chaojiying_Client('username', 'password', '902590')
    im = open(imgPath, 'rb').read()
    return chaojiying.PostPic(im, imgType)['pic_str']


bro = webdriver.Chrome(executable_path=r'C:pycahrm文件chromedriver.exe')

bro.get('https://kyfw.12306.cn/otn/login/init')
sleep(2)
# save the current browser page as an image
bro.save_screenshot('./main.png')
# crop out the captcha region
# capture the element's position on the page
img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
location = img_tag.location # top-left coordinates of the element
size = img_tag.size # dimensions of the element
# rectangle corresponding to the crop area
rangle = (int(location['x']),int(location['y']),int(location['x']+size['width']),int(location['y']+size['height']))
# use PIL's Image to crop the specified region
i = Image.open('./main.png')
frame = i.crop(rangle) # crop cuts out the image inside the given rectangle
frame.save('code.png')

# call the coding platform to recognize the captcha
result = transformCode('./code.png',9004)
print(result) #x1,y1|x2,y2|x3,y3

#x1,y1|x2,y2|x3,y3 ==>[[x1,y1],[x2,y2],[x3,y3]]
all_list = []#[[x1,y1],[x2,y2],[x3,y3]]
if '|' in result:
    list_1 = result.split('|')
    count_1 = len(list_1)
    for i in range(count_1):
        xy_list = []
        x = int(list_1[i].split(',')[0])
        y = int(list_1[i].split(',')[1])
        xy_list.append(x)
        xy_list.append(y)
        all_list.append(xy_list)
else:
    x = int(result.split(',')[0])
    y = int(result.split(',')[1])
    xy_list = []
    xy_list.append(x)
    xy_list.append(y)
    all_list.append(xy_list)


for point in all_list:
    x = point[0]
    y = point[1]
    ActionChains(bro).move_to_element_with_offset(img_tag,x,y).click().perform()
    sleep(1)


bro.find_element_by_id('username').send_keys('xxxxxx')
sleep(1)
bro.find_element_by_id('password').send_keys('xxxx')
sleep(1)

bro.find_element_by_id('loginSub').click()

sleep(10)
print(bro.page_source)
bro.quit()

Mitigating detection risks with selenium

Evading detection
from selenium import webdriver
from selenium.webdriver import ChromeOptions
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])

bro = webdriver.Chrome(executable_path='./chromedriver.exe',options=option)

url = 'https://www.taobao.com/'

bro.get(url)

# when the request comes from an automated browser, JavaScript is injected in the background and window.navigator.webdriver evaluates to true
# when the page is visited normally, the same check returns undefined
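# A quick self-check (a sketch of my own, not from the original post): evaluate the flag from the driver;
# with 'enable-automation' excluded it is expected to come back as None/undefined
print(bro.execute_script('return window.navigator.webdriver'))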

Headless browser

# headless browser
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

bro = webdriver.Chrome(executable_path='./chromedriver.exe',chrome_options=chrome_options) # no visible browser window is displayed
url = 'https://www.taobao.com/'
bro.get(url)
sleep(2)
bro.save_screenshot('123.png') 

print(bro.page_source)
Original post: https://www.cnblogs.com/tangjian219/p/11996707.html