通过 PIL 和 Python-tesseract 模拟登录

通过 PIL 和 Python-tesseract 对验证码进行处理和识别,从而实现模拟登录。测试表明,对像素进行增强能显著提升字符识别的准确率。

#-*- coding:utf-8 -*-
try:
    from PIL import Image, ImageEnhance
except ImportError:
    import Image
import pytesseract
import re
import requests
from lxml import etree
import os

"""
PIL:Python Imaging Library,已经是Python平台事实上的图像处理标准库了。PIL功能非常强大,但API却非常简单易用。
a、Python-tesseract是一个基于google's Tesseract-OCR的独立封装包;

b、Python-tesseract功能是识别图片文件中文字,并作为返回参数返回识别结果;

c、Python-tesseract默认支持tiff、bmp格式图片,只有在安装PIL之后,才能支持jpeg、gif、png等其他图片格式;
#
# #img =Image.open('./1bri.jpg')
# img = Image.open('./validate.png')
# ##图像处理##
# #转换为RGB图像
# img = img.convert("RGB")
# #PIL图像增强lambda
# imgbri=img.point(lambda i : i*1.4) #对每一个像素点进行增强,效果很明显
# code = pytesseract.image_to_string(imgbri,lang='eng')
# code = re.sub('W','',code)  #re.sub  替换特殊字符为空
# print(code)
"""
# Module-level configuration: the login page URL and the request headers.
# codeUrl is the page getCode() downloads to locate the captcha <img> element.
# NOTE(review): the host here ("so.xxxxxxx.cn") looks anonymised, while the
# captcha image URL and the login URL elsewhere in this file hard-code
# "so.gushiwen.cn" -- confirm which host is intended before running.
codeUrl = 'https://so.xxxxxxx.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
# Browser-like User-Agent header sent with the HTTP requests made by getCode().
headers =  {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }
# A single requests session shared by getCode() and login(), so both functions
# issue their requests through the same Session object.
requestsSession = requests.Session()

def getCode():
    """Download the login captcha and return the text recognised by Tesseract.

    The login page at ``codeUrl`` is fetched through the shared
    ``requestsSession``, the ``src`` attribute of the ``<img id="imgCode">``
    element is extracted, and the image it points to is downloaded and saved
    to ``./imgCode.jpg``. The saved image is converted to RGB, every pixel
    value is scaled by 1.4, and the result is passed to pytesseract.

    Returns:
        str: The OCR output with every non-word character removed
        (whitespace, newlines, punctuation).

    Raises:
        IndexError: If the login page contains no ``img#imgCode`` element.
    """
    responsePage = requestsSession.get(url=codeUrl, headers=headers).text
    xpathObj = etree.HTML(responsePage)
    codeSrc = xpathObj.xpath('//img[@id="imgCode"]/@src')
    if not codeSrc:
        # Same exception type the bare ``codeSrc[0]`` lookup would raise,
        # but with a message that says what is actually missing.
        raise IndexError('captcha image (img#imgCode) not found in login page')

    # The src attribute is used as a path appended to the site root.
    imgUrl = "https://so.gushiwen.cn" + str(codeSrc[0])
    codeImg = requestsSession.get(url=imgUrl, headers=headers).content
    with open('./imgCode.jpg', 'wb') as fp:
        fp.write(codeImg)

    img = Image.open('./imgCode.jpg')
    # Convert to RGB, then scale every pixel value by 1.4 before OCR; the
    # original author reports this noticeably improves recognition.
    img = img.convert("RGB")
    imgbri = img.point(lambda i: i * 1.4)
    codePy = pytesseract.image_to_string(imgbri, lang='eng')

    # Strip everything that is not a word character. The pattern must be the
    # regex class r'\W'; a plain 'W' would delete the letter "W" from the
    # captcha and leave Tesseract's trailing whitespace in place.
    return re.sub(r'\W', '', codePy)

def login(code):
    """Submit the login form with the given captcha text and save the response.

    The form is POSTed through the shared ``requestsSession`` and the body of
    the response is written to ``./loginPage`` as UTF-8 text.

    Args:
        code: Captcha text placed in the ``code`` form field.

    Returns:
        None. The only result is the ``./loginPage`` file.
    """
    loginUrl = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'

    # NOTE(review): __VIEWSTATE / __VIEWSTATEGENERATOR are hard-coded values;
    # presumably they were copied from one rendering of the login page and may
    # need to be scraped fresh per session -- confirm against the site.
    data = {
        "__VIEWSTATE":"j/z/lKxfNiw1nRO/l6WdCHHn1M89UMpBP9XLld0+alXWaHrgxsN1ji/XVcpLvnKFEKgkURigXyxl3PVieTvJbySKqvpWp9jg4aqvo5Zb8YyeC0v8PW1i92b/pAI=",
        "__VIEWSTATEGENERATOR":"C93BE1AE",
        "from":"http://so.xxxxxx.cn/user/collect.aspx",
        "email":"xxxxxxx",
        "pwd":"xxxx",
        "code":code,
        "denglu":"登录"
    }
    # Send the same headers as the GET requests in getCode() so every request
    # on this session presents the same User-Agent.
    loginPage = requestsSession.post(url=loginUrl, data=data, headers=headers).text
    with open('./loginPage', 'w', encoding='utf-8') as lp:
        lp.write(loginPage)
if __name__ == "__main__":
    # Script entry point: recognise the captcha first, then submit the login
    # form with the recognised text.
    code = getCode()
    login(code=code)
原文地址:https://www.cnblogs.com/zy09/p/14097339.html