数字加减法验证码 识别 使用K-近邻算法(KNN)

验证码如上所示

验证码图片尺寸为 100*30 像素(宽*高)

下面咱们开始神奇的旅程

批量下载验证码图片,作为训练数据集

此验证码比较简单,所以只下载了 500 张

#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Author: zhibo.wang
# E-mail: gm.zhibo.wang@gmail.com
# Date  : 20/06/29 10:06:45
# Desc  :


import random
import requests
import uuid
import time
import os

def is_exists(path_):
    """Make sure the directory ``path_`` exists, creating it when missing.

    Despite its name, this function does not report anything: it returns
    ``None`` and is called purely for its side effect.

    Args:
        path_: Directory path; missing parent directories are created too.

    Raises:
        FileExistsError: If ``path_`` already exists but is not a directory.
    """
    # exist_ok=True folds the "check, then create" sequence into one call, so
    # a directory appearing between the check and the creation cannot make
    # this fail.
    os.makedirs(path_, exist_ok=True)


# Directory that receives the downloaded captcha images.
is_exists("source/")

# Loop invariants: the endpoint and the browser-like request headers.
url = 'https://www.okcis.cn/php/checkUser/code.php'
headers = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/75.0.3770.90 Chrome/75.0.3770.90 Safari/537.36"}

for i in range(0, 500):
    print(i)
    # Without a timeout a single stalled connection would hang the whole run.
    resp = requests.get(url, headers=headers, timeout=10)
    if resp.status_code == 200:
        # A random UUID file name keeps successive downloads from colliding.
        with open('source/' + str(uuid.uuid4()) + '.png', 'wb') as f:
            f.write(resp.content)
    else:
        # Do not store error pages as ".png" files: they would end up in the
        # training data set.
        print('skipped, HTTP status', resp.status_code)
    # Small pause between requests to go easy on the server.
    time.sleep(0.1)

 二值化并切割验证码


#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Author: zhibo.wang
# E-mail: gm.zhibo.wang@gmail.com
# Date  : 20/06/29 10:06:45
# Desc  :

import os
import time
import uuid

import numpy as np
from PIL import Image


def is_exists(path_):
    """Make sure the directory ``path_`` exists, creating it when missing."""
    os.makedirs(path_, exist_ok=True)


# Output directory for the individual character images.
is_exists("letters/")


def splitAndSave(path):
    """Binarize one captcha from ``./source`` and save its three characters.

    Args:
        path: File name of the captcha image, relative to ``./source/``.

    Each of the three 20-pixel-wide column ranges is written to ``./letters/``
    as a PNG with a random UUID file name.
    """
    path = './source/' + path
    # Close the file handle explicitly instead of leaving it to the garbage
    # collector; this function is called once per downloaded image.
    with Image.open(path) as img:
        pix = np.array(img.convert('L'))
    # Threshold: grey values above 200 become 255, everything else becomes 0.
    pix = (pix > 200) * 255

    # Column ranges [start, stop) of the three characters.
    split_parts = [
        [3, 23],
        [23, 43],
        [43, 63],
    ]
    for start, stop in split_parts:
        letter = pix[0:, start:stop]
        im = Image.fromarray(np.uint8(letter))
        save_path = './letters/' + str(uuid.uuid4()) + '.png'
        print(' ', save_path)
        im.save(save_path)


if __name__ == '__main__':
    im_paths = [fn for fn in os.listdir('./source')
                if os.path.splitext(fn)[1].lower() == '.png']
    for im_path in im_paths:
        print(im_path)
        splitAndSave(im_path)

给切割好的数据打标签

每次选中同一类别的图片,复制粘贴到 train 文件夹下,然后把脚本中的 n 改成该类别的标签,运行脚本即可自动批量重命名该类别的文件

#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Author: zhibo.wang
# E-mail: gm.zhibo.wang@gmail.com
# Date  : 20/06/29 11:36:11
# Desc  :

import os

def list_all_files(rootdir):
    """Recursively collect the paths of all regular files below ``rootdir``.

    Args:
        rootdir: Directory to scan.

    Returns:
        A list of paths, each built by joining ``rootdir`` with the entry
        names found on the way down. Entries appear in ``os.listdir`` order,
        with the contents of a sub-directory inserted at that directory's
        position. A directory that cannot be listed (missing, not a
        directory, no permission) contributes no entries, so a missing
        ``rootdir`` yields an empty list.
    """
    _files = []
    try:
        entries = os.listdir(rootdir)
    except OSError:
        # Best effort: an unreadable directory is skipped. Only OSError is
        # caught so that genuine programming errors are no longer hidden.
        return _files

    for name in entries:
        path = os.path.join(rootdir, name)
        if os.path.isdir(path):
            _files.extend(list_all_files(path))
        elif os.path.isfile(path):
            _files.append(path)
    return _files


def rename_filename(filename, newfilename):
    """Rename the file at ``filename`` to ``newfilename`` via ``os.rename``."""
    source, target = filename, newfilename
    os.rename(source, target)



def run(n="10", path_='train'):
    """Label the not-yet-labelled images in ``path_`` as class ``n``.

    Every file below ``path_`` whose base name is exactly 40 characters long
    is renamed to ``<path_>/<n>:<index>.png``.

    Args:
        n: Class label to put in front of the colon in the new file names.
        path_: Directory holding the images to label.
    """
    # 40 = 36 characters of a str(uuid4()) plus the 4 characters of ".png".
    # os.path.basename is used instead of split("/") so the filter also works
    # with the path separators that os.path.join produces on Windows.
    all_files = [f for f in list_all_files(path_)
                 if len(os.path.basename(f)) == 40]

    index = 0
    for file_name in all_files:
        # NOTE: Windows file names cannot contain ":"; change the separator
        # there.
        new_file_name = "{0}/{1}:{2}.png".format(path_, n, index)
        # Never rename onto an existing file: a second run with the same label
        # would otherwise silently overwrite images labelled earlier.
        while os.path.exists(new_file_name):
            index += 1
            new_file_name = "{0}/{1}:{2}.png".format(path_, n, index)
        print(new_file_name)
        rename_filename(file_name, new_file_name)
        index += 1


if __name__ == "__main__":
    run()

数据打标签完成,开始训练模型

训练数据 knn


#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Author: zhibo.wang
# E-mail: gm.zhibo.wang@gmail.com
# Date  : 20/06/29 10:06:45
# Desc  :

import os

import joblib
import numpy as np
from PIL import Image
from sklearn.neighbors import KNeighborsClassifier

# NOTE: the original also did "from utils import list_all_files", but that
# name was immediately replaced by the definition below, so the import only
# added a hard dependency on a "utils" module. It has been dropped.


def list_all_files(rootdir):
    """Recursively collect the paths of all regular files below ``rootdir``.

    Directories that cannot be listed are skipped, so a missing ``rootdir``
    yields an empty list.
    """
    _files = []
    try:
        entries = os.listdir(rootdir)
    except OSError:
        return _files

    for name in entries:
        path = os.path.join(rootdir, name)
        if os.path.isdir(path):
            _files.extend(list_all_files(path))
        elif os.path.isfile(path):
            _files.append(path)
    return _files


def load_dataset():
    """Load the labelled character images from the ``train`` directory.

    The label of an image is the part of its file name in front of the first
    ``:``. Only the labels ``"1"`` .. ``"10"``, ``"+"`` and ``"-"`` are used;
    other files are ignored.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: ``X`` holds one flattened
        600-value grey-scale image per row, ``y`` the matching labels.
        Samples are ordered by label, in the order listed above.
    """
    X = []
    y = []
    path_ = "train"
    keys = [str(i) for i in range(1, 11)] + ["+", "-"]

    # Bucket the files by label in a single pass instead of rescanning the
    # whole file list once per label.
    files_by_label = {k: [] for k in keys}
    for file_ in list_all_files(path_):
        label = os.path.basename(file_).split(":")[0]
        if label in files_by_label:
            files_by_label[label].append(file_)

    for k in keys:
        for file_ in files_by_label[k]:
            with Image.open(file_) as img:
                pix = np.asarray(img.convert("L"))
            # Flatten to the 20*30 = 600 values used as the feature vector.
            X.append(pix.reshape(20 * 30))
            y.append(k)
    return np.asarray(X), np.asarray(y)


if __name__ == "__main__":
    X, y = load_dataset()
    if len(X) == 0:
        # Fail with a clear message instead of an obscure error from fit().
        raise SystemExit("no labelled images found in 'train'")
    knn = KNeighborsClassifier()
    knn.fit(X, y)
    joblib.dump(knn, 'knn.pkl')

训练完成,接下来测试一下效果


#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Author: zhibo.wang
# E-mail: gm.zhibo.wang@gmail.com
# Date  : 20/06/29 10:06:45
# Desc  :


import functools
import os

import joblib
import numpy as np
from PIL import Image


def split_letters(path):
    """Binarize the captcha at ``path`` and cut it into three characters.

    Args:
        path: Path of the captcha image.

    Returns:
        A list of three flattened arrays of 600 values each (0 or 255), one
        per 20-pixel-wide column range.
    """
    with Image.open(path) as img:
        pix = np.array(img.convert('L'))
    # Threshold: grey values above 200 become 255, everything else becomes 0.
    pix = (pix > 200) * 255

    # Column ranges [start, stop) of the three characters.
    split_parts = [
        [3, 23],
        [23, 43],
        [43, 63],
    ]
    letters = []
    for start, stop in split_parts:
        letter = pix[0:, start:stop]
        letters.append(letter.reshape(20 * 30))
    return letters


@functools.lru_cache(maxsize=None)
def _load_model(model_path):
    """Load the model stored at ``model_path``, once per distinct path."""
    # NOTE: joblib.load unpickles the file and can therefore execute arbitrary
    # code; only load model files from a trusted source.
    return joblib.load(model_path)


def get_captcha_result(model_path, filename):
    """Predict the three characters of the captcha image ``filename``.

    Args:
        model_path: Path of the model file written with ``joblib.dump``.
        filename: Path of the captcha image.

    Returns:
        The three predicted labels concatenated into one string.
    """
    # The model is cached so that recognizing many images does not reload it
    # from disk for every single one.
    sipo_knn = _load_model(model_path)
    letters = split_letters(filename)
    return "".join(str(i) for i in sipo_knn.predict(letters))


if __name__ == "__main__":
    for test in os.listdir('./test'):
        datas = test, get_captcha_result('knn.pkl', './test/' + test)
        print(datas)

结果挺满意的,识别正确率达到百分之百

接下来就是进行简单的加减法计算了,此处略……

原文地址:https://www.cnblogs.com/dockers/p/13299311.html