软工试水日报-百度图片爬取 3/31

今天我们来尝试下爬取百度图片，为了团队作业做准备嘛
原理其实就是伪装成浏览器向百度发送一个请求，之后百度以json格式返回数据给我们，解析好即可下载
import os
import numpy
import imghdr
from PIL import Image
import json

import requests
# HTTP request headers passed to every requests.get call in this script.
# The User-Agent and Accept values mimic a desktop Chrome browser on Linux so
# that the requests look like they come from a regular web page visit.
headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}


# Download thumbnail images from the Baidu image search JSON endpoint.

# Seconds to wait on each HTTP request; without a timeout a stalled
# connection would block the crawler forever.
_REQUEST_TIMEOUT = 10


def _fetch_thumb_urls(key_word, offset):
    """Return the thumbnail URLs of one Baidu result page.

    Args:
        key_word: Search keyword, sent as both ``queryWord`` and ``word``.
        offset: Value sent as the ``pn`` query parameter (result offset).

    Returns:
        A list of the non-empty ``thumbURL`` values found in the ``data``
        list of the response. The list is empty when the payload has no
        usable ``data``.

    Raises:
        requests.RequestException: On network failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    url = 'https://image.baidu.com/search/acjson?'
    param = {
        'tn': 'resultjson_com',
        'logid': '8846269338939606587',
        'ipn': 'rj',
        'ct': '201326592',
        'is': '',
        'fp': 'result',
        'queryWord': key_word,
        'cl': '2',
        'lm': '-1',
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': '-1',
        'z': '',
        'ic': '',
        'hd': '',
        'latest': '',
        'copyright': '',
        'word': key_word,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': '0',
        'istype': '2',
        'qc': '',
        'nc': '1',
        'fr': '',
        'expermode': '',
        'force': '',
        'cg': 'girl',
        'pn': str(offset),
        'rn': '30',
        'gsm': '1e',
    }
    response = requests.get(url=url, headers=headers, params=param,
                            timeout=_REQUEST_TIMEOUT)
    # Force UTF-8 decoding before parsing the body as JSON.
    response.encoding = 'utf-8'
    payload = json.loads(response.text)
    entries = payload.get('data') if isinstance(payload, dict) else None
    # The data list normally ends with an empty dict. Filtering on a usable
    # 'thumbURL' drops that placeholder (and any other incomplete entry)
    # without assuming where it sits or that the list is non-empty.
    return [
        entry['thumbURL']
        for entry in entries or []
        if isinstance(entry, dict) and entry.get('thumbURL')
    ]


def download_image(key_word, save_name, download_max):
    """Download Baidu image-search thumbnails for one keyword.

    Images are written to ``images/<save_name>/<n>.jpg``, where ``n`` starts
    at the page's result offset and increases by one per thumbnail.

    Args:
        key_word: Search keyword.
        save_name: Name of the sub-folder created under ``images``.
        download_max: Number of result pages to fetch (each request asks for
            30 results), not the number of images.

    Raises:
        requests.RequestException: On network failure or timeout.
        ValueError: If a result page is not valid JSON.
    """
    # Each keyword gets its own folder.
    save_path = 'images' + '/' + save_name
    os.makedirs(save_path, exist_ok=True)

    download_sum = 0
    while download_sum < download_max:
        # Page offset; it is also the number used for the first file of the page.
        offset = download_sum * 29 + 1
        thumb_urls = _fetch_thumb_urls(key_word, offset)
        if not thumb_urls:
            # No usable results on this page: further pages are not requested.
            print('第%d页没有可下载的图片，提前结束' % (download_sum + 1))
            break
        for n, thumb_url in enumerate(thumb_urls, start=offset):
            response = requests.get(url=thumb_url, headers=headers,
                                    timeout=_REQUEST_TIMEOUT)
            if not response.ok:
                # Do not store an HTTP error page under a .jpg name.
                continue
            with open(save_path + '/' + str(n) + '.jpg', 'wb') as fp:
                fp.write(response.content)
        download_sum += 1
    print('下载完成')


# Remove downloaded files that are not usable JPEG/PNG colour images.
def _remove_image(image):
    """Delete *image* and report the outcome instead of raising on failure."""
    try:
        os.remove(image)
    except OSError as err:
        # One undeletable file must not abort the rest of the cleanup.
        print('删除失败：%s (%s)' % (image, err))
    else:
        print('已删除：%s' % image)


def _is_valid_image(image):
    """Return True when *image* is a JPEG/PNG whose pixel array is not 2-D."""
    # imghdr.what returns the detected type name, or None if unrecognised.
    if imghdr.what(image) not in ('jpeg', 'png'):
        return False
    # Close the file handle as soon as the pixels are loaded.
    with Image.open(image) as opened:
        pixels = numpy.array(opened)
    # A two-dimensional array has no channel axis; such images are treated
    # as grayscale and rejected.
    return len(pixels.shape) != 2


def delete_error_image(father_path):
    """Delete invalid images from every sub-folder of *father_path*.

    A file is deleted when it is not detected as JPEG or PNG, when its
    decoded pixel array is two-dimensional, or when it cannot be inspected
    at all. Files directly inside *father_path* are left untouched; only its
    immediate sub-folders are scanned.

    Args:
        father_path: Directory whose sub-folders contain the images.

    The cleanup is best effort: a missing or unreadable directory is skipped
    silently and a failed deletion is reported but does not stop the run.
    """
    try:
        image_dirs = os.listdir(father_path)
    except OSError:
        return
    for dir_name in image_dirs:
        image_dir = os.path.join(father_path, dir_name)
        # Only sub-folders are scanned for images.
        if not os.path.isdir(image_dir):
            continue
        try:
            images = os.listdir(image_dir)
        except OSError:
            continue
        for image_name in images:
            image = os.path.join(image_dir, image_name)
            try:
                keep = _is_valid_image(image)
            except Exception:
                # Unreadable or corrupt files are removed as well. Catching
                # Exception (not a bare except) keeps Ctrl-C from deleting
                # the image being processed.
                keep = False
            if not keep:
                _remove_image(image)


if __name__ == '__main__':
    # Keywords to search for; each keyword is also used as the name of its
    # folder under images/.
    key_words = {
        # Fill in the keywords to search for here.
        }
    # Fetch 10 result pages for every keyword.
    for key_word in key_words:
        download_image(key_word, save_name=key_word, download_max=10)

    # Finally, remove the files that are not valid images.
    delete_error_image(father_path='images/')