google image

google图片抓取

Google 图片搜索结果中的缩略图是以 base64 编码(并非加密)的,编码后的数据放在页面的 script 标签里面

import pymysql
from lxml import etree
import logging
import requests
import time
import threading
from threading import RLock
import re
import os

# Module-wide re-entrant lock. google_images holds it around its
# cursor.execute() / db.commit() calls, because the main thread and the worker
# threads all go through the same connection and cursor.
lock = RLock()
import base64
import ssl

# SECURITY NOTE: this swaps the ssl module's default HTTPS context factory for
# the *unverified* one, so code that relies on that default context no longer
# checks server certificates.
# NOTE(review): presumably added to silence certificate errors -- confirm it is
# still needed before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context

# Logging setup: timestamped records are appended to a log file.

logging.basicConfig(
    filename='drugimagesError.log',  # destination log file
    filemode='a',  # file open mode: "w" or "a"
    level=logging.INFO,  # threshold level; records at this level and above are written
    format='%(asctime)s  %(filename)s  %(levelname)s : %(message)s',  # record layout
    datefmt='%Y-%m-%d %H:%M:%S',  # timestamp layout used by %(asctime)s
)


class google_images(object):
    """Fetch Google image-search thumbnails for drug records and store them.

    Records are read in batches from the ``guo_cai_jin_kou_yao_pin`` table.
    For every record whose approval number is not yet present in the
    ``drugimages`` table, a worker thread runs a Google image search, pulls
    the base64-encoded JPEG thumbnails out of the result page's inline
    script, inserts them into ``drugimages`` and writes them to
    ``./images/page<id // 1000>/id<id>/``.

    Constructing an instance runs the whole crawl; it ends by raising
    ``SystemExit`` once the source table is exhausted or ``end_record`` is
    reached.
    """

    # Number of source rows fetched per batch.
    BATCH_SIZE = 1000
    # Seconds to wait after starting each worker thread (request throttling).
    REQUEST_INTERVAL = 3
    # Seconds before a search request is abandoned instead of hanging a worker.
    REQUEST_TIMEOUT = 30
    # Maximum number of thumbnails handled per drug record.
    MAX_IMAGES = 30

    SEARCH_URL = 'https://www.google.com/search'
    DATA_URI_PREFIX = 'data:image/jpeg;base64,'

    # A JPEG data URI starting with the JFIF header the crawler looks for,
    # followed only by base64 characters and backslash escapes (e.g. "\x3d").
    # Bounding the character class stops a match from running on into the
    # text that follows it or into another image on the same line.
    _IMAGE_RE = re.compile(
        r'data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD[A-Za-z0-9+/=\\]*')
    # First comma-terminated item of the specification text (comma included).
    _SPEC_RE = re.compile(r".+?,")

    _SELECT_BATCH_SQL = (
        "select id, me_pizhunwenhao, me_name, me_jixing, me_key "
        "from guo_cai_jin_kou_yao_pin where id > %s order by id limit %s")
    _SELECT_EXISTING_SQL = "select id from drugimages where approvalNumber = %s"
    _INSERT_IMAGE_SQL = (
        "insert into drugimages(approvalNumber, drugName, specifications, image, num, durgid) "
        "values(%s, %s, %s, %s, %s, %s)")

    def __init__(self, start_record=1, end_record=10000001):
        """Connect to the database and crawl until there is nothing left.

        :param start_record: only rows with an id strictly greater than this
            value are processed (it is the resume point).
        :param end_record: the crawl stops when a row with this id or a
            larger one is reached; that row itself is not processed.
        :raises SystemExit: always, once the crawl is finished.
        """
        # NOTE: the attribute name keeps its historical spelling
        # ("strat_record") so existing code reading it keeps working.
        self.strat_record = start_record
        self.end_record = end_record
        self.db = pymysql.connect(host='localhost', port=3306, database='yao_zhi', user='root', password='root',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}

        try:
            # parse_page() ends the loop by raising SystemExit.
            while True:
                self.parse_page()
        finally:
            # Release the database resources however the crawl ends.
            self.cursor.close()
            self.db.close()

    def parse_page(self):
        """Process one batch of source rows.

        Starts one worker thread per row that has no images stored yet and
        waits for all of them before returning.

        :raises SystemExit: when no rows are left or ``end_record`` is reached.
        """
        with lock:
            # "order by id" makes the last row of the batch a valid resume
            # point for the next batch.
            row_count = self.cursor.execute(
                self._SELECT_BATCH_SQL, (self.strat_record, self.BATCH_SIZE))
            rows = self.cursor.fetchall()
        if not row_count:
            raise SystemExit

        workers = []
        try:
            for row in rows:
                record_id, approvalNumber, drugName, dosageForm = row[0], row[1], row[2], row[3]
                specifications = self._first_specification(row[4])
                self.strat_record = record_id
                logging.info("id:%s  approvalNumber:%s   drugName:%s   dosageForm:%s   specifications:%s",
                             record_id, approvalNumber, drugName, dosageForm, specifications)
                print("id:%s  approvalNumber:%s   drugName:%s   dosageForm:%s   specifications:%s" % (
                    record_id, approvalNumber, drugName, dosageForm, specifications))

                # ">=" rather than "==" so the crawl still stops when the
                # exact end id does not exist in the table.
                if int(record_id) >= int(self.end_record):
                    raise SystemExit

                with lock:
                    already_stored = self.cursor.execute(self._SELECT_EXISTING_SQL, (approvalNumber,))
                if already_stored:
                    continue

                worker = threading.Thread(
                    target=self._run_worker,
                    args=(record_id, approvalNumber, drugName, dosageForm, specifications))
                worker.start()
                workers.append(worker)
                time.sleep(self.REQUEST_INTERVAL)
        finally:
            # Always wait for running workers, also on the way out via
            # SystemExit: they still need the shared database connection.
            for worker in workers:
                worker.join()

    def _run_worker(self, id, approvalNumber, drugName, dosageForm, specifications):
        """Thread entry point: run parse_page_data and log any failure."""
        try:
            self.parse_page_data(id, approvalNumber, drugName, dosageForm, specifications)
        except Exception:
            logging.exception("id:%s  approvalNumber:%s   image download failed", id, approvalNumber)

    def parse_page_data(self, id, approvalNumber, drugName, dosageForm, specifications):
        """Search Google images for one drug and store up to MAX_IMAGES thumbnails.

        Each thumbnail is inserted into ``drugimages`` and written to
        ``./images/page<id // 1000>/id<id>/<approvalNumber>-<num>.jpg``.

        :param id: id of the source record.
        :param approvalNumber: approval number; stored with each image and
            used in the file name.
        :param drugName: drug name (part of the search keyword).
        :param dosageForm: dosage form (part of the search keyword).
        :param specifications: specification text (part of the search keyword).
        :raises requests.RequestException: when the search request fails.
        """
        print("id:%s  approvalNumber:%s   drugName:%s   specifications:%s" % (
            id, approvalNumber, drugName, specifications))
        keyword = self._build_keyword(drugName, dosageForm, specifications)
        # Passing the query through ``params`` lets requests URL-encode the
        # keyword (spaces, non-ASCII text, "&", "#", ...).
        response = requests.get(
            url=self.SEARCH_URL,
            params={'biw': 1920, 'bih': 900, 'tbm': 'isch', 'q': keyword},
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT)
        print(response.url)

        document = etree.HTML(response.content)
        scripts = document.xpath('//span[@id="xjs"]/script/text()') if document is not None else []
        if not scripts:
            logging.warning("id:%s  approvalNumber:%s   no image script in response (HTTP %s)",
                            id, approvalNumber, response.status_code)
            return

        directory = self._image_dir(id)
        prefix_length = len(self.DATA_URI_PREFIX)
        candidates = self._extract_images(scripts[0])[:self.MAX_IMAGES]
        for num, image in enumerate(candidates, start=1):
            # Decode before touching the database so an undecodable payload
            # does not leave a row without a matching file.
            try:
                pic_content = base64.b64decode(image[prefix_length:])
            except ValueError:  # binascii.Error is a ValueError subclass
                logging.warning("id:%s  approvalNumber:%s   image %s is not valid base64",
                                id, approvalNumber, num)
                continue

            logging.info("id:%s  approvalNumber:%s   drugName:%s   specifications:%s",
                         id, approvalNumber, drugName, specifications)
            with lock:
                try:
                    self.cursor.execute(
                        self._INSERT_IMAGE_SQL,
                        (approvalNumber, drugName, specifications, image, num, int(id)))
                    self.db.commit()
                except Exception:
                    self.db.rollback()
                    raise

            # exist_ok avoids the check-then-create race between workers
            # that write into the same page directory.
            os.makedirs(directory, exist_ok=True)
            with open(directory + str(approvalNumber) + '-' + str(num) + '.jpg', 'wb') as image_file:
                image_file.write(pic_content)

    @classmethod
    def _first_specification(cls, raw):
        """Return the first comma-terminated item of *raw* (comma included).

        *raw* is returned unchanged when it is not a string or has no such
        item.
        """
        if not isinstance(raw, str):
            return raw
        match = cls._SPEC_RE.search(raw)
        return match.group(0) if match else raw

    @staticmethod
    def _build_keyword(drugName, dosageForm, specifications):
        """Join the non-empty search terms with single spaces."""
        return ' '.join(str(part) for part in (drugName, dosageForm, specifications) if part)

    @staticmethod
    def _image_dir(id):
        """Return the directory (with trailing slash) for the images of record *id*."""
        return './images/page%d/id%s/' % (int(id) // 1000, id)

    @classmethod
    def _extract_images(cls, script_text):
        """Return the JPEG data URIs found in *script_text*, escapes decoded.

        The page script stores the payload with backslash escapes such as
        ``\\x3d`` for ``=``; these are turned back into plain characters.
        """
        images = []
        for raw in cls._IMAGE_RE.findall(script_text):
            try:
                images.append(raw.encode('utf-8').decode('unicode_escape'))
            except UnicodeDecodeError:
                logging.warning("skipping image with a malformed escape sequence")
        return images


if __name__ == '__main__':
    # Instantiating the class is what runs the crawl: its __init__ connects to
    # the database and then calls parse_page() in a loop.
    google_images()

  

原文地址:https://www.cnblogs.com/yoyo1216/p/10144493.html