# Thread-pool crawler for haohua.com (好花网)

'''
需要修改部分:
1. style_list
2. skudic['type']
3.with open ('spulist_1.json','wt') as f00: 文件存储路径
'''

from selenium import webdriver
from selenium.webdriver.common.by import By  # 通过什么
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC  #期望的条件达到
from selenium.webdriver.support.wait import WebDriverWait  # 等待
from selenium.webdriver.chrome.options import Options
import uuid
import json
import requests

# chrome_options = Options()
# chrome_options.add_argument('--headless')  # headless mode: required when running on a Linux box with no display
# driver = webdriver.Chrome(options=chrome_options)
# Single shared Chrome driver, used by both the listing crawl below and task().
driver = webdriver.Chrome()
# Category URL path fragments; uncomment the full list to crawl every category.
# style_list = ['mg/','bhh/','knx/','xrk/','zll/','mtx/']
style_list = ['mg/']
base_url = 'https://www.haohua.com/xianhua/'

total_url = []   # one list of detail-page URLs per category
main_imgs = []   # main-image URLs (appended to by task() and main())
big_lists = []   # raw per-product scrape results: [title, desc, price, sku_dic, spu_imgs]

spulist = []        # SPU rows, written to spulist_1.json
skulist = []        # SKU rows, written to skulist_1.json
spu_pic_list = []   # SPU gallery-image rows, written to spu_pic_list_1.json
index = 0           # running index over ALL SPU gallery images (global, never reset)

# spu row:     title, detail, spu_main_img, price, uid (random uuid that links SPU <-> SKU)
# spuimgs row: uid, main_imgs (the gallery images)
# sku row:     uid, type, name, price, img


# Crawl each category listing page and collect every product-detail URL.
for style in style_list:
    driver.get(base_url + style)

    # Explicit wait: proceed only once the product thumbnails are present.
    WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "imghover")))

    # Smooth-scroll to the page bottom so lazily rendered items load.
    page_height = driver.execute_script("return document.body.clientHeight")
    driver.execute_script("""
        window.scrollTo({
            top: %s,
            behavior:"smooth"
        });
        """ % page_height)

    # Every product card is an <a class="imghover"> whose href points at a
    # detail page, e.g. https://www.haohua.com/xianhua/45177.html
    anchors = driver.find_elements_by_class_name('imghover')
    total_url.append([a.get_attribute('href') for a in anchors])


# Given one detail-page URL, scrape its data.
def task(single_url):
    """Scrape a single product page.

    Returns ``[title, desc, price, sku_dic, spu_imgs]`` where ``sku_dic``
    maps SKU name -> image URL and ``spu_imgs`` is the gallery image URLs.

    NOTE(review): uses the module-global selenium driver, so concurrent
    calls are unsafe; also appends the first gallery image to the global
    ``main_imgs`` list as a side effect.
    """
    driver.get(single_url)
    title = driver.find_element_by_class_name('shop-title').text
    desc = driver.find_element_by_class_name('shop-description').text
    price = driver.find_element_by_class_name('sell-val').text

    # SKU variants: each <img> in the specs block carries the SKU name in
    # its @title attribute and the image in @src.
    sku_dic = {
        img.get_attribute('title'): img.get_attribute('src')
        for img in driver.find_element_by_class_name('specs-item')
                         .find_elements_by_tag_name('img')
    }

    # Gallery: one <img> per <li> in the preview strip.
    preview = driver.find_element_by_class_name('shop-preview-item')
    spu_imgs = [
        li.find_element_by_tag_name('img').get_attribute('src')
        for li in preview.find_elements_by_tag_name('li')
    ]
    main_imgs.append(spu_imgs[0])

    return [title, desc, price, sku_dic, spu_imgs]


# NOTE(review): mid-file imports; conventionally these belong at the top of the file.
from concurrent.futures import ThreadPoolExecutor
import time

# Pool of 3 workers.  The tasks all share one global selenium driver, so true
# parallelism would be unsafe — main() deliberately serializes them.
pool = ThreadPoolExecutor(3)
# Detail-page URLs of the first (and currently only) crawled category.
# NOTE(review): raises IndexError if the listing crawl collected nothing.
type_urls = total_url[0]
def main():
    """Scrape every detail URL, accumulating into big_lists / main_imgs.

    NOTE(review): ``.result()`` is called immediately after each submit,
    which blocks until that task finishes — so despite the pool, the tasks
    run strictly one at a time.  That serialization is what keeps the
    single shared selenium driver safe; submitting everything first and
    collecting results later would break it.
    """
    for type_url in type_urls:
        done = pool.submit(task,type_url)
        lis = done.result()
        big_lists.append(lis)
        # NOTE(review): task() already appended spu_imgs[0] to main_imgs;
        # this also adds the SECOND gallery image — confirm this is intended.
        main_imgs.append(lis[4][1])
        print(lis)  # nested lists; the main image is extracted later and saved to a folder

# NOTE(review): only the crawl is guarded — the download/export code further
# down runs at import time as well, since it sits outside this guard.
if __name__ == '__main__':
    main()
    # wait=True: block until every queued task has completed.
    pool.shutdown(wait=True)



# Pair each main-image URL with its filename (last path segment) for saving.
img_name_url = [[url.split('/')[-1], url] for url in main_imgs]


# Flatten the raw scrape results into the three output tables:
#   spulist      - one dict per product (SPU)
#   skulist      - one dict per variant (SKU), linked to its SPU by uid
#   spu_pic_list - one dict per gallery image, linked to its SPU by uid
sku_name_urls = []
spu_name_urls = []


for record in big_lists:
    title, detail, price, sku_map, gallery = record
    # Random uuid ties each SPU row to its SKU and gallery rows.
    uid = str(uuid.uuid4())

    spulist.append({
        'title': title.split('-')[-1],
        'detail': detail,
        'price': price,
        # keep only the filename; images land in the local main_img/ folder
        # e.g. 'https://www.haohua.com/upload/image/2018-11/22/2761c_182d6.png'
        'spu_main_img': 'main_img/' + gallery[0].split('/')[-1],
        'uid': uid,
    })

    # SKU rows: sku_map is {'12枝紫罗兰': img_url, '9枝紫罗兰': img_url, ...};
    # the images themselves go into the SKUimg/ folder.
    for name, img_url in sku_map.items():
        sku_name_urls.append(img_url)  # collect every SKU image URL for download later
        skulist.append({
            'uid': uid,
            'type': 1,  # category id — change per crawled category (see header note)
            'name': name,
            'price': price,
            'img': 'SKUimg/' + img_url.split('/')[-1],
        })

    # SPU gallery rows (gallery is [url1, url2, ...]).
    for pic_url in gallery:
        spu_name_urls.append(pic_url)  # collect every SPU image URL for download later
        index += 1  # NOTE(review): global counter runs across ALL SPUs, never resets
        spu_pic_list.append({
            'uid': uid,
            'img': 'SPUimg/' + pic_url.split('/')[-1],
            'index': index,
        })



def _download(url, save_path):
    """Fetch one image URL and write it to save_path; non-200 responses are skipped.

    Fixes two defects in the original copy-pasted loops:
    * files were opened with 'ab' (append-binary), so re-running the script
      appended bytes onto existing files and corrupted the images — 'wb'
      overwrites instead;
    * requests.get had no timeout and could hang the whole script forever.
    """
    r = requests.get(url, timeout=10)
    if r.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(r.content)


# Main image (one per SPU) -> main_img/
for name_url in img_name_url:
    _download(name_url[1], 'F:/期中架构/practice2/main_img/' + name_url[0])

# SKU images -> SKUimg/
for sku_name in sku_name_urls:
    _download(sku_name, 'F:/期中架构/practice2/SKUimg/' + sku_name.split('/')[-1])

# SPU gallery images -> SPUimg/
for spu_name in spu_name_urls:
    _download(spu_name, 'F:/期中架构/practice2/SPUimg/' + spu_name.split('/')[-1])


# Persist the three tables (spulist, skulist, spu_pic_list) as JSON files.
# Explicit encoding='utf-8' + ensure_ascii=False: the original relied on the
# platform-default encoding and \uXXXX escapes, producing locale-dependent,
# unreadable output for the Chinese titles and SKU names.

with open('spulist_1.json', 'wt', encoding='utf-8') as f00:
    json.dump(spulist, f00, ensure_ascii=False)

with open('skulist_1.json', 'wt', encoding='utf-8') as f11:
    json.dump(skulist, f11, ensure_ascii=False)

with open('spu_pic_list_1.json', 'wt', encoding='utf-8') as f22:
    json.dump(spu_pic_list, f22, ensure_ascii=False)
# Original source: https://www.cnblogs.com/Afrafre/p/10767839.html