# Scrape car data from a car website (爬取汽车网站汽车数据)

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import datetime
import openpyxl
import re
import time
import os


def get_connect():
    """Start a headless Firefox session opened on the dongchedi car library.

    Returns:
        The ``webdriver.Firefox`` instance, already navigated to the library
        page and configured with a 5-second implicit wait. The caller owns
        the browser and is responsible for calling ``quit()`` on it.

    Raises:
        Whatever Selenium raises while navigating; the browser is shut down
        before the exception propagates.
    """
    firefox_options = Options()
    # Run headless. The command-line flag is used instead of the ``headless``
    # property because recent Selenium releases no longer provide the property.
    firefox_options.add_argument("-headless")
    # ``options=`` is the supported keyword; ``firefox_options=`` is deprecated
    # in Selenium 3.8+ and rejected by Selenium 4.
    browser = webdriver.Firefox(options=firefox_options)
    try:
        browser.get("https://www.dongchedi.com/auto/library/x-x-x-x-x-x-x-x-x-x-x")
        browser.implicitly_wait(5)
    except Exception:
        # Do not leave an orphaned browser process behind if start-up fails.
        browser.quit()
        raise
    return browser


def parse_car_data():
    """Scrape car-series rows from the dongchedi library page and save them.

    Opens a browser with ``get_connect()``, then clicks through every letter
    tab (the first two tabs are skipped), every brand listed under that tab
    and every car-type filter. For each series item shown it records one row:

        [brand id, series id, brand name, series name, car type,
         price text, link href, date string from ``get_time()``]

    Brand and series ids are running counters starting at 1. After scraping,
    the rows are passed to ``save_car_data()``. The browser is always shut
    down once scraping ends, whether it succeeded or raised.
    """
    # Imported locally so this function does not depend on other edits to the
    # file's import block. ``find_element(By.XPATH, ...)`` works on Selenium 3
    # and 4, whereas ``find_element_by_xpath`` was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    # XPath fragments shared by the lookups below.
    page_root = "//div[@class='wrap tw-bg-white']//"
    brand_panel = (page_root
                   + "div[@class='jsx-1042301898 item-wrap']//"
                     "div[@class='jsx-1042301898 item-list']//")
    series_item = (page_root
                   + "section//"
                     "div[@class='jsx-3448462877 list-wrap']//"
                     "ul[@class='jsx-3448462877 list tw-grid tw-grid-cols-12 tw-gap-x-12 tw-gap-y-16']//"
                     "li")

    browser = get_connect()
    # Collected rows
    car_data = []
    # Brand id
    car_brand_id = 1
    # Series id
    car_bank_id = 1
    try:
        # Letter tabs (A, B, C, ...) from the first ul; the leading two
        # entries ("不限" and "热门") are dropped.
        letter_tabs = browser.find_elements(
            By.XPATH,
            brand_panel
            + "ul[@class='jsx-975855502 tw-flex md:tw-flex-none']//"
              "li")[2:]
        # Car-type filters (sedan, SUV, MPV, ...)
        car_type_spans = browser.find_elements(
            By.XPATH,
            page_root
            + "section//"
              "div[@class='jsx-964070570 tw-flex']//"
              "ul[@class='jsx-964070570 tw-flex-1']//"
              "li//"
              "a[@class='jsx-964070570']//"
              "span[@class='jsx-964070570 series-type_car-name__3pZLx']")

        for index, letter_tab in enumerate(letter_tabs, start=1):
            letter_tab.click()
            # All brands listed under the current letter
            brand_lis = browser.find_elements(
                By.XPATH,
                brand_panel
                + "div[@class='jsx-1207899626 more-list-wrap']//"
                + "ul[{}]//li".format(index))
            for brand_li in brand_lis:
                brand_li.click()
                brand_name = brand_li.text
                print("{}品牌数据开始爬取---------->".format(brand_name))
                for car_type_span in car_type_spans:
                    car_type_span.click()
                    # Work around incomplete loading by making the window
                    # very tall (the alternative would be scrolling).
                    browser.set_window_size(1000, 30000)
                    time.sleep(3)

                    car_type = car_type_span.text
                    # Number of series items currently shown; nothing to do
                    # when the list is empty.
                    series_count = len(browser.find_elements(By.XPATH, series_item))
                    for position in range(1, series_count + 1):
                        print("第{}个车系数据开始爬取---------->".format(car_bank_id))
                        # XPath positions are 1-based.
                        item = "{}[{}]//".format(series_item, position)
                        bank_name = browser.find_element(
                            By.XPATH,
                            item
                            + "a[@class='jsx-2744368201 item-link']//"
                              "p[@class='jsx-2744368201 car-name']").text
                        car_price = browser.find_element(
                            By.XPATH,
                            item
                            + "a[@class='jsx-2744368201 item-link']//"
                              "p[@class='jsx-2744368201 price']").text
                        # href of the second link in the item's button row
                        car_image_src = browser.find_element(
                            By.XPATH,
                            item
                            + "div[@class='jsx-2682525847 button-wrap tw-grid tw-grid-cols-12 tw-gap-x-3']//"
                              "a[2]").get_attribute("href")
                        car_data.append([car_brand_id, car_bank_id, brand_name, bank_name,
                                         car_type, car_price, car_image_src, get_time()])
                        car_bank_id += 1
                print("{}品牌数据爬取结束---------->".format(brand_name))
                car_brand_id += 1
    finally:
        # Always release the browser; otherwise the headless Firefox process
        # keeps running after the script ends or crashes.
        browser.quit()
    print("数据开始保存---------->")
    save_car_data(car_data)
    print("数据保存成功---------->")


def format_car_data(data):
    """Strip spaces and bracketed segments from ``data``.

    Every ASCII space is removed first. Then each shortest ``(...)``,
    ``{...}`` or ``[...]`` segment, brackets included, is deleted and the
    remaining text is returned.
    """
    bracketed = re.compile(r"\(.*?\)|\{.*?}|\[.*?]")
    without_spaces = "".join(data.split(" "))
    return bracketed.sub("", without_spaces)


def save_car_data(car_data):
    """Append scraped rows to the dated workbook under ``../dataset/``.

    The file is named ``<get_time()>_car_data.xlsx``. If it does not exist
    yet, a new workbook with the header row is created (even when
    ``car_data`` is empty). If it already exists, the rows are appended to
    its active sheet; an existing file is left untouched when there is
    nothing to add.

    Args:
        car_data: Iterable of rows, each a sequence of cell values in header
            order.
    """
    path = "../dataset/" + get_time() + "_car_data.xlsx"
    # Saving does not create missing parent directories, so make sure the
    # target directory exists before writing.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    if os.path.exists(path):
        if not car_data:
            return
        wk = openpyxl.load_workbook(path)
        sheet = wk.active
    else:
        wk = openpyxl.Workbook()
        sheet = wk.active
        header = ('品牌id', '车系id', '品牌', '车系', '类型', '价格', '图片链接', '截止时间')
        sheet.append(header)

    for item in car_data:
        sheet.append(item)
    # Single write; the new workbook is no longer saved and reloaded first.
    wk.save(path)


def get_time():
    """Return the current local date as a ``YYYY_MM_DD`` string."""
    now = datetime.datetime.now()
    return now.strftime("%Y_%m_%d")


def start():
    """Entry point: run ``parse_car_data()``."""
    parse_car_data()


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    start()

# Original article (原文地址): https://www.cnblogs.com/MoooJL/p/15627402.html