Python 商品分类信息

  采集商品分类信息

from selenium.webdriver.common.action_chains import ActionChains
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
# Scrape category data
def tianmao_catch_category():
    """Scrape the three-level category tree from the Tmall home page and save it.

    Opens the home page in a driver obtained from ``get_driver``, hovers over
    every entry of the left-hand navigation list to collect the first-level
    names, then parses the page HTML with PyQuery and stores every
    (first, second, third) level triple through ``db.saveCategory`` under the
    site name '天猫'.

    Any ``Exception`` raised while scraping is printed rather than propagated,
    and the driver is always shut down before the function returns.
    """
    # Local import: ``find_elements_by_xpath`` no longer exists in current
    # Selenium releases, while ``find_elements(By.XPATH, ...)`` works on both
    # old and new versions.
    from selenium.webdriver.common.by import By

    driver = get_driver('', False)
    try:
        url = 'https://www.tmall.com/?ali_trackid=2:mm_26632258_3504122_55934697:1609295236_235_1586302010&union_lens=recoveryid:1609295236_235_1586302010&clk1=3a059b6fd5d21a5e9086e711fdf3afe4&bxsign=tbkJxFfRkMJdwE3OwpP483v2+4G1PrzCDIDumBW7tv5QzQfc+xlm3i2oiRMn2bJl4qaPrxH6ekD1p3hgS1sBUJbM4REq9LyuFhLBITi5yXSBSs='
        driver.get(url)

        # Fixed pause after navigation before the DOM is queried.
        time.sleep(10)
        nav_items = driver.find_elements(By.XPATH, "//ul[@class='normal-nav clearfix']/li")
        top_names = []

        for nav_item in nav_items:
            # Mouse hover; presumably this makes the page render the matching
            # sub-category panel that is parsed further down.
            ActionChains(driver).move_to_element(nav_item).perform()
            # NOTE(review): the source also called .replace('', '') here, which
            # is a no-op; presumably an icon glyph (e.g. '\ue615') was lost
            # from the literal. The token handling below already drops a
            # leading icon token, so the call is omitted.
            title = str(nav_item.text).replace(' /', '/').strip()
            tokens = title.split(' ')
            if len(tokens) == 1:
                top_names.append(tokens[0])
            else:
                # Drop the leading icon token and keep the rest. Titles with
                # more than two tokens used to be skipped entirely, which
                # shifted every later name against its panel.
                top_names.append(' '.join(tokens[1:]))
            time.sleep(3)

        page_html = driver.execute_script("return document.documentElement.outerHTML")
        doc = pq(page_html)
        panels = doc("div[class='content-con j_categoryContent']").find(
            "div[class='pannel-con j_CategoryMenuPannel']").find("div[class^='pannel-']")
        print('\n')
        netname = '天猫'
        # Panels are matched to first-level names purely by position.
        for index, panel in enumerate(panels.items()):
            category_one = top_names[index]
            hot_lines = panel.find("div[class='hot-word-con']").find("div[class='hot-word-line']")
            for line in hot_lines.items():
                category_two = line.find("div[class='line-title']").find("div[class='title-text']").text()
                links = line.find("div[class='line-con']").find("a[class^='hot-word']")
                for link in links.items():
                    category_three = link.text()
                    print(category_one, category_two, category_three)
                    db.saveCategory(netname, category_one, category_two, category_three)
            print('\n')
    except Exception as ex:
        print(ex)
    finally:
        # Always release the browser, even on KeyboardInterrupt and the like.
        driver.quit()

# Scrape category data
def jingdong_catch_category():
    """Scrape the three-level category tree from the JD home page and save it.

    Opens the home page in a driver obtained from ``get_driver``, hovers over
    every entry of the category menu to collect the first-level names, then
    parses the page HTML with PyQuery. For each pop-up panel it stores, through
    ``db.saveCategory`` under the site name '京东':

    * the channel links, with the first link used as the second-level name
      and every following link as a third-level name beneath it;
    * every detail group (``<dl>``), with the group's own title as the
      second-level name and each of its links as a third-level name.

    Any ``Exception`` raised while scraping is printed rather than propagated,
    and the driver is always shut down before the function returns.
    """
    # Local import: ``find_elements_by_xpath`` no longer exists in current
    # Selenium releases, while ``find_elements(By.XPATH, ...)`` works on both
    # old and new versions.
    from selenium.webdriver.common.by import By

    driver = get_driver('', False)
    try:
        url = 'https://www.jd.com/?cu=true&utm_source=baidu-pinzhuan&utm_medium=cpc&utm_campaign=t_288551095_baidupinzhuan&utm_term=0f3d30c8dba7459bb52f2eb5eba8ac7d_0_48ba7a220ee5462c97fc2d5f3691e5c5'
        driver.get(url)
        # Fixed pause after navigation before the DOM is queried.
        time.sleep(10)
        menu_items = driver.find_elements(
            By.XPATH, "//ul[@class='JS_navCtn cate_menu']/li[@class='cate_menu_item']")

        top_names = []

        for menu_item in menu_items:
            # Mouse hover; presumably this makes the page render the matching
            # pop-up panel that is parsed further down.
            ActionChains(driver).move_to_element(menu_item).perform()
            # NOTE(review): the source also called .replace('', '') here, which
            # is a no-op; presumably a glyph was lost from the literal.
            # Restore it if the intended character is known.
            data_title = str(menu_item.text).replace('/ ', '/').replace(' /', '/').strip()

            print('data_title=', data_title)

            top_names.append(data_title)
            time.sleep(3)

        page_html = driver.execute_script("return document.documentElement.outerHTML")
        doc = pq(page_html)
        panels = doc("div[id='J_popCtn']").find("div[class='cate_part clearfix']")
        print('\n')
        index = 0
        netname = '京东'

        # Panels are matched to first-level names purely by position.
        for panel in panels.items():
            category_one = top_names[index]

            channel_links = panel.find("div[class='cate_part_col1']").find(
                "div[class='cate_channel']").find("a[class='cate_channel_lk']")
            category_two = ''
            for position, link in enumerate(channel_links.items()):
                if position == 0:
                    # The first channel link is the heading for the others.
                    category_two = str(link.text())
                else:
                    category_three = str(link.text())
                    print(category_one, category_two, category_three)
                    db.saveCategory(netname, category_one, category_two, category_three)

            detail_groups = panel.find("div[class='cate_part_col1']").find("div[class='cate_detail']").find(
                "dl[class^='cate_detail_item cate_detail_item']")
            for group in detail_groups.items():
                # Each group is queried for its own title and its own links.
                # The earlier version read the title from the first group
                # only, never read that group's links, and filed every later
                # group's links under the first group's title.
                category_two = str(
                    group.find("dt[class='cate_detail_tit']").find("a[class='cate_detail_tit_lk']").text())
                group_links = group.find("dd[class='cate_detail_con']").find("a[class='cate_detail_con_lk']")
                for link in group_links.items():
                    category_three = str(link.text())
                    print(category_one, category_two, category_three)
                    db.saveCategory(netname, category_one, category_two, category_three)
            index += 1
            print('\n')
        # Number of panels processed.
        print(index)
    except Exception as ex:
        print(ex)
    finally:
        # Always release the browser, even on KeyboardInterrupt and the like.
        driver.quit()

  

原文地址:https://www.cnblogs.com/shaosks/p/14214849.html