阿X妈X团长信息采集脚本

某宝使用Chrome自动化会被检测,所以采用Firefox浏览器做自动化。运行此脚本前请先安装Firefox,并将驱动文件放在运行目录下的 drivers\geckodriver.exe 路径。

# -*- coding: utf-8 -*-
# @Time : 2020-4-22 22:39
# @Author : hlikex
# @File : main.py

from selenium import webdriver
from urllib import parse
from selenium.webdriver.chrome.options import Options
import logging
import time
import random
import pandas as pd
import os
import re


class Taobao:
    """Scrape team-leader ("团长") records from the Alimama campaign list.

    A Firefox session is driven through Selenium: the list page is opened,
    the login form is filled in, and every table row is turned into one
    record that is appended to ``data.xls`` in the working directory.
    """

    # Column order of the result sheet.
    COLUMNS = ("团长", '旺旺', 'V标团长', '总成交金额(星级)', "总成交笔数(星级)",
               '打爆能力(星级)', '服务时间', '钉钉', '团队介绍')

    # Patterns applied to the outerHTML of a table row / the detail dialog.
    _NAME_RE = re.compile(r"{cpName:'(.*?)',cpPubId:'.*?'}")
    _UID_RE = re.compile(r'uid=(.*?)&')
    # "?" and the digit class must be escaped; without the backslashes the
    # pattern can never match a star image URL.
    _STAR_RE = re.compile(r'app/gallery/mx-effects/star\?num=(\d)')
    _SERVICE_RE = re.compile(r"服务时间:</div><div>(.*?)</div>")
    _DING_RE = re.compile(r'dingtalk_id=(.*?)"')
    _TAG_RE = re.compile(r'<.*?>')

    # Absolute XPath of the (possibly truncated) team introduction paragraph.
    _ABOUT_XPATH = '/html/body/div[4]/div/div/div[2]/div/div[1]/div/div[2]/div[3]/p[2]'
    # Absolute XPath of the popup that shows the full introduction text.
    _ABOUT_FULL_XPATH = '/html/body/div[5]/div/div/div/div[1]/div[2]'

    def __init__(self):
        """Load previously saved results (if any) and start Firefox."""
        self.root_url = "https://ad.alimama.com/zhaoshang/cpevent/index.htm?srcCode=1&pageNo=1&onlyCanJoin=0&pageSize=800"
        if os.path.exists("data.xls"):
            self.df = pd.read_excel("data.xls", index_col=0)
        else:
            self.df = pd.DataFrame(columns=self.COLUMNS)
        # The driver binary is expected at <cwd>/drivers/geckodriver.exe.
        driver_path = os.path.join(os.getcwd(), "drivers", "geckodriver.exe")
        self.driver = webdriver.Firefox(executable_path=driver_path)

        self.driver.implicitly_wait(10)
        logging.basicConfig(format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
                            level=logging.WARNING)

    def isElementExist(self, id, element):
        """Return True if ``find_element(id, element)`` succeeds, else False.

        ``id`` is the locator strategy (e.g. ``'link text'``) and ``element``
        is the locator value.
        """
        try:
            self.driver.find_element(id, element)
        except Exception:
            return False
        return True

    def login(self):
        """Open the list page and submit the login form inside its iframe.

        Any failure is printed and swallowed so that the caller can continue
        (for example when the session is already logged in).
        """
        self.driver.get(self.root_url)
        try:
            self.driver.switch_to.frame('taobaoLoginIfr')
            self.driver.find_element_by_id('fm-login-id').send_keys("username")   # your account name
            self.driver.find_element_by_id('fm-login-password').send_keys('password')  # your password
            self.driver.find_element_by_css_selector('[class="fm-button fm-submit password-login"]').click()
            time.sleep(3)
            print(self.driver.current_url)
            self.driver.switch_to.default_content()
            self.driver.refresh()
            time.sleep(2.5)
        except Exception as e:
            print(e)

    @staticmethod
    def _parse_row(html):
        """Extract the row-level fields from a table row's outerHTML.

        Returns a dict with the name, Wangwang id, V-badge flag and the three
        star ratings, or ``None`` when the row lacks any of them (header
        rows, malformed rows).
        """
        html = str(html)
        names = Taobao._NAME_RE.findall(html)
        uids = Taobao._UID_RE.findall(html)
        stars = Taobao._STAR_RE.findall(html)
        # The Wangwang id is the second "uid=" occurrence in the row.
        if not names or len(uids) < 2 or len(stars) < 3:
            return None
        return {
            "团长": names[0],
            '旺旺': uids[1],
            # NOTE(review): both branches wrote "" before, so the column
            # carried no information; "是"/"否" is a chosen replacement.
            'V标团长': "是" if 'V标团长' in html else "否",
            '总成交金额(星级)': stars[0],
            "总成交笔数(星级)": stars[1],
            '打爆能力(星级)': stars[2],
        }

    def _read_about(self):
        """Return the team introduction shown in the open detail dialog.

        When the text is truncated ("...更多"), the full text is read from
        the popup opened by the "more" link, which is then confirmed away.
        """
        about = self.driver.find_element_by_xpath(self._ABOUT_XPATH).text
        if '...更多' in str(about):
            self.driver.find_element_by_xpath(self._ABOUT_XPATH + '/span').click()
            full_html = self.driver.find_element_by_xpath(self._ABOUT_FULL_XPATH).get_attribute('outerHTML')
            about = self._TAG_RE.sub("", full_html)
            self.driver.find_element_by_link_text('确定').click()
        return about

    def _dismiss_dialogs(self):
        """Click away the "确定" popup and the "关闭" dialog if present."""
        # The confirm popup must be dismissed first, then the detail dialog.
        for label in ("确定", "关闭"):
            if self.isElementExist('link text', label):
                self.driver.find_element_by_link_text(label).click()

    def getInfo(self):
        """Collect one record per table row and persist it to ``data.xls``.

        Rows whose name is already stored are skipped, as are rows that do
        not contain the expected fields. A failure while a row's detail
        dialog is being read is printed, the dialogs are closed, the sheet
        is saved and the loop continues with the next row.
        """
        # Built once; kept in sync as new rows are stored.
        known_names = set(self.df["团长"].values.tolist())
        for element in self.driver.find_elements_by_tag_name('tr'):
            try:
                record = self._parse_row(element.get_attribute("outerHTML"))
                if record is None or record["团长"] in known_names:
                    continue

                # Open the detail dialog of this row.
                element.find_element_by_css_selector('[class="card pointer"]').click()
                time.sleep(2)
                detail_html = self.driver.find_element_by_css_selector(
                    '[class="adv-threebq adv-threebu"]').get_attribute("outerHTML")
                service_date = self._SERVICE_RE.findall(detail_html)[0]
                ding = "".join(str(parse.unquote(i)) for i in self._DING_RE.findall(detail_html))
                about = self._read_about()

                record.update({'服务时间': service_date, '钉钉': ding, '团队介绍': about})
                print(record)
                new_row = pd.DataFrame(record, index=[len(self.df) + 1])

                self.driver.switch_to.default_content()
                time.sleep(1.5)
                self.driver.find_element_by_link_text('关闭').click()

                # DataFrame.append was removed in pandas 2.0.
                self.df = pd.concat([self.df, new_row])
                known_names.add(record["团长"])
                self.df.to_excel("data.xls")
            except Exception as e:
                self._dismiss_dialogs()
                self.df.to_excel("data.xls")
                print(e)

    def nextPage(self):
        """Click the "next page" arrow of the list; failures are printed."""
        try:
            self.driver.switch_to.default_content()
            self.driver.find_element_by_css_selector('[class="mc-iconfont adv-threefY rotate180 "]').click()
        except Exception as e:
            print(e)

    def run(self):
        """Log in, then scrape the list page once a minute.

        The loop stops when the current URL no longer carries a ``pageNo``
        parameter, which the script reports as the user being logged out.
        """
        self.login()
        while True:
            pages = re.findall(r"pageNo=(.*?)&", str(self.driver.current_url))
            if not pages:
                print("用户被注销")
                # Previously execution fell through and used an unbound or
                # stale page number.
                break
            print("当前页码:{}".format(pages[0]))
            self.getInfo()
            time.sleep(60)
            self.driver.refresh()


if __name__ == '__main__':
    T = Taobao()
    T.run()
原文地址:https://www.cnblogs.com/hlikex/p/12768185.html