Scraping Times newspaper data with python3 and IEDriver

    I recently tested scraping data from the XX Times. The site requires logging in first, and the login has a CAPTCHA. There are two ways to handle the CAPTCHA: one is a commercial CAPTCHA-solving platform, which works by uploading the CAPTCHA image to the platform and getting the decoded text back; the other is to research CAPTCHA recognition yourself, which I will look into when I have time.

    For now the goal is just to test scraping data from the XX Times, so the interim fallback is the crude approach of having a human type in the CAPTCHA.
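    The finalNews_IE class below actually goes a step further and recognizes the CAPTCHA itself by template matching: the CAPTCHA is cropped out of a page screenshot, denoised, split into four digit tiles, and each tile is matched against a library of known digit images by average-hash distance. A minimal standalone sketch of that matching step, assuming a template directory holding images named like code0_1.png (digit 0, sample 1); build_hash_library and recognize_tile are illustrative helpers, not part of the original script:

# Sketch: average-hash template matching for a 4-digit CAPTCHA.
# Assumes template_dir holds per-digit samples named code<digit>_<sample>.png.
import os
import imagehash
from PIL import Image

def build_hash_library(template_dir):
    """Map the average hash of each template image to the digit it shows."""
    library = {}
    for digit in range(10):
        for sample in range(1, 5):
            path = os.path.join(template_dir, "code%d_%d.png" % (digit, sample))
            if os.path.exists(path):
                library[imagehash.average_hash(Image.open(path))] = str(digit)
    return library

def recognize_tile(tile_path, library, max_dif=0):
    """Return the digit whose template hash is within max_dif of the tile's hash."""
    tile_hash = imagehash.average_hash(Image.open(tile_path))
    for template_hash, digit in library.items():
        if abs(tile_hash - template_hash) <= max_dif:
            return digit
    return ""  # no template matched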

    The login screen:

    

    The full code is as follows:

  

#coding=utf-8
import os
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
import collections
import mongoDbBase
import numpy
import imagehash
from PIL import Image,ImageFile
import datetime
class finalNews_IE:
    def __init__(self,strdate,logonUrl,firstUrl,keyword_list,exportPath,codedir):
        self.iniDriver()
        self.db = mongoDbBase.mongoDbBase()
        self.date = strdate
        self.firstUrl = firstUrl
        self.logonUrl = logonUrl
        self.keyword_list = keyword_list
        self.exportPath = exportPath
        self.codedir = codedir
        self.hash_code_dict ={}


    def iniDriver(self):
        # Path to IEDriverServer.exe (would normally come from a config file)
        IEDriverServer = r"C:\Program Files\Internet Explorer\IEDriverServer.exe"
        os.environ["webdriver.ie.driver"] = IEDriverServer
        self.driver = webdriver.Ie(IEDriverServer)

    def WriteData(self, message, fileName):
        fileName = os.path.join(os.getcwd(), self.exportPath + '/' + fileName)
        with open(fileName, 'a') as f:
            f.write(message)

    # Compute the average hash of an image file
    def get_ImageHash(self,imagefile):
        hash = None
        if os.path.exists(imagefile):
            with open(imagefile, 'rb') as fp:
                hash = imagehash.average_hash(Image.open(fp))
        return hash

    # Remove speckle noise: convert to grayscale, then hard-threshold to black/white
    def clearNoise(self, imageFile, x=0, y=0):
        if os.path.exists(imageFile):
            image = Image.open(imageFile)
            image = image.convert('L')
            image = numpy.asarray(image)
            image = ((image > 135) * 255).astype(numpy.uint8)
            image = Image.fromarray(image).convert('RGB')
            # save_name = 'D:\\work\\python36_crawl\\Veriycode\\mode_5590.png'
            # image.save(save_name)
            image.save(imageFile)
            return image

    # Split the CAPTCHA image into tiles
    # rownum: number of rows; colnum: number of columns; imagePath: output directory; imageFile: image to split
    def splitimage(self, imagePath,imageFile,rownum=1, colnum=4):
        img = Image.open(imageFile)
        w, h = img.size
        if rownum <= h and colnum <= w:
            print('Original image info: %sx%s, %s, %s' % (w, h, img.format, img.mode))
            print('Splitting image, please wait...')

            s = os.path.split(imageFile)
            if imagePath == '':
                dstpath = s[0]
            fn = s[1].split('.')
            basename = fn[0]
            ext = fn[-1]

            num = 1
            rowheight = h // rownum
            colwidth = w // colnum
            file_list =[]
            for r in range(rownum):
                index = 0
                for c in range(colnum):
                    # (left, upper, right, lower)
                    # box = (c * colwidth, r * rowheight, (c + 1) * colwidth, (r + 1) * rowheight)
                    # The four digits are slightly uneven, so widen the first two tiles
                    if index < 1:
                        colwid = colwidth + 6
                    elif index < 2:
                        colwid = colwidth + 1
                    else:
                        colwid = colwidth

                    box = (c * colwid, r * rowheight, (c + 1) * colwid, (r + 1) * rowheight)
                    newfile = os.path.join(imagePath, basename + '_' + str(num) + '.' + ext)
                    file_list.append(newfile)
                    img.crop(box).save(newfile, ext)
                    num = num + 1
                    index += 1
            return file_list

    def compare_image_with_hash(self, image_hash1, image_hash2, max_dif=5):
        """
        max_dif: maximum allowed hash distance; smaller is stricter, 0 means identical.
        This is the recommended way to compare two image hashes.
        """
        return abs(image_hash1 - image_hash2) <= max_dif

    # Screenshot the page and crop out the CAPTCHA image
    def savePicture(self):
        # self.driver.get(self.logonUrl)
        # self.driver.maximize_window()
        # time.sleep(2)

        self.driver.save_screenshot(self.codedir +"Temp.png")
        checkcode = self.driver.find_element_by_id("checkcode")
        location = checkcode.location  # x,y coordinates of the CAPTCHA element
        size = checkcode.size  # width and height of the CAPTCHA element
        rangle = (int(location['x']), int(location['y']), int(location['x'] + size['width']),
                  int(location['y'] + size['height']))  # the region we need to crop out
        i = Image.open(self.codedir + "Temp.png")  # open the full-page screenshot
        result = i.crop(rangle)  # crop just the CAPTCHA region from the screenshot
        filename = self.codedir + "Temp_code.png"
        result.save(filename)
        self.clearNoise(filename)
        file_list = self.splitimage(self.codedir,filename)
        time.sleep(3)
        verycode =''
        for f in file_list:
            imageHash = self.get_ImageHash(f)
            if imageHash:
                for h, code in self.hash_code_dict.items():
                    flag = self.compare_image_with_hash(imageHash, h, 0)
                    if flag:
                        # print(code)
                        verycode += code
                        break

        print(verycode)
        return verycode
        # print(verycode)
        # self.driver.close()

    def getVerycode(self, txtFile="verycode.txt"):
        with open(txtFile, 'r') as f:
            return f.read()

    def longon(self):
        # Build the digit template library: codeLibrary\code<digit>_<sample>.png
        for f in range(0, 10):
            for l in range(1, 5):
                file = os.path.join(self.codedir, "codeLibrary", "code" + str(f) + '_' + str(l) + ".png")
                # print(file)
                hash = self.get_ImageHash(file)
                self.hash_code_dict[hash] = str(f)

        flag = True
        try:
            self.driver.get(self.logonUrl)
            self.driver.maximize_window()
            time.sleep(2)
            verycode = self.savePicture()
            if len(verycode)==4:
                accname = self.driver.find_element_by_id("username")
                # accname = self.driver.find_element_by_id("//input[@id='username']")
                accname.send_keys('ctrchina')

                accpwd = self.driver.find_element_by_id("password")
                # accpwd.send_keys('123456')
                # code = self.getVerycode()
                checkcode = self.driver.find_element_by_name("checkcode")
                checkcode.send_keys(verycode)
                submit = self.driver.find_element_by_name("button")
                submit.click()
            else:
                flag = False
        except Exception as e1:
            message = str(e1.args)
            flag = False
        return flag

    # Collect the layout (page) links and their names
    def saveUrls(self):
        error = ''
        while True:
            flag = self.longon()
            time.sleep(2)
            if flag:
                try:
                    codefault = self.driver.find_element_by_xpath("//table[@class='table_login']/tbody/tr/td/font")
                    if codefault:
                        continue  # an error message means the CAPTCHA was wrong; log in again
                except Exception as e1:
                    pass
                break
        try:
            time.sleep(2)
            self.driver.get(self.firstUrl)
            self.driver.maximize_window()
            # urllb = "//div[@id='pageLink']/ul/div/div/a"
            urllb = "//a[@id='pageLink']"
            time.sleep(2)
            elements = self.driver.find_elements_by_xpath(urllb)
            url_layout_dict = collections.OrderedDict()
            for element in elements:
                layout = element.text
                # print(layout)
                if len(layout) == 0:
                    continue
                # layout = txt[txt.find(":") + 1:]
                link =  element.get_attribute("href")
                print(link)
                if link not in url_layout_dict:
                    url_layout_dict[link] = layout
            index = 0
            for sub_url, layout in url_layout_dict.items():
                if index == 0:
                    sub_url = ""  # the first layout page is already loaded in the driver
                print(index)
                self.getArticleLink(sub_url,layout)
                index+=1
        except Exception as e1:
            print("saveUrlsException")
            print("saveUrlsException:Exception" + str(e1.args))


    def getArticleLink(self, url,layout):
        error = ''
        try:
            if url:
                self.driver.get(url)
                self.driver.maximize_window()
            time.sleep(2)
            dt = datetime.datetime.now().strftime("%Y.%m.%d")
            urllb = "//div[@id='titleList']/ul/li/a"
            elements = self.driver.find_elements_by_xpath(urllb)
            url_layout_dict = {}
            for element in elements:
                txt = element.text
                txt = txt[txt.rfind(")") + 1:len(txt)]
                if txt.find("无标题") > -1 or txt.find("公 告") > -1 or txt.find("FINANCIAL NEWS") > -1 or txt.find(dt) > -1:
                    continue
                link = element.get_attribute("href")
                print(link)
                url_layout_dict[link] = layout
            self.db.SavefinalUrl(url_layout_dict,self.date)
        except Exception as e1:
            print("getArticleLink:Exception")
            print("getArticleLink:Exception" + str(e1.args))
            error = e1.args


    def catchdata(self):

        rows = self.db.GetfinalUrl(self.date)
        lst = []
        for row in rows:
            lst.append(row)
        print("rowcount:"+str(len(lst)))
        count =1
        for row in lst:
            url = row['url']
            layout = row['layout']
            try:
                self.driver.get(url)
                self.driver.maximize_window()
                time.sleep(1)
                title = ""

                # t1 = doc("div[class='text_c']")
                element = self.driver.find_element_by_class_name("text_c")
                title = element.find_element_by_css_selector("h3").text
                st = element.find_element_by_css_selector("h1").text
                if st:
                    title += "
" + st
                st = element.find_element_by_css_selector("h2").text
                if st:
                    title += "
" + st

                st = element.find_element_by_css_selector("h4").text
                if st:
                    if st.find("记者") == -1:
                        title += "
" + st
                    # else:
                    #     author = st.replace("记者","").replace("本报","").strip()
                elements = self.driver.find_elements_by_xpath("//div[@id='ozoom']/p")

                content = ""

                key = ""
                index = 0
                author = ''
                for element in elements:
                    txt = element.text.strip().replace("
", "")
                    content += txt
                    if index == 0:  # the first paragraph usually carries the byline (记者 = reporter)
                        if txt.find("记者") > 0 and txt.find("报道") > 0:
                            author = txt[txt.find("记者") + 2:txt.find("报道")]
                        elif txt.find("记者") > 0 and txt.find("报道") == -1:
                            author = txt[txt.find("记者") + 2:len(txt)]
                        elif txt.find("记者") == -1 and txt.find("报道") == -1:
                            author = txt.strip()
                    index += 1

                for k in self.keyword_list:
                    if content.find(k)>-1 or title.find(k)>-1:
                        key+=k+","
                if key:
                    key = key[0:len(key)-1]
                author = author.replace("记者", "").strip()
                if len(author) > 6:
                    author = ""  # an "author" longer than 6 characters is probably not a name
                print(count)
                print(layout)
                print(url)
                print(title)
                print(author)
                count+=1
                # print(content)
                self.db.updatefinalUrl(url)
                self.db.SavefinalData(self.date,layout,url,title,author,key,content)
            except Exception as e1:
                error = e1.args
        self.driver.close()

    def export(self):
        rows = self.db.GetfinalData(self.date)
        lst = []
        for dataRow1 in rows:
            lst.append(dataRow1)
        count =1
        # dt = datetime.datetime.now().strftime("%Y-%m-%d")
        fileName = '金融时报_' + self.date + '.csv'
        header = "发表日期,关键字,作者,全文字数,标题,版面,链接,正文"
        if len(lst)>0:
            self.WriteData(header, fileName)

        for dataRow in lst:
            date = str(dataRow['date'])
            layout = str(dataRow['layout'])
            url = str(dataRow['url'])
            title = str(dataRow['title']).replace(",", "").replace("\n", " ")
            author = str(dataRow['author']).replace(",","")
            key = str(dataRow['key']).replace(",","")
            wordcount = str(dataRow['wordcount'])
            content = str(dataRow['content']).replace(",", "").replace("\n", " ")

            # txt = "
%s,%s,%s,%s,%s,%s" % (
            #     date, key, title, author, wordcount, url)
            txt = "
%s,%s,%s,%s,%s,%s,%s,%s" % (
                date, key, author, wordcount, title,layout, url, content)
            try:
                self.WriteData(txt, fileName)
            except Exception as e1:
                print(str(e1))
            print(count)
            count += 1


#
# dt = datetime.datetime.now().strftime("%Y-%m-%d")
# ym = datetime.datetime.now().strftime("%Y-%m")
# day = datetime.datetime.now().strftime("%d")
#
# codepath='E:/python36_crawl/mediaInfo/verycode.txt'
#
# logonUrl="http://epaper.financialnews.com.cn/dnis/client/jrsb/index.jsp"
# # firsturl="http://epaper.financialnews.com.cn/jrsb/html/2018-09/18/node_2.htm"
# firsturl="http://epaper.financialnews.com.cn/jrsb/html/"+ym+"/"+day+"/node_2.htm"
# # print(firsturl)
# keyword_list ="银保监会,央行,中国银行,中行,中银".split(",")
# exportPath="E:/News"
# codedir = 'E:\\python36_crawl\\Veriycode\\'
# obj = finalNews_IE(dt,logonUrl,firsturl,keyword_list,exportPath,codedir)
# # obj.saveUrls()
# obj.catchdata()
# obj.export()
# # obj.savePicture()
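  One design note on the export method above: it strips commas and newlines out of every field so that the hand-rolled comma-joined rows stay valid CSV. Python's standard csv module quotes fields automatically, which makes that stripping unnecessary. Below is a minimal sketch of the alternative; write_rows is a hypothetical helper, and the utf-8-sig encoding is an assumption so Excel opens the Chinese header correctly.

# Sketch: writing the export rows with the csv module instead of joining
# strings by hand, so commas/newlines inside fields need no special handling.
import csv

def write_rows(fileName, rows):
    with open(fileName, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(["发表日期", "关键字", "作者", "全文字数", "标题", "版面", "链接", "正文"])
        for row in rows:
            writer.writerow([row['date'], row['key'], row['author'], row['wordcount'],
                             row['title'], row['layout'], row['url'], row['content']])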

  Scraping the second paper (第一财经日报)

  On this site the layout and article links fire JavaScript onclick handlers instead of plain hrefs, so the crawler reads each handler with layoutElement.get_attribute("onclick") and replays it with driver.execute_script.
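  A minimal standalone sketch of that pattern (the XPath and URL are taken from the full code that follows; the scraping step itself is elided):

# Sketch: following links that navigate via an onclick handler rather than href.
from selenium import webdriver

first_url = "http://buy.yicai.com/read/index/id/5.html"
driver = webdriver.Ie()  # the post uses the IE driver; any driver works
driver.get(first_url)

layout_xpath = "//ul[@class='BNameList']/li/a"
count = len(driver.find_elements_by_xpath(layout_xpath))
for i in range(count):
    # Re-find the links after every navigation; old element references go stale.
    links = driver.find_elements_by_xpath(layout_xpath)
    handler = links[i].get_attribute("onclick")
    if handler:
        driver.execute_script(handler)  # replay the JS to load this layout
        # ... scrape the layout here ...
        driver.get(first_url)  # return to the index for the next layout

  The full scraper follows.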
#coding=utf-8
import os
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
import collections
import mongoDbBase
import datetime
import numpy
from PIL import Image
import RClient  # client for the CAPTCHA-solving platform mentioned in the intro
class firstfinal:
    def __init__(self, strdate, firstUrl, keyword_list, exportPath,dirpath):

        self.db = mongoDbBase.mongoDbBase()
        self.date = strdate
        self.firstUrl = firstUrl
        self.keyword_list = keyword_list
        self.exportPath = exportPath
        self.dirpath = dirpath
        self.rclient = RClient.RClient()


    def iniDriver(self):
        # Path to IEDriverServer.exe (would normally come from a config file)
        IEDriverServer = r"C:\Program Files\internet explorer\IEDriverServer.exe"
        os.environ["webdriver.ie.driver"] = IEDriverServer
        self.driver = webdriver.Ie(IEDriverServer)

    def WriteData(self, message, fileName):
        fileName = os.path.join(os.getcwd(), self.exportPath + '/' + fileName)
        with open(fileName, 'a') as f:
            f.write(message)

    def getVerycode(self, txtFile="verycode.txt"):
        with open(txtFile, 'r') as f:
            return f.read()

    # Remove speckle noise: convert to grayscale, then hard-threshold to black/white
    def clearNoise(self, imageFile, x=0, y=0):
        if os.path.exists(imageFile):
            image = Image.open(imageFile)
            image = image.convert('L')
            image = numpy.asarray(image)
            image = ((image > 135) * 255).astype(numpy.uint8)
            image = Image.fromarray(image).convert('RGB')
            # save_name = 'D:\\work\\python36_crawl\\Veriycode\\mode_5590.png'
            # image.save(save_name)
            image.save(imageFile)
            return image
    def savePicture(self):
        # self.iniDriver()
        # self.driver.get(self.firstUrl)
        # self.driver.maximize_window()

        logon = self.driver.find_element_by_xpath("//div[@class='topMenu']/div[2]/a")  # xpath indexes start at 1
        # href = logon.get_attribute("href")
        # self.driver.execute_script(href)
        logon.click()
        # self.driver.maximize_window()
        time.sleep(2)
        checkcode = self.driver.find_element_by_id("Verify")
        temppng = "E:python36_crawlVeriycodeTemp.png"

        self.driver.save_screenshot("E:python36_crawlVeriycodeTemp.png")
        location = checkcode.location  # 获取验证码x,y轴坐标
        size = checkcode.size  # 获取验证码的长宽
        rangle = (int(location['x']), int(location['y']), int(location['x'] + size['width']),
                  int(location['y'] + size['height']))  # 写成我们需要截取的位置坐标
        # i = Image.open("D:workpython36_crawlVeriycodecodeTemp.png")  # 打开截图
        i = Image.open(temppng)
        result = i.crop(rangle)  # 使用Image的crop函数,从截图中再次截取我们需要的区域
        # imagefile = datetime.datetime.now().strftime("%Y%m%d%H%M%S")+".png"
        # imagefile = os.path.join("D:workpython36_crawlVeriycodecode",imagefile)
        result.save(temppng)
        # self.driver.close()
        # time.sleep(2)
        return temppng


    def longon(self):

        self.iniDriver()
        self.driver.get(self.firstUrl)
        self.driver.maximize_window()


        logon = self.driver.find_element_by_xpath("//div[@class='topMenu']/div[2]/a")  # xpath indexes start at 1
        # href = logon.get_attribute("href")
        # self.driver.execute_script(href)
        logon.click()
        self.driver.maximize_window()
        time.sleep(2)
        # if os.path.exists(self.codepath):
        #     os.system(self.codepath)

        # code = self.getVerycode()
        accname = self.driver.find_element_by_name("username")
        # accname = self.driver.find_element_by_id("//input[@id='username']")
        accname.send_keys('ctrchina')


        # time.sleep(15)
        accpwd = self.driver.find_element_by_name("password")
        # The browser on the server has the password saved, so this is not strictly needed
        accpwd.send_keys('123456')

        checkcode = self.driver.find_element_by_name("code")
        temppng = self.savePicture()
        code = self.rclient.test(temppng)  # upload the CAPTCHA image to the platform, get the text back
        checkcode.send_keys(code)

        submit = self.driver.find_element_by_xpath("//div[@class='UserFrom']/div[8]/button")
        submit.click()

        time.sleep(4)
        # self.driver.refresh()

        # Collect the layout links and keywords

    def catchData(self):
        flag = True
        try:

            layoutlb = "//ul[@class='BNameList']/li/a"
            artclelb = "//div[@id='SetContent']/ul/li/a"
            contentlb = "//div[@id='SetContent']/ul/li/a"
            layoutElements = self.driver.find_elements_by_xpath(layoutlb)
            layoutCount = len(layoutElements)
            layoutIndex = 0
            layout = ''
            # Loop over layouts (newspaper pages)
            print("layoutCount="+str(layoutCount))
            while layoutIndex<layoutCount:
                if layoutIndex >0:
                    self.driver.get(self.firstUrl)
                    self.driver.maximize_window()
                    layoutElements = self.driver.find_elements_by_xpath(layoutlb)
                    layoutElement = layoutElements[layoutIndex]
                    layoutLink = layoutElement.get_attribute("onclick")
                    self.driver.execute_script(layoutLink)
                else:
                    layoutElement = layoutElements[layoutIndex]
                layout = layoutElement.text
                print(layout)
                articleElements = self.driver.find_elements_by_xpath(artclelb)
                articleCount = len(articleElements)
                print("articleCount=" + str(articleCount))
                articleIndex = 0
                # Loop over the article list within each layout
                while articleIndex < articleCount:
                    if articleIndex > 0 :
                        self.driver.get(self.firstUrl)
                        self.driver.maximize_window()
                        layoutElements = self.driver.find_elements_by_xpath(layoutlb)
                        layoutElement = layoutElements[layoutIndex]
                        layoutLink = layoutElement.get_attribute("onclick")
                        self.driver.execute_script(layoutLink)

                    elements = self.driver.find_elements_by_xpath(contentlb)
                    sublink = elements[articleIndex].get_attribute("onclick")  #
                    title = elements[articleIndex].text
                    print(title)
                    self.driver.execute_script(sublink)
                    author = self.driver.find_element_by_id("Setauthor").text
                    subE = self.driver.find_elements_by_xpath("//div[@id='SetContent']/p")
                    content = ''
                    for se in subE:
                        content += se.text
                    key = ''
                    for k in self.keyword_list:
                        if content.find(k) > -1 or title.find(k) > -1:
                            key += k + ","
                    if key:
                        key = key[0:len(key) - 1]
                    print(author)
                    # print(content)
                    print(key)
                    print('\n')
                    articleIndex += 1
                    self.db.SaveFirsFinalData(self.date, layout, self.firstUrl, title, author, key, content)
                layoutIndex+=1

        except Exception as e1:
            error = e1.args
            flag = False

    def export(self):
        try:
            rows = self.db.GetFirsFinalData(self.date)
            lst = []
            for dataRow1 in rows:
                lst.append(dataRow1)
            count = 1
            dt = datetime.datetime.now().strftime("%Y-%m-%d")
            fileName = '第一财经日报_' + self.date + '.csv'
            header = "发表日期,关键字,作者,全文字数,标题,版面,链接,正文"
            if len(lst)>0:
                self.WriteData(header, fileName)
            # All article links on this site point at the same reader page
            url = 'http://buy.yicai.com/read/index/id/5.html'
            for dataRow in lst:
                date = str(dataRow['date'])
                layout = str(dataRow['layout'])
                # url = str(dataRow['url'])

                title = str(dataRow['title']).replace(",", "").replace("\n", " ")
                author = str(dataRow['author']).replace(",", "")
                key = str(dataRow['key']).replace(",", "")
                wordcount = str(dataRow['wordcount'])
                content = str(dataRow['content']).replace(",", "").replace("\n", " ")

                # txt = "
%s,%s,%s,%s,%s,%s" % (
                #     date, key, title, author, wordcount, url)
                txt = "
%s,%s,%s,%s,%s,%s,%s,%s" % (
                    date, key, author, wordcount, title, layout, url, content)
                try:
                    self.WriteData(txt, fileName)
                except Exception as e1:
                    print(str(e1))
                print(count)
                count += 1
        except Exception as e1:
            error = e1.args

    def test(self):
        dt = datetime.datetime.now().strftime("%Y-%m-%d")
        # dt="2018-10-08"
        dirpath = "E:python36_crawl"
        # codepath= os.path.join(dirpath,"mediaInfoVerycode.txt")
        # codepath='E:/python36_crawl/mediaInfo/verycode.txt'
        # file_list = os.listdir("D:workpython36_crawlVeriycodecode")
        # firsturl="http://buy.yicai.com/read/index/id/5.html"
        firsturl = 'http://buy.yicai.com/read/index/id/5.html'
        keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
        exportPath = "E:/News"
        obj = firstfinal(dt, firsturl, keyword_list, exportPath, dirpath)
        obj.longon()
        obj.catchData()
        obj.export()

# dt = datetime.datetime.now().strftime("%Y-%m-%d")
# # dt="2018-10-08"
# dirpath ="E:python36_crawl"
# # codepath= os.path.join(dirpath,"mediaInfoVerycode.txt")
# # codepath='E:/python36_crawl/mediaInfo/verycode.txt'
# # file_list = os.listdir("D:workpython36_crawlVeriycodecode")
# # firsturl="http://buy.yicai.com/read/index/id/5.html"
# firsturl='http://buy.yicai.com/read/index/id/5.html'
# keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
# exportPath = "E:/News"
# obj = firstfinal(dt, firsturl, keyword_list, exportPath,dirpath)
# obj.longon()
# obj.catchData()
# # while True:
# #     obj.savePicture()
# obj.export()
# coding=utf-8
import datetime
import finalNews_IE
import firstfinal
import Mail
import time
import os

# def WriteData(message, fileName):
#     fileName = os.path.join(os.getcwd(),  'mailflag.txt')
#     with open(fileName) as f:
#         f.write(message)
def run():
    attachmentFileDir ="E:\News"
    mailflagfile = os.path.join(os.getcwd(), 'mailflag.txt')
    while True:
        date = datetime.datetime.now()
        strtime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(strtime + " 正常循环")
        dt = datetime.datetime.now().strftime("%Y-%m-%d")
        ym = datetime.datetime.now().strftime("%Y-%m")
        day = datetime.datetime.now().strftime("%d")

        fileName = '金融时报_' + dt + '.csv'
        fileName = os.path.join(attachmentFileDir, fileName)

        firstfileName = '第一财经日报_' + dt + '.csv'
        firstfileName = os.path.join(attachmentFileDir, firstfileName)

        if not os.path.exists(fileName):
            # Scrape 金融时报 (Financial News) data
            logonUrl = "http://epaper.financialnews.com.cn/dnis/client/jrsb/index.jsp"
            # firsturl="http://epaper.financialnews.com.cn/jrsb/html/2018-09/18/node_2.htm"
            firsturl = "http://epaper.financialnews.com.cn/jrsb/html/" + ym + "/" + day + "/node_2.htm"
            # print(firsturl)
            keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
            exportPath = "E:/News"
            codedir = 'E:\\python36_crawl\\Veriycode\\'
            obj = finalNews_IE.finalNews_IE(dt, logonUrl, firsturl, keyword_list, exportPath, codedir)
            obj.saveUrls()
            obj.catchdata()
            obj.export()
        if not os.path.exists(firstfileName):
            # Scrape 第一财经日报 (China Business News) data
            dirpath = 'E:\\python36_crawl'
            firsturl = 'http://buy.yicai.com/read/index/id/5.html'
            keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
            exportPath = "E:/News"
            obj = firstfinal.firstfinal(dt, firsturl, keyword_list, exportPath, dirpath)
            obj.longon()
            obj.catchData()
            obj.export()
        if date.strftime('%H:%M')=="08:50":
            # Send the daily email at 08:50
            obj = Mail.Mail()
            obj.test()
            # WriteData(dt,mailflagfile)
            time.sleep(100)
        else:
            time.sleep(10)


run()

# try:

    # dt = datetime.datetime.now().strftime("%Y-%m-%d")
    # ym = datetime.datetime.now().strftime("%Y-%m")
    # day = datetime.datetime.now().strftime("%d")
    # # Scrape 金融时报 data
    # logonUrl = "http://epaper.financialnews.com.cn/dnis/client/jrsb/index.jsp"
    # # firsturl="http://epaper.financialnews.com.cn/jrsb/html/2018-09/18/node_2.htm"
    # firsturl = "http://epaper.financialnews.com.cn/jrsb/html/" + ym + "/" + day + "/node_2.htm"
    # # print(firsturl)
    # keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
    # exportPath = "E:/News"
    # codedir = 'E:\\python36_crawl\\Veriycode\\'
    # obj = finalNews_IE.finalNews_IE(dt, logonUrl, firsturl, keyword_list, exportPath, codedir)
    # obj.saveUrls()
    # obj.catchdata()
    # obj.export()
    #
    # # Scrape 第一财经日报 data
    # dirpath = 'E:\\python36_crawl'
    # firsturl = 'http://buy.yicai.com/read/index/id/5.html'
    # keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
    # exportPath = "E:/News"
    # obj = firstfinal.firstfinal(dt, firsturl, keyword_list, exportPath, dirpath)
    # obj.longon()
    # obj.catchData()
    # obj.export()

    # Send the email
#     obj = Mail.Mail()
#     obj.test()
# except Exception as e1:
#     print(str(e1))
Original article: https://www.cnblogs.com/shaosks/p/9697977.html