[Python爬虫] 之三十:Selenium + PhantomJS 利用 pyquery 抓取栏目

  

  一、介绍

    本例子用 Selenium + PhantomJS 加载栏目页面(http://tv.cctv.com/lm/),再用 pyquery 解析并抓取其中的栏目信息

   

  二、网站信息

    

    

    

    

  

  三、数据抓取

    首先生成所有待抓取的网页链接(共39页),并保存到数据库里面

    

    def getUrls(self):
        """Build the listing-page entries and hand them to ``self.db``.

        The first entry is the landing page URL; entries for pages 2..39 are
        ``javascript:`` snippets carrying the page number. The complete list
        is passed to ``self.db.SaveCCTVColumnUrls`` with the flag ``'0'``.
        Nothing is returned.
        """
        scriptTemplate = "javascript:window.scroll(0,145);DataInteraction({0});showPageTitle_fenyei2('ELMT1413526954890942',{0});"
        pageLinks = ['http://tv.cctv.com/lm/']
        pageLinks += [scriptTemplate.format(pageNo) for pageNo in range(2, 40)]
        self.db.SaveCCTVColumnUrls(pageLinks, '0')
View Code

    针对上面的网站信息,来进行抓取

    1、首先抓取信息列表

      

      抓取代码:Elements = doc("div[id='text_box_0']").find('dl').find('dd')

    2、栏目名称,链接

      

      column1Element = element.find('div[class="text"]').find('h3').find('a')

      columnName = column1Element.text().encode('utf8').replace(',', '').replace('\n', '')

      columnUrl = column1Element.attr('href')

  四、实现代码

    

# coding=utf-8
import os
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from datetime import datetime,timedelta
import selenium.webdriver.support.ui as ui
import time
from pyquery import PyQuery as pq
import columnData
import mongoDB
class cctvColumnInfo(object):
    """Crawler for the CCTV column listing at http://tv.cctv.com/lm/.

    Pages are loaded with a PhantomJS web driver, parsed with pyquery and the
    extracted records are stored through ``mongoDB.mongoDbBase``.

    NOTE(review): the text handling (encode to UTF-8, then ``replace`` with
    plain string literals) assumes Python 2 string semantics - confirm before
    running under Python 3.
    """

    def __init__(self):
        """Start the PhantomJS driver and open the database helper."""
        # '--load-images=false' is passed to PhantomJS so images are not fetched.
        self.driver = webdriver.PhantomJS(service_args=['--load-images=false'])
        self.driver.set_page_load_timeout(10)
        self.driver.maximize_window()
        self.db = mongoDB.mongoDbBase()

    def _cleanText(self, text):
        """Return *text* encoded as UTF-8 with commas and newlines removed.

        Commas are stripped because the exported log line is comma separated.
        """
        return text.encode('utf8').replace(',', '').replace('\n', '')

    def _encodeOptional(self, value):
        """UTF-8 encode *value*; empty or None values are returned unchanged."""
        return value.encode('utf8') if value else value

    def _appendToFile(self, relativePath, text):
        """Append *text* to *relativePath* (resolved against the working dir).

        The parent folder is created first when it does not exist yet, so the
        first write does not fail on a fresh checkout.
        """
        fileName = os.path.join(os.getcwd(), relativePath)
        folder = os.path.dirname(fileName)
        if not os.path.isdir(folder):
            os.makedirs(folder)
        with open(fileName, 'a') as f:
            f.write(text)

    def WriteUrl(self, url):
        """Append *url*, preceded by a newline, to cctvColumn/cctvColumn_url.txt."""
        self._appendToFile('cctvColumn/cctvColumn_url.txt', '\n' + url)

    def getUrls(self):
        """Store the 39 listing-page entries through ``self.db``.

        The first entry is the landing page URL; pages 2..39 are
        ``javascript:`` snippets that ``CatchData`` executes in the browser.
        """
        urls = ['http://tv.cctv.com/lm/']
        for index in range(2, 40):
            urls.append("javascript:window.scroll(0,145);DataInteraction({0});showPageTitle_fenyei2('ELMT1413526954890942',{0});".format(index))
        self.db.SaveCCTVColumnUrls(urls, '0')

    def WriteLog(self, message, date):
        """Append *message* to cctvColumn/cctvColumn-<date>.txt."""
        self._appendToFile('cctvColumn/cctvColumn-' + date + '.txt', message)

    def getColumnInfo(self, colInfo):
        """Split a column description into (first broadcast time, host, channel).

        *colInfo* is cut at the label '主持人' and the remainder at '播出频道';
        the labels are kept as prefixes of the host and channel parts. A part
        whose label is missing is returned as an empty string instead of
        raising IndexError.
        """
        hostSplit = colInfo.split('主持人')
        firstBroadcastTime = hostSplit[0]
        if len(hostSplit) < 2:
            return firstBroadcastTime, '', ''
        channelSplit = hostSplit[1].split('播出频道')
        columnHost = '主持人' + channelSplit[0]
        broadcastChannel = ''
        if len(channelSplit) > 1:
            broadcastChannel = '播出频道' + channelSplit[1]
        return firstBroadcastTime, columnHost, broadcastChannel

    def _parseColumn(self, element):
        """Extract one column record from a ``<dd>`` element of the listing.

        Returns ``(colobj, columnName, columnUrl, mess)`` where *colobj* is the
        populated ``columnData`` object and *mess* is the comma separated log
        line (starting with a newline) for this column.
        """
        colobj = columnData.columnData()
        textBox = element.find('div[class="text"]')

        # Column title and its link.
        titleLink = textBox.find('h3').find('a')
        columnName = self._cleanText(titleLink.text())
        columnUrl = titleLink.attr('href')
        colobj.setColumnName(columnName)
        colobj.setColumnUrl(columnUrl)

        # Second title line (the name that carries the time) and its link.
        timeLink = textBox.find('p').find('a')
        columnTimeName = self._cleanText(timeLink.text())
        columnTimeUrl = timeLink.attr('href')
        colobj.setColumnTimeName(columnTimeName)
        colobj.setColumnTimeUrl(columnTimeUrl)

        # First <span> link is the past-videos page; any later one is stored
        # as the official website.
        pastVideoUrl = ''
        officialWebsiteUrl = ''
        for position, link in enumerate(textBox.find('span').find('a').items()):
            if position == 0:
                pastVideoUrl = link.attr('href')
                colobj.setPastVideoUrl(pastVideoUrl)
            else:
                officialWebsiteUrl = link.attr('href')
                colobj.setOfficialWebsiteUrl(officialWebsiteUrl)

        # The picture sits in div.img; fall back to div.image when absent.
        colImgUrl = element.find('div[class="img"]').find('a').find('img').attr('src')
        if colImgUrl is None:
            colImgUrl = element.find('div[class="image"]').find('a').find('img').attr('src')
        colobj.setColImgUrl(colImgUrl)

        # First broadcast time, host and channel come from the div.lr blocks;
        # when there are several, the last one wins.
        firstBroadcastTime = ''
        columnHost = ''
        firstBroadcastChannel = ''
        for colInfo in element.find('div[class="lr"]').find('div').items():
            firstBroadcastTime, columnHost, firstBroadcastChannel = self.getColumnInfo(
                self._cleanText(colInfo.text()))
            columnHost = columnHost.replace(',', '')
        colobj.setColumnHost(columnHost)
        colobj.setFirstBroadcastChannel(firstBroadcastChannel)
        colobj.setFirstBroadcastTime(firstBroadcastTime)

        # Field order: column name, first broadcast time, replay time (not
        # available on the listing page, left empty), channel, host, column
        # url, timed name, timed-name url, past videos url, official website
        # url, image url.
        replayBroadcastTime = ''
        mess = '\n{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}'.format(
            columnName, firstBroadcastTime, replayBroadcastTime,
            firstBroadcastChannel, columnHost, columnUrl, columnTimeName,
            columnTimeUrl, pastVideoUrl, officialWebsiteUrl, colImgUrl)
        return colobj, columnName, columnUrl, mess

    def CatchData(self):
        """Crawl every pending listing page and store the columns found.

        For each stored entry the landing URL is opened, or the pagination
        script is executed on the page already loaded. Every ``<dd>`` under
        ``div#text_box_0`` is parsed and saved, the page is logged to the
        dated log file and the url file, and the entry is marked as crawled.
        A page that times out is reported and skipped. The browser is shut
        down when the method ends, including when it ends with an error.
        """
        urls = self.db.GetCCTVColumnUrls()
        itemIndex = 0
        try:
            for u in urls:
                url = u['url']
                try:
                    if url == 'http://tv.cctv.com/lm/':
                        self.driver.get(url)
                    else:
                        # Pagination entries are javascript snippets.
                        self.driver.execute_script(url)
                    time.sleep(2)  # give the page time to render
                    selenium_html = self.driver.execute_script("return document.documentElement.outerHTML")
                    doc = pq(selenium_html)
                    Elements = doc("div[id='text_box_0']").find('dl').find('dd')
                    message = ''
                    column_name = url.encode('utf8')
                    print(url)
                    for element in Elements.items():
                        itemIndex += 1
                        colobj, columnName, columnUrl, mess = self._parseColumn(element)
                        column_name += '\n' + columnName
                        print(columnName)
                        message += mess
                        self.db.SaveCCTVColumnData(colobj, itemIndex)
                        self.db.SaveCCTVColumnUrl(columnUrl, '1', columnName)

                    date = time.strftime('%Y-%m-%d')
                    self.WriteLog(message, date)
                    self.WriteUrl(column_name)
                    self.db.SetCCTVColumnUrlCrawlState(url)
                except TimeoutException:
                    print('timeout url:  ' + url)
        finally:
            # Always release the browser, also when an unexpected error ends
            # the crawl early.
            self.driver.close()
            self.driver.quit()

    def getBroadCast(self):
        """Visit each pending column page and update its broadcast details.

        The first three ``p.p_1`` paragraphs are taken as first broadcast
        time, replay time and channel. When the page has no such paragraph,
        the rows of the ``div.head_msg`` table are concatenated and parsed
        with ``getBroadInfo``. The url is marked as crawled, and the stored
        column is updated only when a channel was found.
        """
        urls = self.db.GetSubCCTVColumnUrls()

        for u in urls:
            firstBroadcastTime = ''
            ReplayBroadcastTime = ''
            firstBroadcastChannel = ''
            url = u['url']
            columnName = u['columnName']

            try:
                self.driver.get(url)
                time.sleep(2)  # give the page time to render
                selenium_html = self.driver.execute_script("return document.documentElement.outerHTML")
                doc = pq(selenium_html)

                paragraphs = []
                for element in doc("p[class='p_1']").items():
                    paragraphs.append(self._cleanText(element.text()))
                    if len(paragraphs) == 3:
                        break

                if paragraphs:
                    firstBroadcastTime = paragraphs[0]
                    if len(paragraphs) > 1:
                        ReplayBroadcastTime = paragraphs[1]
                    if len(paragraphs) > 2:
                        firstBroadcastChannel = paragraphs[2]
                else:
                    rows = doc("div[class='head_msg']").find('table').find('tbody').find('tr')
                    messsage = ''.join(self._cleanText(row.text()) for row in rows.items())
                    if messsage:
                        firstBroadcastTime, ReplayBroadcastTime, firstBroadcastChannel = self.getBroadInfo(
                            columnName.encode('utf8'), messsage)
                self.db.SetCCTVColumnUrlCrawlState(url)

                if firstBroadcastChannel:
                    colobj = columnData.columnData()
                    colobj.setColumnName(columnName)
                    colobj.setFirstBroadcastTime(firstBroadcastTime)
                    colobj.setFirstBroadcastChannel(firstBroadcastChannel)
                    colobj.setReplayBroadcastTime(ReplayBroadcastTime)
                    self.db.UpdateCCTVColumnData(colobj)
                    print('\n')
                    print(url)
                    print(columnName)
                    print(firstBroadcastTime)
                    print(firstBroadcastChannel)
                    print(ReplayBroadcastTime)

            except TimeoutException:
                print('TimeoutException:' + url)

    def _splitChannel(self, text, labels):
        """Cut *text* at the first label in *labels* that it contains.

        Returns ``(head, label + tail)`` where *head* precedes the label and
        *tail* is the segment that follows it, or ``None`` when no label
        occurs in *text*.
        """
        for label in labels:
            if label in text:
                pieces = text.split(label)
                return pieces[0], label + pieces[1]
        return None

    def getBroadInfo(self, columnName, column):
        """Parse the header text of a column page.

        Returns ``(firstBroadcastTime, ReplayBroadcastTime,
        firstBroadcastChannel)``; a part that cannot be recognised is ''.
        Example input: '首播频道: CCTV-14首播时间: 周三17:15'.
        """
        firstBroadcastTime = ''
        ReplayBroadcastTime = ''
        firstBroadcastChannel = ''

        # Strip the navigation label and everything from '>>' onwards.
        column = column.replace('栏目大全', '')
        if '>>' in column:
            column = column[0:column.index('>>')]

        # Normalise channel spelling; the order matters (CCTV13 before CCTV1).
        column = column.replace('CCTV13', 'CCTV-13')
        column = column.replace('CCTV6', 'CCTV-6')
        column = column.replace('CCTV1', 'CCTV-1')

        # Drop the trailing official-WeChat section.
        if '官方微信' in column:
            column = column[0:column.index('官方微信')]

        if '首播时间' in column:
            if '重播时间' in column:
                cols = column.split('重播时间')
                firstBroadcastTime = cols[0]
                found = self._splitChannel(cols[1], ('独播频道', '首播频道', '播出频道'))
                if found:
                    ReplayBroadcastTime = '重播时间' + found[0]
                    firstBroadcastChannel = found[1]
            else:
                found = self._splitChannel(column, ('独播频道', '播出频道'))
                if found:
                    firstBroadcastTime, firstBroadcastChannel = found
                elif '首播频道' in column:
                    if column.index('首播频道') == 0:
                        # Channel comes first: '首播频道...首播时间...'
                        cols = column.split('首播时间')
                        firstBroadcastChannel = cols[0]
                        firstBroadcastTime = '首播时间' + cols[1]
                    else:
                        firstBroadcastTime, firstBroadcastChannel = self._splitChannel(
                            column, ('首播频道',))
        elif '首播(' in column and '重播(' in column:
            if '独播频道' in column:
                firstBroadcastTime, firstBroadcastChannel = self._splitChannel(
                    column, ('独播频道',))
                # The page lists the times of two editions, e.g.
                # '首播(生活): ...重播(生活): ...首播(文史): ...重播(文史): ...'.
                # Keep only the times of the edition named in columnName, and
                # only when that edition is listed first.
                for edition, other in (('生活', '文史'), ('文史', '生活')):
                    if '(' + edition + '版)' not in columnName:
                        continue
                    if '首播(' + other + ')' in firstBroadcastTime:
                        temp = firstBroadcastTime.split('首播(' + other + ')')[0]
                        if '重播(' + edition + ')' in temp:
                            halves = temp.split('重播(' + edition + ')')
                            firstBroadcastTime = '首播时间: ' + halves[0].replace('首播(' + edition + '): ', '')
                            ReplayBroadcastTime = '重播时间: ' + halves[1].replace(': ', '')
                    break
            else:
                found = self._splitChannel(column, ('播出频道', '首播频道'))
                if found:
                    firstBroadcastTime, firstBroadcastChannel = found
        return firstBroadcastTime, ReplayBroadcastTime, firstBroadcastChannel

    def exportColumnInfo(self):
        """Write every stored column as one comma separated line to the log.

        Label prefixes are stripped from the time fields and the channel name
        is normalised. URL fields that are empty or None are written as they
        are instead of raising on ``.encode``.
        """
        # Applied in this order; later rules rely on the earlier ones.
        channelFixes = (
            ("播出频道:", ""),
            ("独播频道:", ""),
            ("首播频道:", ""),
            (")", ""),
            ("(", ""),
            ("CCTV-8电视剧", "CCTV-8 电视剧"),
            ("CCTV-1综合频道", "CCTV-1 综合频道"),
            ("CCTV-1高清频道", "CCTV-1 高清频道"),
            ("CCTV13", "CCTV-13"),
            ("CCTV1", "CCTV-1"),
            ("CCTV-少儿", "CCTV-14 少儿"),
            ("CCTV6", "CCTV-6"),
            ("CCTV-12社会与法", "CCTV-12 社会与法"),
        )
        date = time.strftime('%Y-%m-%d')

        for col in self.db.GetCCTVColumnData():
            columnName = col['columnName'].encode('utf8')
            firstBroadcastTime = col['firstBroadcastTime'].encode('utf8').replace('首播时间: ', '')

            firstBroadcastChannel = col['firstBroadcastChannel'].encode('utf8')
            for old, new in channelFixes:
                firstBroadcastChannel = firstBroadcastChannel.replace(old, new)

            replayBroadcastTime = col['replayBroadcastTime'].encode('utf8').replace('重播时间:', '')
            columnHost = col['columnHost'].encode('utf8')
            columnTimeName = col['columnTimeName'].encode('utf8')

            # Links are read from HTML attributes and may be missing.
            columnUrl = self._encodeOptional(col['columnUrl'])
            columnTimeUrl = self._encodeOptional(col['columnTimeUrl'])
            officialWebsiteUrl = self._encodeOptional(col['officialWebsiteUrl'])
            pastVideoUrl = self._encodeOptional(col['pastVideoUrl'])
            colImgUrl = self._encodeOptional(col['colImgUrl'])

            # Field order: column name, first broadcast time, replay time,
            # channel, host, column url, timed name, timed-name url, past
            # videos url, official website url, image url.
            message = '\n{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}'.format(
                columnName, firstBroadcastTime, replayBroadcastTime,
                firstBroadcastChannel, columnHost, columnUrl, columnTimeName,
                columnTimeUrl, pastVideoUrl, officialWebsiteUrl, colImgUrl)

            self.WriteLog(message, date)

# Script entry: create the crawler (this starts PhantomJS and opens the
# database helper) and run one stage. Only the export stage is enabled here;
# the commented-out calls are the other stages.
# NOTE(review): presumably the stages are run one at a time in the order
# getUrls -> CatchData -> getBroadCast -> exportColumnInfo - confirm.
obj = cctvColumnInfo()
# obj.getUrls()
# obj.CatchData()
# obj.getBroadCast()
obj.exportColumnInfo()
View Code
# coding=utf-8
import os
from pymongo import MongoClient
from pymongo import ASCENDING, DESCENDING
import codecs
import time
import columnData
import datetime
import re

class mongoDbBase(object):
    """Thin wrapper around the MongoDB collections used by the crawler.

    Two collections are used: ``column_data`` (one document per column) and
    ``columnurls`` (urls to crawl, with an ``iscrawl`` flag of '0' or '1').

    NOTE(review): ``authenticate``, ``insert`` and ``update`` are legacy
    PyMongo calls that newer PyMongo releases no longer provide - confirm the
    installed version before upgrading.
    """

    def __init__(self, connstr='mongodb://ott:ott@127.0.0.1:27017/', mongodbName='OTT'):
        """Connect to the local MongoDB server and select *mongodbName*.

        *connstr* is accepted for backward compatibility but is not used; the
        host, port and credentials below are fixed.
        """
        client = MongoClient('127.0.0.1', 27017)
        # Honour the requested database name (the default is still 'OTT').
        self.db = client[mongodbName]
        self.db.authenticate('ott', 'ott')

    def SaveCCTVColumnData(self, columnData, index):
        """Insert *columnData* unless a column with the same name is stored.

        ``replayBroadcastTime`` is stored empty here; it is filled in later by
        ``UpdateCCTVColumnData``.
        """
        if self.db.column_data.find_one({'columnName': columnData.getColumnName()}) is not None:
            return
        dictM = {'columnName': columnData.getColumnName(),
                 'firstBroadcastTime': columnData.getFirstBroadcastTime(),
                 'replayBroadcastTime': '',
                 'firstBroadcastChannel': columnData.getFirstBroadcastChannel(),
                 'columnHost': columnData.getColumnHost(),
                 'columnUrl': columnData.getColumnUrl(),
                 'columnTimeName': columnData.getColumnTimeName(),
                 'columnTimeUrl': columnData.getColumnTimeUrl(),
                 'officialWebsiteUrl': columnData.getOfficialWebsiteUrl(),
                 'pastVideoUrl': columnData.getPastVideoUrl(),
                 'colImgUrl': columnData.getColImgUrl(),
                 'index': index}
        self.db.column_data.insert(dictM)

    def GetCCTVColumnData(self):
        """Return a cursor over all stored columns, without the ``_id`` field."""
        return self.db.column_data.find({}, {'_id': 0})

    def UpdateCCTVColumnData(self, columnData):
        """Update the broadcast fields of the column named like *columnData*."""
        dictM = {'$set': {'replayBroadcastTime': columnData.getReplayBroadcastTime(),
                          'firstBroadcastTime': columnData.getFirstBroadcastTime(),
                          'firstBroadcastChannel': columnData.getFirstBroadcastChannel()}}
        self.db.column_data.update({"columnName": columnData.getColumnName()}, dictM)

    def SaveCCTVColumnUrl(self, url, suburl, columnName):
        """Store *url* as not yet crawled unless it is already stored."""
        if self.db.columnurls.find_one({'url': url}) is None:
            dictM = {'url': url, 'iscrawl': '0', 'suburl': suburl, 'columnName': columnName}
            self.db.columnurls.insert(dictM)

    def SaveCCTVColumnUrls(self, urlList, suburl):
        """Store every url of *urlList* that is not stored yet.

        ``index`` counts only the urls inserted by this call.
        """
        index = 0
        for url in urlList:
            if self.db.columnurls.find_one({'url': url}) is None:
                dictM = {'url': url, 'iscrawl': '0', 'suburl': suburl, 'index': index}
                self.db.columnurls.insert(dictM)
                index += 1

    def GetCCTVColumnUrls(self):
        """Return a cursor over uncrawled urls stored with ``suburl`` '0'."""
        return self.db.columnurls.find({'iscrawl': '0', 'suburl': '0'}, {'_id': 0, 'url': 1})

    def GetSubCCTVColumnUrls(self):
        """Return a cursor over uncrawled urls stored with ``suburl`` '1'.

        Each document carries ``url`` and ``columnName``.
        """
        return self.db.columnurls.find({'iscrawl': '0', 'suburl': '1'},
                                       {'_id': 0, 'url': 1, 'columnName': 1})

    def SetCCTVColumnUrlCrawlState(self, url):
        """Mark *url* as crawled by setting ``iscrawl`` to '1'."""
        self.db.columnurls.update({'url': url}, {'$set': {'iscrawl': '1'}})

    


# d = mongoDbBase()

# urls = []
# urls.append('abc')
# # d.SaveUrls(urls)
# d.SetUrlCrawlState(urls)
View Code
    def download(self, url, name):
        """Download *url* and write the response body to the file *name*.

        The request uses a 5 second timeout. When the request fails for any
        reason covered by ``requests.exceptions.RequestException`` (connection
        error, timeout, HTTP error status) a message is printed and no file
        is written. Errors while writing the file are not caught.
        """
        try:
            pic = requests.get(url, timeout=5)
            # Do not save an error page as if it were the picture.
            pic.raise_for_status()
        except requests.exceptions.RequestException:
            print('当前图片无法下载')
            return
        with open(name, 'wb') as f:
            f.write(pic.content)
原文地址:https://www.cnblogs.com/shaosks/p/8759388.html