[Python Crawler] Part 8: Scraping Weibo Data with Selenium + PhantomJS

  Basic idea: while logged in, open the home page, enter the query conditions in the advanced search box, and click the search link to run the search. If the results span multiple pages (20 records per page), read the page count and then loop over the pages, scraping the data on each one.

  One problem came up in practice. With the IE driver, the data could not be scraped after running the advanced search: only the first record came back and the rest were empty, for reasons I never figured out. PhantomJS scraped the data without trouble, but it had the opposite problem: it simply could not locate the "advanced search" link, so the advanced search could not be set up at all, whereas the IE driver handled that part fine. Given this, the two drivers are combined. The IE driver is used first to set the advanced search conditions and click the "高级搜索" (advanced search) link, which yields the URL of the first page of results; from that page the page count is read, and then PhantomJS takes over and scrapes the actual data.
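  To make that division of labor concrete, here is a minimal sketch of the handoff, condensed from the Set_CurrentUrl and CatchData methods in the full code below. The standalone function names and parameters are invented for illustration only; the real implementation is the weibo class further down.

# coding=utf-8
# Sketch of the two-driver handoff: the IE driver only sets up the advanced search and
# captures the result URL template plus the page count; PhantomJS then scrapes each page.
from selenium import webdriver


def get_result_url_and_pages(ie_driver_path, first_url, page_count_xpath):
    ie = webdriver.Ie(ie_driver_path)
    ie.get(first_url)
    # ... fill in the advanced-search dialog and click "search" here (see adv_Setting below) ...
    base_url = ie.current_url.replace('Refer=g', 'page=')          # result URL template
    page_count = len(ie.find_elements_by_xpath(page_count_xpath))  # one <li> per page in the pager
    ie.quit()
    return base_url, page_count


def scrape_pages(base_url, page_count, post_xpath):
    ghost = webdriver.PhantomJS()
    for page in range(1, page_count + 1):
        ghost.get(base_url + str(page))                            # e.g. ...&page=3
        for post in ghost.find_elements_by_xpath(post_xpath):
            print post.text.encode('utf8')
    ghost.quit()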

  1、Set the advanced search conditions
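  The setup amounts to: click the "高级搜索" link, fill in the keyword box via JavaScript, tick the type and "contains" radio buttons, remove the readonly attribute from the date inputs and write the dates, choose the province and city, and finally click the search button. A condensed sketch of this step, taken from the adv_Setting method in the full code below (the standalone helper function and its parameters are only for illustration):

# coding=utf-8
# Condensed sketch of step 1: filling in the Weibo advanced-search dialog with the IE driver.
# `driver` is assumed to be a webdriver.Ie instance already sitting on the search page.
import selenium.webdriver.support.ui as ui
from selenium.webdriver.support.select import Select


def set_advanced_search(driver, keyword, stime, etime, province, city):
    wait = ui.WebDriverWait(driver, 20)
    # Open the advanced-search dialog
    wait.until(lambda d: d.find_element_by_xpath("//a[@class='adv_settiong']"))
    driver.find_element_by_xpath("//a[@class='adv_settiong']").click()
    # Fill the keyword box via JavaScript
    driver.execute_script(
        "document.getElementsByName('keyword')[0].value='%s';" % keyword)
    # The date inputs are readonly, so drop the attribute first, then set the value
    for name, value in (("stime", stime), ("etime", etime)):
        driver.execute_script(
            'var obj=document.getElementsByName("%s")[0];'
            'obj.removeAttribute("readonly");obj.value="%s";' % (name, value))
    # Province / city dropdowns (pass unicode strings, e.g. u'北京', u'海淀区')
    Select(driver.find_element_by_xpath("//select[@name='prov']")).select_by_visible_text(province)
    Select(driver.find_element_by_xpath("//select[@name='city']")).select_by_visible_text(city)
    # Click the "search Weibo" button to run the search
    driver.find_element_by_xpath("//a[@class='W_btn_cb']").click()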

  

  2、Get the number of pages to crawl

  

  

  

# elements = self.urldriver.find_elements_by_xpath("//div[@class='layer_menu_list W_scroll']/ul/li")
elements = self.urldriver.find_elements_by_xpath(pageCountLable)
# number of pages to crawl
self.pageCount = len(elements)

3、Get the Weibo records

  

Elements = self.driver.find_elements_by_xpath("//div[@class='WB_cardwrap S_bg2 clearfix']")

4、Get the URL of each Weibo post:

   

  hrefElements = self.driver.find_elements_by_xpath("//a[@class='W_textb']")




 5、The full code is as follows

  

# coding=utf-8
import os
import re
from selenium import webdriver
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select
import IniFile
import LogFile
class weibo:

    def __init__(self):
        # Read the path to IEDriverServer.exe from the config file
        configfile = os.path.join(os.getcwd(), 'config.conf')
        self.cf = IniFile.ConfigFile(configfile)
        IEDriverServer = self.cf.GetValue("section", "IEDriverServer")
        # Delay after scraping each page, in seconds; defaults to 5
        self.pageDelay = 5
        pageInteralDelay = self.cf.GetValue("section", "pageInteralDelay")
        if pageInteralDelay:
            self.pageDelay = int(pageInteralDelay)

        os.environ["webdriver.ie.driver"] = IEDriverServer
        self.urldriver = webdriver.Ie(IEDriverServer)
        # self.driver = webdriver.PhantomJS()
        self.wait = ui.WebDriverWait(self.urldriver, 20)
        self.urldriver.maximize_window()

    def scroll_top(self):
        '''
        Scroll to the top of the page
        :return:
        '''
        if self.urldriver.name == "chrome":
            js = "var q=document.body.scrollTop=0"
        else:
            js = "var q=document.documentElement.scrollTop=0"
        return self.urldriver.execute_script(js)

    def scroll_foot(self):
        '''
        Scroll to the bottom of the page
        :return:
        '''
        if self.urldriver.name == "chrome":
            js = "var q=document.body.scrollTop=10000"
        else:
            js = "var q=document.documentElement.scrollTop=10000"
        return self.urldriver.execute_script(js)

    def logon(self):
        '''
        Log in
        :return:
        '''
        isneedLogon = False
        try:
            gn_login = self.driver.find_element_by_xpath("//div[@class='gn_login']")
            isneedLogon = True
        except Exception, e:
            if e.msg.find('Unable') > -1:  # the login element is missing, so we are already logged in
                print 'logon'
        if isneedLogon:
            userNameInput = self.driver.find_element_by_xpath("//input[@name='username']")
            userNameInput.send_keys('手机号')
            passwordInput = self.driver.find_element_by_xpath("//input[@name='password']")
            passwordInput.send_keys('XXXXX')
            # Submit the user name and password to log in
            logon_elements = self.driver.find_element_by_xpath("//a[@class='W_btn_a btn_32px']")
            logon_elements.click()

    def Set_CurrentUrl(self):
        firstUrl = self.cf.GetValue("section", "firstUrl")
        if len(firstUrl) > 0:
            self.urldriver.get(firstUrl)
            self.urldriver.implicitly_wait(5)
            self.adv_Setting()
            time.sleep(5)
            # self.urldriver.implicitly_wait(4)
            # Scroll to the bottom so the pager (page count) is visible
            self.scroll_foot()
            # URL of the result pages to crawl
            print self.urldriver.current_url
            if self.urldriver.current_url == firstUrl:
                time.sleep(5)

            self.current_url = self.urldriver.current_url.replace('Refer=g', 'page=')

            pageCountLable = self.cf.GetValue("section", "pageCountLable")
            try:
                # elements = self.urldriver.find_elements_by_xpath("//div[@class='layer_menu_list W_scroll']/ul/li")
                elements = self.urldriver.find_elements_by_xpath(pageCountLable)
                # Number of pages to crawl
                self.pageCount = len(elements)
                # print self.pageCount
            except Exception, e:
                print e.message

            # The IE driver has done its job; close it and switch to PhantomJS for the scraping
            self.urldriver.close()
            self.urldriver.quit()
            self.driver = webdriver.PhantomJS()
            self.wait = ui.WebDriverWait(self.driver, 20)
            self.driver.maximize_window()
        else:
            print 'please set first url'

    def CatchData(self):
        '''
        Scrape the data
        :return:
        '''
        start = time.clock()
        # # Print the page title
        # print self.driver.title
        htmls = self.cf.GetValue("section", "htmlLable").split(';')
        htmlLables = []
        for h in htmls:
            if len(h) > 0:
                htmlLables.append(h)
        logfile = os.path.join(os.getcwd(), r'log.txt')
        log = LogFile.LogFile(logfile)

        pageIndex = 1
        pageCount = self.pageCount
        pageCount = 2  # debug override: only scrape 2 pages; remove to scrape all pages
        recordCount = 0
        weiboOriginalUrlLabel = self.cf.GetValue("section", "weiboOriginalUrlLabel")
        while pageCount > 0:
            url = self.current_url + str(pageIndex)
            self.driver.get(url)
            # Wait up to 5 seconds
            self.driver.implicitly_wait(5)
            pageCount = pageCount - 1
            for className in htmlLables:
                self.wait.until(lambda driver: self.driver.find_elements_by_xpath(className))
                Elements = self.driver.find_elements_by_xpath(className)

                # Look up the original URL of each Weibo post
                urlList = []
                # self.wait.until(lambda driver: self.driver.find_elements_by_xpath("//a[@class='W_textb']"))
                # hrefElements = self.driver.find_elements_by_xpath("//a[@class='W_textb']")
                self.wait.until(lambda driver: self.driver.find_elements_by_xpath(weiboOriginalUrlLabel))
                hrefElements = self.driver.find_elements_by_xpath(weiboOriginalUrlLabel)
                for hrefe in hrefElements:
                    urlList.append(hrefe.get_attribute('href').encode('utf8'))

                self.driver.implicitly_wait(2)
                index = 0
                strMessage = ' '
                strsplit = ' ------------------------------------------------------------------------------------ '
                index = 0
                for element in Elements:
                    print ' '
                    txt = element.text.encode('utf8')
                    # The first character of each post's text is a stray 'c'; strip it
                    txt = txt[1:]
                    print ' '
                    print txt
                    print '微博链接:' + urlList[index]
                    print strsplit

                    strMessage = txt + " "
                    strMessage += '微博链接:' + urlList[index] + " "
                    strMessage += strsplit
                    strMessage = unicode(strMessage, 'utf8')
                    log.WriteLog(strMessage)
                    # self.printTopic(txt)
                    recordCount = recordCount + 1
                    index = index + 1

            pageIndex = pageIndex + 1
            self.driver.implicitly_wait(10)

        self.driver.close()
        self.driver.quit()
        end = time.clock()

        print ' '
        print "共抓取了: %d 页数据" % self.pageCount
        print "共抓取了: %d 个微博记录" % recordCount
        print "整个过程用时间: %f 秒" % (end - start)

    def adv_Setting(self):
        '''
        Advanced-search dialog setup:
        1. Click the "advanced search" link in the parent window to open the dialog
        2. Enter the search keyword in the keyword box
        3. Choose the type; defaults to "all"
        4. Choose what the posts must contain; defaults to "all"
        5. Set the start date, start hour, end date and end hour
        6. Set the location
        :return: True if the settings were applied, False if an error occurred
        '''
        try:
            # Wait a few seconds first, otherwise the "advanced search" link may not be found yet
            # time.sleep(3)
            # 1. Open the advanced-search dialog
            # self.driver.switch_to_default_content()  # searchInp_form

            self.wait.until(lambda driver: self.urldriver.find_element_by_xpath("//a[@class='adv_settiong']"))
            adv_elements = self.urldriver.find_element_by_xpath("//a[@class='adv_settiong']")
            adv_elements.click()

            # 2. Enter the search keyword in the keyword box
            time.sleep(5)
            keyword = self.cf.GetValue("adv_setting", "keywords")
            keyword = keyword.replace(' ', '')
            if len(keyword) > 0:
                js = "var obj = document.getElementsByName('keyword')[0];obj.value='" + keyword + "';"
                self.urldriver.execute_script(js)

            # 3. Choose the type; defaults to "all"
            # all: radio01; hot: radio02; original: radio03; people I follow: radio04; verified users: radio05; media: radio07
            type_select = self.cf.GetValue("adv_setting", "type_select")
            type_select = type_select.replace(' ', '')
            if len(type_select) > 0:
                type_elements = self.urldriver.find_element_by_id(type_select)
                # type_elements = self.driver.find_element_by_id("radio03")
                type_elements.click()

            # 4. Choose what the posts must contain; defaults to "all"
            # all: radio_sub1; with pictures: radio_sub2; with video: radio_sub3; with music: radio_sub4; with short links: radio_sub5
            contain_select = self.cf.GetValue("adv_setting", "contain_select")
            contain_select = contain_select.replace(' ', '')
            if len(contain_select) > 0:
                contain_elements = self.urldriver.find_element_by_id(contain_select)
                # contain_elements = self.driver.find_element_by_id("radio_sub2")
                contain_elements.click()

            # 5. Start date
            starttime = self.cf.GetValue("adv_setting", "stime")
            starttime = starttime.replace(' ', '')
            # If the start date is not set, there is no need to set the start hour either
            if len(starttime) > 0:
                js = 'var obj = document.getElementsByName("stime")[0];obj.removeAttribute("readonly");obj.value="' + starttime + '";'
                self.urldriver.execute_script(js)

            # Start hour, 0 to 23
            startHour = self.cf.GetValue("adv_setting", "startHour")
            startHour = startHour.replace(' ', '')
            if len(startHour) > 0:
                self.urldriver.find_element_by_xpath("//select[@name='startHour']/option[@value='0']").click()
                # startHour_element = self.driver.find_element_by_xpath("//select[@name='startHour']")
                # # startHour_element.find_element_by_xpath("//option[@value='" + startHour + "']").click()
                # Select(startHour_element).select_by_visible_text(startHour)

            # End date
            endtime = self.cf.GetValue("adv_setting", "etime")
            endtime = endtime.replace(' ', '')
            if len(endtime) > 0:
                js = 'var obj = document.getElementsByName("etime")[0];obj.removeAttribute("readonly");obj.value="' + endtime + '";'
                self.urldriver.execute_script(js)

            # End hour, 0 to 23
            endHour = self.cf.GetValue("adv_setting", "endHour")
            endHour = endHour.replace(' ', '')
            if len(endHour) > 0:
                self.urldriver.find_element_by_xpath("//select[@name='endHour']/option[@value='23']").click()
                # endHour_element = self.driver.find_element_by_xpath("//select[@name='endHour']")
                # endHour_element.find_element_by_xpath("//option[@value='0']").click()
                # Select(endHour_element).select_by_visible_text(endHour)

            # 6. Choose the province
            # self.driver.find_element_by_xpath("//select[@name='prov']/option[@value='11']").click()
            province = self.cf.GetValue("adv_setting", "province")
            province = province.replace(' ', '')
            if len(province) > 0:
                prov_element = self.urldriver.find_element_by_xpath("//select[@name='prov']")
                province = unicode(province, "utf8")
                Select(prov_element).select_by_visible_text(province)

            city = self.cf.GetValue("adv_setting", "city")
            city = city.replace(' ', '')
            if len(city) > 0:
                # Choose the city
                city_element = self.urldriver.find_element_by_xpath("//select[@name='city']")
                city = unicode(city, "utf8")
                Select(city_element).select_by_visible_text(city)

            # Click the "search Weibo" link
            ss_elements = self.urldriver.find_element_by_xpath("//a[@class='W_btn_cb']")
            ss_elements.click()
            # time.sleep(20)
            return True
        except Exception, e:
            return False


# Test: scrape Weibo data
obj = weibo()
obj.Set_CurrentUrl()
obj.CatchData()


Contents of the configuration file
[section]
#Path to the IE driver
iedriverserver = C:\Program Files\Internet Explorer\IEDriverServer.exe

pageinteraldelay = 5

#XPath label of the Weibo posts to scrape; separate multiple labels with semicolons
htmlLable = //div[@class='WB_cardwrap S_bg2 clearfix']

#XPath label used to get the number of pages to crawl
pageCountLable = //div[@class='layer_menu_list W_scroll']/ul/li
#URL of the start page
firstUrl = http://s.weibo.com/weibo/1?topnav=1&wvr=6&b=1

#current_url = http://s.weibo.com/weibo/%25E8%25B6%25B3%25E7%2590%2583&region=custom:11:8&scope=ori&suball=1&timescope=custom:2017-03-28:2017-03-28&Refer=g
#current_url = http://s.weibo.com/weibo/%25E8%25B6%25B3%25E7%2590%2583&region=custom:11:1000&scope=ori&suball=1&timescope=custom:2017-03-20:2017-03-28&page=1

#XPath label used to find the original URL of each Weibo post
weiboOriginalUrlLabel = //a[@class='W_textb']

#Settings for the fields of the Weibo advanced search
[adv_setting]

#Keyword to search for in the text box
keywords = 足球

#Type selection
type_select = radio03

#"Contains" selection
contain_select = radio_sub1

#Start date
stime = 2017-03-20
#Start hour, may be empty
starthour =

#End date
etime = 2017-03-28
#End hour, may be empty
endhour =

#Province
province = 北京
#City
city = 海淀区
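The code also imports two helper modules, IniFile and LogFile, which are the author's own and are not included in the post. The stand-ins below are only a guess at what they might look like, written to satisfy exactly the calls the scraper makes (IniFile.ConfigFile(path).GetValue(section, key) and LogFile.LogFile(path).WriteLog(message)); save ConfigFile as IniFile.py and LogFile as LogFile.py, and bear in mind the author's real modules may differ.

# coding=utf-8
# Hypothetical stand-ins for the author's IniFile and LogFile modules (not part of the original post).
import ConfigParser
import codecs
import time


class ConfigFile:
    '''IniFile.ConfigFile stand-in: a thin wrapper around ConfigParser.'''
    def __init__(self, path):
        self.parser = ConfigParser.ConfigParser()
        self.parser.read(path)

    def GetValue(self, section, key):
        # Return '' instead of raising, so callers can simply test len(value) > 0
        try:
            return self.parser.get(section, key)
        except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
            return ''


class LogFile:
    '''LogFile.LogFile stand-in: appends timestamped lines to a UTF-8 text file.'''
    def __init__(self, path):
        self.path = path

    def WriteLog(self, message):
        with codecs.open(self.path, 'a', 'utf-8') as f:
            f.write(u'%s %s\n' % (time.strftime('%Y-%m-%d %H:%M:%S'), message))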

Original post: https://www.cnblogs.com/shaosks/p/6644654.html