[Python Crawler] Part 8: Scraping Weibo Data with Selenium + PhantomJS

  Basic idea: while logged in, open the home page, enter the query conditions in the advanced search box, and click the search link to run the search. If the results span multiple pages (20 records per page), read the page count and then loop over the pages, scraping the data on each one.

  One problem came up in practice. With the IE driver, the data could not be scraped after running the advanced search: only the first record came back and the rest were empty, for reasons I never figured out. PhantomJS scraped the data without trouble, but it had the opposite problem: it simply could not locate the "advanced search" link, so the advanced search could not be set up at all, whereas the IE driver handled that part fine. Given this, the two drivers are combined. The IE driver is used first to set the advanced search conditions and click the "高级搜索" (advanced search) link, which yields the URL of the first page of results; from that page the page count is read, and then PhantomJS takes over and scrapes the actual data.
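  To make that division of labor concrete, here is a minimal sketch of the handoff, condensed from the Set_CurrentUrl and CatchData methods in the full code below. The standalone function names and parameters are invented for illustration only; the real implementation is the weibo class further down.

# coding=utf-8
# Sketch of the two-driver handoff: the IE driver only sets up the advanced search and
# captures the result URL template plus the page count; PhantomJS then scrapes each page.
from selenium import webdriver


def get_result_url_and_pages(ie_driver_path, first_url, page_count_xpath):
    ie = webdriver.Ie(ie_driver_path)
    ie.get(first_url)
    # ... fill in the advanced-search dialog and click "search" here (see adv_Setting below) ...
    base_url = ie.current_url.replace('Refer=g', 'page=')          # result URL template
    page_count = len(ie.find_elements_by_xpath(page_count_xpath))  # one <li> per page in the pager
    ie.quit()
    return base_url, page_count


def scrape_pages(base_url, page_count, post_xpath):
    ghost = webdriver.PhantomJS()
    for page in range(1, page_count + 1):
        ghost.get(base_url + str(page))                            # e.g. ...&page=3
        for post in ghost.find_elements_by_xpath(post_xpath):
            print post.text.encode('utf8')
    ghost.quit()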

  1、Set the advanced search conditions
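  The setup amounts to: click the "高级搜索" link, fill in the keyword box via JavaScript, tick the type and "contains" radio buttons, remove the readonly attribute from the date inputs and write the dates, choose the province and city, and finally click the search button. A condensed sketch of this step, taken from the adv_Setting method in the full code below (the standalone helper function and its parameters are only for illustration):

# coding=utf-8
# Condensed sketch of step 1: filling in the Weibo advanced-search dialog with the IE driver.
# `driver` is assumed to be a webdriver.Ie instance already sitting on the search page.
import selenium.webdriver.support.ui as ui
from selenium.webdriver.support.select import Select


def set_advanced_search(driver, keyword, stime, etime, province, city):
    wait = ui.WebDriverWait(driver, 20)
    # Open the advanced-search dialog
    wait.until(lambda d: d.find_element_by_xpath("//a[@class='adv_settiong']"))
    driver.find_element_by_xpath("//a[@class='adv_settiong']").click()
    # Fill the keyword box via JavaScript
    driver.execute_script(
        "document.getElementsByName('keyword')[0].value='%s';" % keyword)
    # The date inputs are readonly, so drop the attribute first, then set the value
    for name, value in (("stime", stime), ("etime", etime)):
        driver.execute_script(
            'var obj=document.getElementsByName("%s")[0];'
            'obj.removeAttribute("readonly");obj.value="%s";' % (name, value))
    # Province / city dropdowns (pass unicode strings, e.g. u'北京', u'海淀区')
    Select(driver.find_element_by_xpath("//select[@name='prov']")).select_by_visible_text(province)
    Select(driver.find_element_by_xpath("//select[@name='city']")).select_by_visible_text(city)
    # Click the "search Weibo" button to run the search
    driver.find_element_by_xpath("//a[@class='W_btn_cb']").click()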

  

  2、Get the number of pages to crawl

  

  

  

# elements = self.urldriver.find_elements_by_xpath("//div[@class='layer_menu_list W_scroll']/ul/li")
elements = self.urldriver.find_elements_by_xpath(pageCountLable)
# number of pages to crawl
self.pageCount = len(elements)

3、Get the Weibo records

  

Elements = self.driver.find_elements_by_xpath("//div[@class='WB_cardwrap S_bg2 clearfix']")

4、Get the URL of each Weibo post:

   

  hrefElements = self.driver.find_elements_by_xpath("//a[@class='W_textb']")




 5、The full code is as follows

  

# coding=utf-8
import os
import re
from selenium import webdriver
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select
import IniFile
import LogFile
class weibo:

    def __init__(self):
        # Read the path to IEDriverServer.exe from the config file
        configfile = os.path.join(os.getcwd(), 'config.conf')
        self.cf = IniFile.ConfigFile(configfile)
        IEDriverServer = self.cf.GetValue("section", "IEDriverServer")
        # Delay after scraping each page, in seconds; defaults to 5
        self.pageDelay = 5
        pageInteralDelay = self.cf.GetValue("section", "pageInteralDelay")
        if pageInteralDelay:
            self.pageDelay = int(pageInteralDelay)

        os.environ["webdriver.ie.driver"] = IEDriverServer
        self.urldriver = webdriver.Ie(IEDriverServer)
        # self.driver = webdriver.PhantomJS()
        self.wait = ui.WebDriverWait(self.urldriver, 20)
        self.urldriver.maximize_window()

    def scroll_top(self):
        '''
        Scroll to the top of the page
        :return:
        '''
        if self.urldriver.name == "chrome":
            js = "var q=document.body.scrollTop=0"
        else:
            js = "var q=document.documentElement.scrollTop=0"
        return self.urldriver.execute_script(js)

    def scroll_foot(self):
        '''
        Scroll to the bottom of the page
        :return:
        '''
        if self.urldriver.name == "chrome":
            js = "var q=document.body.scrollTop=10000"
        else:
            js = "var q=document.documentElement.scrollTop=10000"
        return self.urldriver.execute_script(js)

    def logon(self):
        '''
        Log in
        :return:
        '''
        isneedLogon = False
        try:
            gn_login = self.driver.find_element_by_xpath("//div[@class='gn_login']")
            isneedLogon = True
        except Exception, e:
            if e.msg.find('Unable') > -1:  # the login element is missing, so we are already logged in
                print 'logon'
        if isneedLogon:
            userNameInput = self.driver.find_element_by_xpath("//input[@name='username']")
            userNameInput.send_keys('手机号')
            passwordInput = self.driver.find_element_by_xpath("//input[@name='password']")
            passwordInput.send_keys('XXXXX')
            # Submit the user name and password to log in
            logon_elements = self.driver.find_element_by_xpath("//a[@class='W_btn_a btn_32px']")
            logon_elements.click()

    def Set_CurrentUrl(self):
        firstUrl = self.cf.GetValue("section", "firstUrl")
        if len(firstUrl) > 0:
            self.urldriver.get(firstUrl)
            self.urldriver.implicitly_wait(5)
            self.adv_Setting()
            time.sleep(5)
            # self.urldriver.implicitly_wait(4)
            # Scroll to the bottom so the pager (page count) is visible
            self.scroll_foot()
            # URL of the result pages to crawl
            print self.urldriver.current_url
            if self.urldriver.current_url == firstUrl:
                time.sleep(5)

            self.current_url = self.urldriver.current_url.replace('Refer=g', 'page=')

            pageCountLable = self.cf.GetValue("section", "pageCountLable")
            try:
                # elements = self.urldriver.find_elements_by_xpath("//div[@class='layer_menu_list W_scroll']/ul/li")
                elements = self.urldriver.find_elements_by_xpath(pageCountLable)
                # Number of pages to crawl
                self.pageCount = len(elements)
                # print self.pageCount
            except Exception, e:
                print e.message

            # The IE driver has done its job; close it and switch to PhantomJS for the scraping
            self.urldriver.close()
            self.urldriver.quit()
            self.driver = webdriver.PhantomJS()
            self.wait = ui.WebDriverWait(self.driver, 20)
            self.driver.maximize_window()
        else:
            print 'please set first url'

    def CatchData(self):
        '''
        Scrape the data
        :return:
        '''
        start = time.clock()
        # # Print the page title
        # print self.driver.title
        htmls = self.cf.GetValue("section", "htmlLable").split(';')
        htmlLables = []
        for h in htmls:
            if len(h) > 0:
                htmlLables.append(h)
        logfile = os.path.join(os.getcwd(), r'log.txt')
        log = LogFile.LogFile(logfile)

        pageIndex = 1
        pageCount = self.pageCount
        pageCount = 2  # debug override: only scrape 2 pages; remove to scrape all pages
        recordCount = 0
        weiboOriginalUrlLabel = self.cf.GetValue("section", "weiboOriginalUrlLabel")
        while pageCount > 0:
            url = self.current_url + str(pageIndex)
            self.driver.get(url)
            # Wait up to 5 seconds
            self.driver.implicitly_wait(5)
            pageCount = pageCount - 1
            for className in htmlLables:
                self.wait.until(lambda driver: self.driver.find_elements_by_xpath(className))
                Elements = self.driver.find_elements_by_xpath(className)

                # Look up the original URL of each Weibo post
                urlList = []
                # self.wait.until(lambda driver: self.driver.find_elements_by_xpath("//a[@class='W_textb']"))
                # hrefElements = self.driver.find_elements_by_xpath("//a[@class='W_textb']")
                self.wait.until(lambda driver: self.driver.find_elements_by_xpath(weiboOriginalUrlLabel))
                hrefElements = self.driver.find_elements_by_xpath(weiboOriginalUrlLabel)
                for hrefe in hrefElements:
                    urlList.append(hrefe.get_attribute('href').encode('utf8'))

                self.driver.implicitly_wait(2)
                index = 0
                strMessage = ' '
                strsplit = ' ------------------------------------------------------------------------------------ '
                index = 0
                for element in Elements:
                    print ' '
                    txt = element.text.encode('utf8')
                    # The first character of each post's text is a stray 'c'; strip it
                    txt = txt[1:]
                    print ' '
                    print txt
                    print '微博链接:' + urlList[index]
                    print strsplit

                    strMessage = txt + " "
                    strMessage += '微博链接:' + urlList[index] + " "
                    strMessage += strsplit
                    strMessage = unicode(strMessage, 'utf8')
                    log.WriteLog(strMessage)
                    # self.printTopic(txt)
                    recordCount = recordCount + 1
                    index = index + 1

            pageIndex = pageIndex + 1
            self.driver.implicitly_wait(10)

        self.driver.close()
        self.driver.quit()
        end = time.clock()

        print ' '
        print "共抓取了: %d 页数据" % self.pageCount
        print "共抓取了: %d 个微博记录" % recordCount
        print "整个过程用时间: %f 秒" % (end - start)

    def adv_Setting(self):
        '''
        Advanced-search dialog setup:
        1. Click the "advanced search" link in the parent window to open the dialog
        2. Enter the search keyword in the keyword box
        3. Choose the type; defaults to "all"
        4. Choose what the posts must contain; defaults to "all"
        5. Set the start date, start hour, end date and end hour
        6. Set the location
        :return: True if the settings were applied, False if an error occurred
        '''
        try:
            # Wait a few seconds first, otherwise the "advanced search" link may not be found yet
            # time.sleep(3)
            # 1. Open the advanced-search dialog
            # self.driver.switch_to_default_content()  # searchInp_form

            self.wait.until(lambda driver: self.urldriver.find_element_by_xpath("//a[@class='adv_settiong']"))
            adv_elements = self.urldriver.find_element_by_xpath("//a[@class='adv_settiong']")
            adv_elements.click()

            # 2. Enter the search keyword in the keyword box
            time.sleep(5)
            keyword = self.cf.GetValue("adv_setting", "keywords")
            keyword = keyword.replace(' ', '')
            if len(keyword) > 0:
                js = "var obj = document.getElementsByName('keyword')[0];obj.value='" + keyword + "';"
                self.urldriver.execute_script(js)

            # 3. Choose the type; defaults to "all"
            # all: radio01; hot: radio02; original: radio03; people I follow: radio04; verified users: radio05; media: radio07
            type_select = self.cf.GetValue("adv_setting", "type_select")
            type_select = type_select.replace(' ', '')
            if len(type_select) > 0:
                type_elements = self.urldriver.find_element_by_id(type_select)
                # type_elements = self.driver.find_element_by_id("radio03")
                type_elements.click()

            # 4. Choose what the posts must contain; defaults to "all"
            # all: radio_sub1; with pictures: radio_sub2; with video: radio_sub3; with music: radio_sub4; with short links: radio_sub5
            contain_select = self.cf.GetValue("adv_setting", "contain_select")
            contain_select = contain_select.replace(' ', '')
            if len(contain_select) > 0:
                contain_elements = self.urldriver.find_element_by_id(contain_select)
                # contain_elements = self.driver.find_element_by_id("radio_sub2")
                contain_elements.click()

            # 5. Start date
            starttime = self.cf.GetValue("adv_setting", "stime")
            starttime = starttime.replace(' ', '')
            # If the start date is not set, there is no need to set the start hour either
            if len(starttime) > 0:
                js = 'var obj = document.getElementsByName("stime")[0];obj.removeAttribute("readonly");obj.value="' + starttime + '";'
                self.urldriver.execute_script(js)

            # Start hour, 0 to 23
            startHour = self.cf.GetValue("adv_setting", "startHour")
            startHour = startHour.replace(' ', '')
            if len(startHour) > 0:
                self.urldriver.find_element_by_xpath("//select[@name='startHour']/option[@value='0']").click()
                # startHour_element = self.driver.find_element_by_xpath("//select[@name='startHour']")
                # # startHour_element.find_element_by_xpath("//option[@value='" + startHour + "']").click()
                # Select(startHour_element).select_by_visible_text(startHour)

            # End date
            endtime = self.cf.GetValue("adv_setting", "etime")
            endtime = endtime.replace(' ', '')
            if len(endtime) > 0:
                js = 'var obj = document.getElementsByName("etime")[0];obj.removeAttribute("readonly");obj.value="' + endtime + '";'
                self.urldriver.execute_script(js)

            # End hour, 0 to 23
            endHour = self.cf.GetValue("adv_setting", "endHour")
            endHour = endHour.replace(' ', '')
            if len(endHour) > 0:
                self.urldriver.find_element_by_xpath("//select[@name='endHour']/option[@value='23']").click()
                # endHour_element = self.driver.find_element_by_xpath("//select[@name='endHour']")
                # endHour_element.find_element_by_xpath("//option[@value='0']").click()
                # Select(endHour_element).select_by_visible_text(endHour)

            # 6. Choose the province
            # self.driver.find_element_by_xpath("//select[@name='prov']/option[@value='11']").click()
            province = self.cf.GetValue("adv_setting", "province")
            province = province.replace(' ', '')
            if len(province) > 0:
                prov_element = self.urldriver.find_element_by_xpath("//select[@name='prov']")
                province = unicode(province, "utf8")
                Select(prov_element).select_by_visible_text(province)

            city = self.cf.GetValue("adv_setting", "city")
            city = city.replace(' ', '')
            if len(city) > 0:
                # Choose the city
                city_element = self.urldriver.find_element_by_xpath("//select[@name='city']")
                city = unicode(city, "utf8")
                Select(city_element).select_by_visible_text(city)

            # Click the "search Weibo" link
            ss_elements = self.urldriver.find_element_by_xpath("//a[@class='W_btn_cb']")
            ss_elements.click()
            # time.sleep(20)
            return True
        except Exception, e:
            return False


# Test: scrape Weibo data
obj = weibo()
obj.Set_CurrentUrl()
obj.CatchData()


Contents of the configuration file
[section]
#Path to the IE driver
iedriverserver = C:\Program Files\Internet Explorer\IEDriverServer.exe

pageinteraldelay = 5

#XPath label of the Weibo posts to scrape; separate multiple labels with semicolons
htmlLable = //div[@class='WB_cardwrap S_bg2 clearfix']

#XPath label used to get the number of pages to crawl
pageCountLable = //div[@class='layer_menu_list W_scroll']/ul/li
#URL of the start page
firstUrl = http://s.weibo.com/weibo/1?topnav=1&wvr=6&b=1

#current_url = http://s.weibo.com/weibo/%25E8%25B6%25B3%25E7%2590%2583&region=custom:11:8&scope=ori&suball=1&timescope=custom:2017-03-28:2017-03-28&Refer=g
#current_url = http://s.weibo.com/weibo/%25E8%25B6%25B3%25E7%2590%2583&region=custom:11:1000&scope=ori&suball=1&timescope=custom:2017-03-20:2017-03-28&page=1

#XPath label used to find the original URL of each Weibo post
weiboOriginalUrlLabel = //a[@class='W_textb']

#Settings for the fields of the Weibo advanced search
[adv_setting]

#Keyword to search for in the text box
keywords = 足球

#Type selection
type_select = radio03

#"Contains" selection
contain_select = radio_sub1

#Start date
stime = 2017-03-20
#Start hour, may be empty
starthour =

#End date
etime = 2017-03-28
#End hour, may be empty
endhour =

#Province
province = 北京
#City
city = 海淀区
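The code also imports two helper modules, IniFile and LogFile, which are the author's own and are not included in the post. The stand-ins below are only a guess at what they might look like, written to satisfy exactly the calls the scraper makes (IniFile.ConfigFile(path).GetValue(section, key) and LogFile.LogFile(path).WriteLog(message)); save ConfigFile as IniFile.py and LogFile as LogFile.py, and bear in mind the author's real modules may differ.

# coding=utf-8
# Hypothetical stand-ins for the author's IniFile and LogFile modules (not part of the original post).
import ConfigParser
import codecs
import time


class ConfigFile:
    '''IniFile.ConfigFile stand-in: a thin wrapper around ConfigParser.'''
    def __init__(self, path):
        self.parser = ConfigParser.ConfigParser()
        self.parser.read(path)

    def GetValue(self, section, key):
        # Return '' instead of raising, so callers can simply test len(value) > 0
        try:
            return self.parser.get(section, key)
        except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
            return ''


class LogFile:
    '''LogFile.LogFile stand-in: appends timestamped lines to a UTF-8 text file.'''
    def __init__(self, path):
        self.path = path

    def WriteLog(self, message):
        with codecs.open(self.path, 'a', 'utf-8') as f:
            f.write(u'%s %s\n' % (time.strftime('%Y-%m-%d %H:%M:%S'), message))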

Original post: https://www.cnblogs.com/shaosks/p/6644654.html