Scrapy Study Notes

1. Scrapy can work with Selenium and PhantomJS to scrape dynamic pages. Plain Selenium driving a Firefox browser is already enough for dynamic pages,

but opening a browser window is resource-hungry, and a typical Linux server has no windowing system, so the windowless PhantomJS does the job instead; the principle is the same either way: the page's JS is loaded and rendered through WebKit.

Install Selenium with pip install selenium, download and unpack PhantomJS, add phantomjs.exe (or the phantomjs binary on Linux) to the system PATH, and the setup is done.
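To check that the pieces are wired up before writing any scraping logic, a minimal smoke test like the following is enough (a sketch; it only assumes the phantomjs binary is on the PATH):

#coding:utf-8
from selenium import webdriver

# Raises here if the phantomjs binary cannot be found on the PATH.
browser = webdriver.PhantomJS()
browser.get('http://www.zhaopin.com/')
print browser.title          # title of the page after WebKit has executed its JS
browser.quit()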

#coding:utf-8
import sys
reload(sys)
sys.setdefaultencoding("utf-8")   # Python 2 hack: write unicode job titles to files without explicit encoding

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from scrapy.selector import Selector
import time
import os

def writeFile(dirPath, page):
    # Parse the snapshot that was passed in, so title, update time and
    # link all come from the same rendered page.
    data = Selector(text = page).xpath("//td[@class='zwmc']/div/a")
    titles = data.xpath('string(.)').extract()
    timeMarks = Selector(text = page).xpath("//td[@class='gxsj']/span/text()").extract()
    links = Selector(text = page).xpath("//td[@class='zwmc']/div/a/@href").extract()

    for i in range(len(titles)):
        # Replace characters that are illegal or awkward in file names.
        fileName = (titles[i].replace(':', '-').replace('/', '-').replace('\\', '-')
                    .replace('*', 'x').replace('|', '-').replace('?', '-')
                    .replace('<', '-').replace('>', '-').replace('"', '-')
                    .replace('\n', '-').replace('\t', '-'))
        filePath = dirPath + os.sep + fileName + '.txt'

        with open(filePath, 'w') as fp:
            fp.write(titles[i])
            fp.write('$***$')
            fp.write(timeMarks[i])
            fp.write('$***$')
            fp.write(links[i])


def searchFunction(browser, url, keyWord, dirPath):
    browser.get(url)

    # Tick the city checkboxes, then confirm the selection.
    browser.find_element_by_xpath("//input[@id='buttonSelCity']").click()
    browser.find_element_by_xpath("//table[@class='sPopupTabC']/tbody/tr[1]/td/label/input[@iname='北京']").click()
    browser.find_element_by_xpath("//table[@class='sPopupTabC']/tbody/tr[1]/td/label/input[@iname='上海']").click()
    browser.find_element_by_xpath("//table[@class='sPopupTabC']/tbody/tr[3]/td/label/input[@iname='南京']").click()
    browser.find_element_by_xpath("//table[@class='sPopupTabC']/tbody/tr[4]/td/label/input[@iname='苏州']").click()
    browser.find_element_by_xpath("//table[@class='sPopupTabC']/tbody/tr[4]/td/label/input[@iname='无锡']").click()
    browser.find_element_by_xpath("//div[@class='sPopupTitle250']/div/a[1]").click()

    # Locate the search box
    searchBox = browser.find_element_by_xpath("//div[@class='keyword']/input[@type='text']")

    # Type the search keyword
    searchBox.send_keys(keyWord)

    # Submit the search
    browser.find_element_by_xpath("//div[@class='btn']/button[@class='doSearch']").click()

    # 40 results are shown per page, so (total - 1) / 40 "next page" clicks
    # are needed; the original int(totalCount) / 40 clicked once too often
    # when the count was an exact multiple of 40.
    totalCount = Selector(text = browser.page_source).xpath("//span[@class='search_yx_tj']/em/text()").extract()[0]
    pageOver = (int(totalCount) - 1) / 40
    for i in range(pageOver):
        time.sleep(3)        # give the page time to render before scraping it
        writeFile(dirPath, browser.page_source)
        browser.find_element_by_link_text("下一页").click()

    time.sleep(3)
    writeFile(dirPath, browser.page_source) 


if __name__ == '__main__':
    print 'START'
    url = 'http://www.zhaopin.com/'
    keyWord = u"华为技术有限公司"
    dirPath = keyWord + u"招聘信息"

    if not os.path.exists(dirPath):
        os.makedirs(dirPath)

    # Create the browser object; swap in Firefox for visual debugging.
    # browser = webdriver.Firefox()
    browser = webdriver.PhantomJS()
    searchFunction(browser, url, keyWord, dirPath)

    # quit() shuts down the whole PhantomJS process; close() would only close the window.
    # browser.close()
    browser.quit()
    print 'END'
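The script above drives the browser directly; to make Scrapy itself consume the rendered pages, one common pattern is a downloader middleware that hands each request to PhantomJS and returns the rendered HTML as the response (reference 3 below discusses this). The sketch below is one way to do it, not the original post's code; the class name and the js_render meta flag are illustrative.

# middlewares.py -- a minimal sketch of wiring Selenium into Scrapy.
from scrapy.http import HtmlResponse
from selenium import webdriver

class PhantomJSMiddleware(object):
    """Render requests marked with meta['js_render'] through PhantomJS."""

    def __init__(self):
        self.browser = webdriver.PhantomJS()

    def process_request(self, request, spider):
        if not request.meta.get('js_render'):
            return None                      # fall through to the normal downloader
        self.browser.get(request.url)        # PhantomJS executes the page's JS
        body = self.browser.page_source.encode('utf-8')
        # Returning a response here short-circuits the download; Scrapy
        # passes it straight to the spider's callback.
        return HtmlResponse(url=request.url, body=body,
                            encoding='utf-8', request=request)

Registered in settings.py with something like DOWNLOADER_MIDDLEWARES = {'myproject.middlewares.PhantomJSMiddleware': 543} (the project path and priority are placeholders), a spider can then ask for a rendered page with Request(url, meta={'js_render': True}).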

References:

1. Building a simple crawler with python+selenium+scrapy  http://blog.csdn.net/treasure_z/article/details/51064493

2. The art of data scraping (1): setting up a Selenium+PhantomJS environment  http://blog.chinaunix.net/uid-22414998-id-3692113.html

3. How to connect scrapy and selenium  https://segmentfault.com/q/1010000002958344

Original post: https://www.cnblogs.com/xunux/p/5907406.html