scrapy初步使用

使用scrapy 爬取数据
Scrapy is written in pure Python and depends on a few key Python packages (among others):

lxml, an efficient XML and HTML parser

parsel, an HTML/XML data extraction library written on top of lxml,

w3lib, a multi-purpose helper for dealing with URLs and web page encodings

twisted, an asynchronous networking framework

cryptography and pyOpenSSL, to deal with various network-level security needs


scrapy startproject tutorial  #建立爬虫项目


**************************************************************

#-*- coding:utf-8 -*-

import scrapy

class First(scrapy.Spider):
    """Minimal spider: download two listing pages and save each one to disk.

    Nothing is extracted from the pages. The class only illustrates the basic
    flow of a spider:

    1. declare the URLs to crawl;
    2. let the framework download each page;
    3. handle every downloaded response in ``parse``.
    """

    # Name of the spider, i.e. the argument given to ``scrapy crawl``.
    name = 'firstScrapy'

    # Pages requested when the spider starts.
    start_urls = [
        'http://lab.scrapyd.cn/page/1/',
        'http://lab.scrapyd.cn/page/2/',
    ]

    def parse(self, response):
        """Write the downloaded page to a local HTML file and log its name.

        Extraction with XPath, CSS selectors or regular expressions would go
        here; this example only stores the raw page.

        Args:
            response: the downloaded page; its ``url`` and ``body`` are read.
        """
        # The page number is the second-to-last "/"-separated segment of the
        # URL, e.g. ".../page/1/" gives "1".
        url_segments = response.url.split("/")
        page_number = url_segments[-2]

        # For page 1 this produces "first-1.html".
        output_name = 'first-%s.html' % page_number

        # response.body holds the downloaded page; write it in binary mode.
        with open(output_name, 'wb') as html_file:
            html_file.write(response.body)

        # Log message text means "saved file: <name>".
        self.log('保存文件: %s' % output_name)


E:\source\Scrapy\tutorial\tutorial\spiders>scrapy crawl firstScrapy   #执行

***************************************************************


命令行：
scrapy shell http://lab.scrapyd.cn  #获取response 对象

response.css('title')    # list of selectors for the elements matching the CSS selector "title"
response.css('title').extract()    # the matched elements as a list of strings
response.css('title').extract()[0]    # first string of that list, obtained by indexing

response.css('title').extract_first()   # only the first matched element
response.css('title::text').extract_first()   # text of the first matched element

**************************************************************************************
#-*- coding:utf-8 -*-

import scrapy

class GetMessage(scrapy.Spider):
    """Spider that saves the quotes found on the start page, one file per author.

    Every ``div.quote`` element on the page is appended to
    ``./data/<author>-语录.txt`` as two lines: the quote text, then its tags.
    """

    # Name of the spider, i.e. the argument given to ``scrapy crawl``.
    name = 'getmessage'

    # Page requested when the spider starts.
    start_urls = [
        'http://lab.scrapyd.cn/',
    ]

    def parse(self, response, **kwargs):
        """Extract each quote from ``response`` and append it to its author's file.

        Args:
            response: the downloaded page to extract quotes from.
            **kwargs: accepted for signature compatibility; not used here.
        """
        # Local import: the notes this snippet lives in only import scrapy.
        import os

        # open() does not create missing directories, so make sure the output
        # directory exists before the first write.
        os.makedirs('./data', exist_ok=True)

        for quote in response.css('div.quote'):
            # default='' keeps the string concatenation below from failing
            # with a TypeError when a quote block has no text node.
            text = quote.css('.text::text').extract_first(default='')
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()
            tag = ','.join(tags)  # list of tags -> one comma-separated string

            # One file per author: "<author>-语录.txt" inside ./data/.
            fileName = '%s-语录.txt' % author
            pathFile = './data/' + fileName

            # Append mode, so quotes by the same author accumulate in one file.
            with open(pathFile, 'a+', encoding='utf-8') as f:
                f.write(text + '\n')
                f.write('标签:' + tag + '\n')


***********************************************************************************************

#多页爬取

#-*- coding:utf-8 -*-

import scrapy

class GetMessage(scrapy.Spider):
    """Spider that saves quotes from every page, one file per author.

    Every ``div.quote`` element is appended to ``./data/<author>-语录.txt`` as
    two lines: the quote text, then its tags. After a page is processed the
    "next page" link, if present, is requested and handled by ``parse`` again.
    """

    # Name of the spider, i.e. the argument given to ``scrapy crawl``.
    name = 'getmessage'

    # Page requested when the spider starts.
    start_urls = [
        'http://lab.scrapyd.cn/',
    ]

    def parse(self, response, **kwargs):
        """Store the quotes of one page, then request the next page if any.

        Args:
            response: the downloaded page to extract quotes from.
            **kwargs: accepted for signature compatibility; not used here.

        Yields:
            A ``scrapy.Request`` for the next page when the current page
            contains a ``li.next a`` link; nothing otherwise.
        """
        # Local import: the notes this snippet lives in only import scrapy.
        import os

        # open() does not create missing directories, so make sure the output
        # directory exists before the first write.
        os.makedirs('./data', exist_ok=True)

        for quote in response.css('div.quote'):
            # default='' keeps the string concatenation below from failing
            # with a TypeError when a quote block has no text node.
            text = quote.css('.text::text').extract_first(default='')
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()
            tag = ','.join(tags)  # list of tags -> one comma-separated string

            # One file per author: "<author>-语录.txt" inside ./data/.
            fileName = '%s-语录.txt' % author
            pathFile = './data/' + fileName

            # Append mode, so quotes by the same author accumulate in one file.
            with open(pathFile, 'a+', encoding='utf-8') as f:
                f.write(text + '\n')
                f.write('标签:' + tag + '\n')

        # href of the "next page" link; None when there is no such link.
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            # The href may be relative; resolve it against the current URL.
            next_page = response.urljoin(next_page)
            # Schedule the next page and process it with this same callback.
            yield scrapy.Request(next_page, callback=self.parse)
来自:http://www.scrapyd.cn/doc/181.html