Using Scrapy's CrawlSpider

1. Create the project

Here the project is named scrapyuniversal and it is created in the root of the D drive, as follows.

Open cmd, switch to the root of the D drive, and run the following command:

scrapy startproject scrapyuniversal

If the command succeeds, a folder named scrapyuniversal is generated in the root of the D drive.
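
Put together, the whole session in cmd looks like this (d: switches the active drive and cd \ moves to its root; both are standard cmd commands):

d:
cd \
scrapy startproject scrapyuniversal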

2. Create the crawl template

Open a command window, change into the scrapyuniversal folder just created on the D drive, and run the following command:

scrapy genspider -t crawl china tech.china.com

If this succeeds, a new spider file appears in the spiders directory under scrapyuniversal. We will look at that spider file below; the code is commented.
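
For reference, before any edits the generated file is just Scrapy's stock crawl template with the spider name, allowed domain and start URL filled in. Depending on the Scrapy version it looks roughly like this:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ChinaSpider(CrawlSpider):
    name = 'china'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # item['name'] = response.xpath('//div[@id="name"]').extract()
        # item['description'] = response.xpath('//div[@id="description"]').extract()
        return item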

3. Directory structure

scrapyuniversal
│  scrapy.cfg
│  spider.sql
│  start.py
│  
└─scrapyuniversal
    │  items.py
    │  loaders.py
    │  middlewares.py
    │  pipelines.py
    │  settings.py
    │  __init__.py
    │  
    ├─spiders
    │  │  china.py
    │  │  __init__.py
    │  │  
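
The start.py at the project root is not explained in the post; it is presumably a small launcher so the crawl can be started from an IDE rather than the command line. A typical sketch of such a file (an assumption, not the author's actual code) would be:

from scrapy.cmdline import execute

# Equivalent to running "scrapy crawl china" from the project directory.
execute(['scrapy', 'crawl', 'china'])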

4. china.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import NewsItem
from ..loaders import ChinaLoader


class ChinaSpider(CrawlSpider):
    name = 'china'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/articles/']

    # When a rule follows a link, the response it returns is run through the
    # rules again for further extraction.
    # The second rule restricts paging to the first two pages.
    rules = (
        Rule(LinkExtractor(allow=r'article/.*\.html', restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        Rule(LinkExtractor(restrict_xpaths="//div[@id='pageStyle']//span[text()<3]")),
    )

    def parse_item(self, response):
        # Plain Item assignment, kept for reference: the saved output was messy,
        # so the ItemLoader version below is used instead.
        # item = NewsItem()
        # item['title'] = response.xpath("//h1[@id='chan_newsTitle']/text()").extract_first()
        # item['url'] = response.url
        # item['text'] = ''.join(response.xpath("//div[@id='chan_newsDetail']//text()").extract()).strip()
        # # re_first extracts the publish time with a regular expression
        # item['datetime'] = response.xpath("//div[@id='chan_newsInfo']/text()").re_first(r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
        # item['source'] = response.xpath('//div[@id="chan_newsInfo"]/text()').re_first("来源: (.*)").strip()
        # item['website'] = "中华网"
        # yield item

        loader = ChinaLoader(item=NewsItem(), response=response)
        loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
        loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()', re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
        loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()', re='来源:(.*)')
        loader.add_value('website', '中华网')
        yield loader.load_item()
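
Before running the full crawl, the first Rule's LinkExtractor can be sanity-checked interactively in scrapy shell; a quick sketch (the links actually returned depend on the live page):

scrapy shell http://tech.china.com/articles/
>>> from scrapy.linkextractors import LinkExtractor
>>> le = LinkExtractor(allow=r'article/.*\.html',
...                    restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]')
>>> for link in le.extract_links(response)[:3]:
...     print(link.url)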

5. loaders.py

#!/usr/bin/env python
# encoding: utf-8
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Join, Compose


class NewsLoader(ItemLoader):
    """
    Define a shared default output processor, TakeFirst.
    TakeFirst returns the first non-empty element of the collected values,
    equivalent to the extract_first() used with the plain item above.
    """
    default_output_processor = TakeFirst()


class ChinaLoader(NewsLoader):
    """
    Compose chains its arguments:
    Join (the first argument) concatenates the list into one string,
    and the second argument is a lambda that strips the result.
    """
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())
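
To see what these processors do outside of a crawl, here is a small standalone check (not part of the project) that feeds extracted-looking fragments through the same pipeline:

from scrapy.loader.processors import TakeFirst, Join, Compose

text_out = Compose(Join(), lambda s: s.strip())

# Join concatenates the fragments with a space, then the lambda strips
# leading/trailing whitespace from the combined string.
print(text_out(['  First paragraph. ', 'Second paragraph.  ']))
# -> 'First paragraph.  Second paragraph.'

# TakeFirst skips None and empty strings and returns the first real value.
print(TakeFirst()([None, '', 'real value']))
# -> 'real value'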

6. items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Field, Item


class NewsItem(Item):
    # headline
    title = Field()
    # article URL
    url = Field()
    # body text
    text = Field()
    # publish time
    datetime = Field()
    # source of the article
    source = Field()
    # site name, assigned the fixed value 中华网 (china.com)
    website = Field()
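
NewsItem behaves like a dict restricted to the declared fields; a quick interactive check (not part of the project) shows that assigning an undeclared key fails:

>>> from scrapyuniversal.items import NewsItem
>>> item = NewsItem(title='demo', website='中华网')
>>> item['title']
'demo'
>>> item['author'] = 'x'  # raises KeyError: NewsItem does not support field: author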

7. Middleware changes: logic for picking a random User-Agent

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
import random
class ProcessHeaderMidware():
    """process request add request info"""
    def __init__(self):
        self.USER_AGENT_LIST = ["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
         "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
         "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
         "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
         "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
         "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
         "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
         "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
         "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
         "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
         "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
         ]
    def process_request(self, request, spider):
        """
        Pick a random entry from the list and use it as the request's User-Agent.
        """
        ua = random.choice(self.USER_AGENT_LIST)
        spider.logger.info(msg='now entering downloader middleware')
        if ua:
            request.headers['User-Agent'] = ua
            # Add desired logging message here.
            spider.logger.info(u'User-Agent is : {} {}'.format(request.headers.get('User-Agent'), request))
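
Once the middleware is enabled (see DOWNLOADER_MIDDLEWARES in the settings below), a quick way to confirm it is running is to crawl with INFO logging and watch for the two messages it emits. The output looks roughly like this; the chosen User-Agent differs per request:

scrapy crawl china -s LOG_LEVEL=INFO
# ... [china] INFO: now entering downloader middleware
# ... [china] INFO: User-Agent is : b'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 ...' <GET http://tech.china.com/articles/>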

8. settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for scrapyuniversal project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapyuniversal'

SPIDER_MODULES = ['scrapyuniversal.spiders']
NEWSPIDER_MODULE = 'scrapyuniversal.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapyuniversal (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapyuniversal.middlewares.ScrapyuniversalSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scrapyuniversal.middlewares.ScrapyuniversalDownloaderMiddleware': 543,
#}
DOWNLOADER_MIDDLEWARES = {
    'scrapyuniversal.middlewares.ProcessHeaderMidware': 543,

}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'scrapyuniversal.pipelines.ScrapyuniversalPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

HTTP_PROXY = "127.0.0.1:5000"  # replace with the proxy you need
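
Note that none of the code shown above actually reads HTTP_PROXY; that part is not included in the post. A minimal sketch of a downloader middleware that would consume it (hypothetical, and it would also need its own entry in DOWNLOADER_MIDDLEWARES) could look like this:

class ProxyMiddleware(object):
    """Hypothetical middleware: attach the HTTP_PROXY setting to every request."""

    def __init__(self, http_proxy):
        self.http_proxy = http_proxy

    @classmethod
    def from_crawler(cls, crawler):
        # Read the custom HTTP_PROXY value defined in settings.py.
        return cls(crawler.settings.get('HTTP_PROXY'))

    def process_request(self, request, spider):
        if self.http_proxy:
            # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy'].
            request.meta['proxy'] = 'http://' + self.http_proxy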


Original post: https://www.cnblogs.com/c-x-a/p/9040548.html