Scrapy in Depth: Hands-On

The Scrapy Framework

### Installation
- Environment setup:
    - mac/linux: pip install scrapy
    - windows:
        - pip install wheel
        - Download Twisted from http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
        - In the download directory, run pip install Twisted-17.1.0-cp35-cp35m-win_amd64.whl
        - pip install pywin32
        - pip install scrapy
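
A quick way to confirm the install succeeded is to ask Scrapy for its version:

# Print the installed Scrapy version; an error here means the install did not work
	scrapy version
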
# Create a new Scrapy project
	scrapy startproject ProName
# Enter the project directory
	cd ProName
# Create a spider file
	#   python interpreter   scrapy  genspider  <spider name>  <start url>
	python.exe  -m scrapy genspider first www.baidu.com
    
# Run the project
	python.exe -m scrapy crawl spiderName
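
Besides the scrapy crawl command, a spider can also be launched from an ordinary Python script through Scrapy's CrawlerProcess. A minimal sketch (the file name run.py and its location in the project root are assumptions):

# run.py -- launch a spider without the scrapy CLI
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # load the project's settings.py
process.crawl('first')                            # spider name, same as used by `scrapy crawl`
process.start()                                   # blocks until the crawl finishes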

Contents of first.py

# encoding:utf-8
import scrapy

# Spider class; its parent is Spider (other spider base classes exist as well)
class FirstSpider(scrapy.Spider):
    # Spider name: the unique identifier of this spider source file
    name = 'first'

    # Allowed domains; usually not used
    # allowed_domains = ['www.baidu.com']

    # List of start URLs:
        # 1. Every URL in the list is requested by Scrapy asynchronously
        # 2. Each response is handed to the parse callback below
    start_urls = ['https://www.baidu.com/','https://www.sogou.com/']

    # Used for data parsing: the callback that handles the result of each asynchronous request
    def parse(self, response):
        '''
        :param response: the response object
        :return:
        '''
        print(response)

Run the project to collect the data

### Running the spiders of a Scrapy project

## The robots.txt issue
	# In settings.py
    	ROBOTSTXT_OBEY = False
## Logging
	# Run with full log output
	- python.exe  -m scrapy crawl first
    # Run with no log output
	- python.exe  -m scrapy crawl first --nolog
	# Set the log level (and an optional log file) in settings.py
        LOG_LEVEL='ERROR'
        LOG_FILE='./LOG.log'

## UA spoofing, configured in settings.py
	USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'

Persistent storage

# Two approaches
	- Terminal-command-based persistence
    	- Can only write what the parse method returns to local disk
        # Command; the output file must use one of the formats Scrapy supports (see the example after this list)
        - scrapy crawl spiderName -o filepath
        
	- Pipeline-based persistence
    	# Coding steps:
			- 1. Parse the data in the spider file
			- 2. Define the corresponding fields/attributes in items.py to hold the parsed values
			- 3. In parse, wrap the parsed data in an item object
             - 4. In the spider's parse method, submit the item object to the pipeline:
                    yield item
             - 5. The pipeline's process_item method receives the item and can persist it any way you like
             - 6. Enable the pipeline in settings.py:
                	# 300 is the priority; the lower the number, the higher the priority
                	ITEM_PIPELINES = {
                	    'choutiPro.pipelines.ChoutiproPipeline': 300,
                	}
        # Each pipeline class defines one way of storing the data: MySQL, local files, PostgreSQL, etc. each need their own pipeline class, and every pipeline class must be registered in settings.py (a PostgreSQL sketch follows the MySQL pipeline below)
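
For the terminal-command approach, the -o option picks the export format from the file extension. A couple of examples (the output file names are arbitrary):

# Export the scraped items as JSON
	scrapy crawl chouti -o chouti.json
# Other built-in feed formats include jsonlines, csv and xml
	scrapy crawl chouti -o chouti.csv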

Example code

pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class ChoutiproPipeline:
    fp = None

    def open_spider(self, spider):
        '''
            # Overrides the parent-class method.
            # Runs exactly once, when the spider starts.
        :param spider:
        :return:
        '''
        print('Spider started')
        self.fp = open('chouti.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        '''
            # Overrides the parent-class method.
            # Runs exactly once, when the spider closes.
        :param spider:
        :return:
        '''
        print('Spider finished')
        self.fp.close()

    # Called for every item submitted by the spider; receives the item object
    def process_item(self, item, spider):
        '''

        :param item: the item object
        :param spider:
        :return:
        '''
        author = item['author']
        title = item['title']
        print(author, title)
        self.fp.write(author + ":" + title + "\n")

        return item


import pymysql


class ChoutiproPipelineMySQL():
    conn = None
    cursor = None

    def open_spider(self, spider):
        # pymysql.connect() is the standard way to open a connection
        self.conn = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='123',
            db='chouti_data',
            charset='utf8'
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        title = item['title']
        author = item['author']

        # Parameterized query: the driver escapes the values, so quotes in the
        # scraped text cannot break the SQL statement
        sql = "insert into chouti values (%s, %s)"
        try:
            self.cursor.execute(sql, (author, title))
            self.conn.commit()
        except Exception as e:
            print("Error:", e)
            self.conn.rollback()

        # Very important: return the item so it is passed on to the next pipeline class
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
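
As noted above, each storage backend gets its own pipeline class. Below is a minimal PostgreSQL sketch, assuming the psycopg2 driver and the same two-column chouti table; the class name and connection parameters are placeholders, and it would still need its own entry in ITEM_PIPELINES:

import psycopg2


class ChoutiproPipelinePostgres:
    conn = None
    cursor = None

    def open_spider(self, spider):
        # Hypothetical connection parameters -- adjust to the local PostgreSQL setup
        self.conn = psycopg2.connect(
            host='127.0.0.1',
            port=5432,
            user='postgres',
            password='123',
            dbname='chouti_data'
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            # Parameterized insert, same idea as the MySQL pipeline above
            self.cursor.execute(
                "insert into chouti values (%s, %s)",
                (item['author'], item['title'])
            )
            self.conn.commit()
        except Exception as e:
            print("Error:", e)
            self.conn.rollback()
        return item  # pass the item on to the next pipeline class

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()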

settings.py
# Scrapy settings for choutiPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'choutiPro'

SPIDER_MODULES = ['choutiPro.spiders']
NEWSPIDER_MODULE = 'choutiPro.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL='ERROR'
LOG_FILE='./LOG.log'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'choutiPro.middlewares.ChoutiproSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'choutiPro.middlewares.ChoutiproDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'choutiPro.pipelines.ChoutiproPipelineMySQL': 100,
   'choutiPro.pipelines.ChoutiproPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ChoutiproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    # scrapy.Field() works as a catch-all field type for any kind of value
    title = scrapy.Field()
    author = scrapy.Field()

chouti.py
import scrapy
from choutiPro.items import ChoutiproItem


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://dig.chouti.com/']

    def parse(self, response):
        # Scrapy builds XPath support into the response object
        # Parse each post's title and author
        div_list = response.xpath('//div[@class="link-con"]/div')
        all_data = []
        for div in div_list:
            title_text = div.xpath('.//div[@class="link-detail"]/a/text()').extract_first()
            author = div.xpath('.//div[@class="operate-author-con clearfix"]//span[@class="left author-name"]/text()').extract_first()
            """
            xpath() returns Selector objects, e.g.:
                [<Selector xpath='.//div[@class="link-detail"]/a/text()' data='河南博物院上新文物修复盲盒!竟然和做模型有这么多相似之处!'>] []
            # The string value still has to be pulled out of them:
                extract()        : extracts every match, returns a list
                extract_first()  : extracts the first match, returns the value itself
            """
            # 1. Approach one: terminal-command-based storage
            # all_data.append({"title_text": title_text, "author": author or 'DD'})
        # return all_data

            # 2. Approach two: pipeline-based storage
            # Instantiate an item object
            item = ChoutiproItem()
            # Set the item's fields; item['title'] = ... calls __setitem__
            item['title'] = title_text
            item['author'] = author

            # Submit the item to the pipeline
            yield item

Original post: https://www.cnblogs.com/dengz/p/14802134.html