Scrapy data storage to MySQL

Handling Scrapy data storage to MySQL:

User-Agent pool:

UserAgent = ["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
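
The pool above is only useful if something picks from it on every request; the usual approach is a downloader middleware that sets a random User-Agent. A minimal sketch (the middleware name, the USER_AGENT_LIST setting key, and the myproject path are assumptions, not from the original):

# middlewares.py -- rotate the User-Agent randomly for every outgoing request
import random

class RandomUserAgentMiddleware(object):
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # the pool is assumed to be exposed in settings.py as USER_AGENT_LIST = UserAgent
        return cls(crawler.settings.getlist('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agents)

# settings.py
# USER_AGENT_LIST = UserAgent
# DOWNLOADER_MIDDLEWARES = {'myproject.middlewares.RandomUserAgentMiddleware': 543}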

Synchronous database insert template:

# Synchronous insert pipeline
import pymysql
# MYSQL_HOST, MYSQL_PORT, etc. are assumed to be defined in the project's settings.py
from .settings import MYSQL_HOST, MYSQL_PORT, MYSQL_USER, MYSQL_PWD, MYSQL_DB


class MysqlPipeline(object):
    def open_spider(self, spider):
        self.db = pymysql.connect(host=MYSQL_HOST, port=MYSQL_PORT, user=MYSQL_USER, password=MYSQL_PWD,
                                  database=MYSQL_DB)
        self.cursor = self.db.cursor()
        print('database connection success')

    def process_item(self, item, spider):

        info_list = (item['company_name'], item['company_person'], item['company_money'],
                     item['company_establish'], item['company_phone'], item['company_address'],
                     item['company_status'])

        ins = """
            insert into test1(company_name,company_person,company_money,company_establish,company_phone,company_address' 
              'company_status,) VALUES (%s,%s,%s,%s,%s,%s,%s)"""
        try:
            self.cursor.execute(ins, info_list)
            self.db.commit()
            print('插入重构')
        except Exception as e:
            print(e)

        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()
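
The constants imported above (MYSQL_HOST, MYSQL_PORT, ...) presumably live in settings.py, and the pipeline must be registered in ITEM_PIPELINES. A minimal sketch with placeholder values (the myproject.pipelines path is an assumption):

# settings.py -- placeholder values
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PWD = 'your_password'
MYSQL_DB = 'your_db'

ITEM_PIPELINES = {
    'myproject.pipelines.MysqlPipeline': 300,  # path depends on the project name
}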

CSV storage:

import csv
class WangyinPipeline_a(object):
    def __init__(self):
        self.f = open("qichacha_sh.csv", "w", newline='', encoding='utf-8')
        self.writer = csv.writer(self.f)
        self.writer.writerow(
            ['company_name', 'company_city', 'company_person', 'company_money', 'company_establish', 'company_email',
             'company_phone', 'company_address', 'company_status', 'company_link'])

    def process_item(self, item, spider):
        wangyiyun_list = (item['company_name'], item['company_city'], item['company_person'], item['company_money'],
                          item['company_establish'], item['company_email'], item['company_phone'],
                          item['company_address'], item['company_status'], item['company_link'])

        self.writer.writerow(wangyiyun_list)
        return item

    def close_spider(self, spider):  # close the file
        self.f.close()
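
The header row above implies the item exposes these ten fields; a minimal items.py sketch matching it (the class name CompanyItem is an assumption):

# items.py -- fields taken from the CSV header row above
import scrapy

class CompanyItem(scrapy.Item):
    company_name = scrapy.Field()
    company_city = scrapy.Field()
    company_person = scrapy.Field()
    company_money = scrapy.Field()
    company_establish = scrapy.Field()
    company_email = scrapy.Field()
    company_phone = scrapy.Field()
    company_address = scrapy.Field()
    company_status = scrapy.Field()
    company_link = scrapy.Field()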

Insert template:

import pymysql


# Synchronous
class QccPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '155345',
            'database': 'qcc',
            'charset': 'utf8'

        }
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()
        self._sql = None

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (item['title'], ...))
        self.conn.commit()

        return item

    @property
    def sql(self):
        if not self._sql:
            # notes for writing the CREATE TABLE / INSERT statements
            self._sql = """
            text columns: longtext  (no length needed)
            time columns: datetime
            notes:
                id must not be null
                company_name = scrapy.Field() in the item
                        maps to the corresponding table column
             id title text content author touxiang time x-id
            insert into art( id,....) values(null,%s,%s,%s,%s....)

            """
        return self._sql
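
Filled in for a hypothetical table (the column names below are illustrative, not from the original), the placeholder could become a concrete statement, with process_item passing values in the same order:

# hypothetical concrete version of the _sql placeholder above
self._sql = """
    insert into art(id, title, content, author, create_time)
    values (null, %s, %s, %s, %s)
"""
# process_item then supplies matching values:
# self.cursor.execute(self.sql, (item['title'], item['content'],
#                                item['author'], item['create_time']))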

Asynchronous insert template:

# Asynchronous
from twisted.enterprise import adbapi  # Twisted database connection pool
from pymysql import cursors


class ty(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '155345',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor

        }
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        self._sql = None

    @property
    def sql(self):
        if not self._sql:
            # notes for writing the CREATE TABLE / INSERT statements
            self._sql = """
            text columns: longtext  (no length needed)
            time columns: datetime
            notes:
                id must not be null
                company_name = scrapy.Field() in the item
                        maps to the corresponding table column
             id title text content author touxiang time x-id
            insert into art( id,....) values(null,%s,%s,%s,%s....)

            """
        return self._sql

    def process_item(self, item, spider):
        # runInteraction takes the callable and its arguments separately, not as a tuple
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        cursor.execute(self.sql, (item['title'], ...))

    def handle_error(self, error, item, spider):
        print('=' * 10 + 'error' + '=' * 10)
        print(error)
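
Scrapy also accepts a Deferred returned from process_item; a drop-in variant of the method above under that approach (a sketch, not from the original) would be:

    def process_item(self, item, spider):
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        # returning the Deferred makes Scrapy wait for the insert before passing the item on
        defer.addCallback(lambda _: item)
        return defer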


Database insert error 1064 (MySQL syntax error):

import pymysql
# "settings" below refers to the project's settings module, e.g. from myproject import settings


class MysqlPipeline(object):

    def __init__(self):
        self.host = settings.DB_HOST
        self.port = settings.DB_PORT
        self.user = settings.DB_USER
        self.pwd = settings.DB_PWD
        self.name = settings.DB_NAME
        self.charset = settings.DB_CHARSET

        self.connect()

    def connect(self):
        self.conn = pymysql.connect(host=self.host,
                                    port=self.port,
                                    user=self.user,
                                    password=self.pwd,
                                    db=self.name,
                                    charset=self.charset)
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

    def process_item(self, item, spider):
        try:
            print('inserting ***************************************')
            # sql = 'insert into zb_contest_data(craw_date, cd_filename, cd_content, create_time) values("%s", "%s", "%s", "%s")' % (
            #     item['craw_date'], item['file_name'], str(item['file_content']), item['create_time'])
            file_count = pymysql.escape_string(item['file_content'])
            sql = 'insert into zb_contest_data(craw_date, cd_filename, cd_content, create_time) values("%s", "%s", "%s", "%s")' % (
                item['craw_date'], item['file_name'], file_count, item['create_time'])

            # sql = 'insert into zb_contest_data(craw_date, cd_filename, cd_content, create_time) values("%s", "%s", "%s", "%s")' % (
            #     item['craw_date'], item['file_name'], item['file_content'], item['create_time'])

            # 执行sql语句
            self.cursor.execute(sql)
            self.conn.commit()
           
        except Exception as e:
            print(e)
            pass
        return item
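
Error 1064 here comes from building the statement with %-string interpolation, so quotes inside file_content break the SQL; besides escaping the value, letting pymysql bind the parameters itself avoids the problem. A minimal sketch against the same table:

# parameterized version: pymysql quotes and escapes the values itself
sql = ('insert into zb_contest_data(craw_date, cd_filename, cd_content, create_time) '
       'values (%s, %s, %s, %s)')
self.cursor.execute(sql, (item['craw_date'], item['file_name'],
                          item['file_content'], item['create_time']))
self.conn.commit()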

Pagination and garbled output:

Garbled output:
	settings.py

Option 1: FEED_EXPORT_ENCODING = 'utf-8'    # add one of these two lines so exported Chinese text is decoded correctly; otherwise the output is garbled

Option 2: FEED_EXPORT_ENCODING = 'gb18030'

import scrapy

from movie.items import MovieItem

class MovieproSpider(scrapy.Spider):
    name = 'moviePro'
    allowed_domains = ['4567tv.tv']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']
    page = 1
    page_url = 'https://www.4567tv.tv/frim/index1-%s.html'


    def parse(self, response):
        li_list = response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]')
        for li in li_list:
            item = MovieItem()
            name = li.xpath('./div/div/h4/a/text()').extract_first()
            detail_url = 'https://www.4567tv.tv' + li.xpath('./div/div/h4/a/@href').extract_first()
            item['name'] = name
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

        if self.page <= 10:
            self.page += 1
            new_page_url = self.page_url % self.page
            yield scrapy.Request(url=new_page_url, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta['item']  # note: this item comes from meta, passed along by parse()
        actor = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[3]/a/text()').extract_first() 
        link = response.xpath('/html/body/div[1]/div/div/div/div[1]/a/@href').extract_first()
        item['actor'] = actor
        item['link'] = 'https://www.4567tv.tv' + link  # the link is a relative path, so rebuild the absolute URL
        yield item
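
The spider sets three fields on MovieItem; a minimal items.py sketch inferred from that usage (only these fields are known from the original):

# movie/items.py -- fields inferred from item['name'], item['actor'], item['link'] above
import scrapy

class MovieItem(scrapy.Item):
    name = scrapy.Field()
    actor = scrapy.Field()
    link = scrapy.Field()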

Database transaction handling:

A custom pipeline that saves the complete scraped data into MySQL (dbutil.MYSQLdbUtil is a project-specific database helper, not shown here):
class DingdianPipeline(object):
    def process_item(self, item, spider):
        dbu = dbutil.MYSQLdbUtil()
        dbu.getConnection()  # open the connection / start the transaction

        # 1. insert
        try:
            #sql = "insert into movies (电影排名,电影名称,电影短评,评价分数,评价人数)values(%s,%s,%s,%s,%s)"
            sql = "insert into ebook (novel_name,author,novelurl,serialstatus,serialsize,ebookdate,newchapter)values(%s,%s,%s,%s,%s,%s,%s)"
            #date = [item['rank'],item['title'],item['quote'],item['star']]
            #dbu.execute(sql, date, True)
            dbu.execute(sql, (item['novel_name'],item['author'],item['novelurl'],item['serialstatus'],item['serialsize'],item['date'],item['newchapter']),True)
            #dbu.execute(sql,True)
            dbu.commit()
            print('insert into database succeeded!!')
        except:
            dbu.rollback()
            dbu.commit()  # commit after the rollback
        finally:
            dbu.close()
        return item
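
Since dbutil.MYSQLdbUtil is project-specific, the same commit-on-success / rollback-on-error pattern written directly against pymysql would look roughly like this (connection parameters are placeholders):

# same transaction pattern with plain pymysql; host/user/password/database are placeholders
import pymysql

class DingdianPymysqlPipeline(object):
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='127.0.0.1', user='root', password='xxx',
                                    database='dingdian', charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        sql = ("insert into ebook (novel_name,author,novelurl,serialstatus,"
               "serialsize,ebookdate,newchapter) values (%s,%s,%s,%s,%s,%s,%s)")
        try:
            self.cursor.execute(sql, (item['novel_name'], item['author'], item['novelurl'],
                                      item['serialstatus'], item['serialsize'],
                                      item['date'], item['newchapter']))
            self.conn.commit()
        except Exception:
            self.conn.rollback()  # undo the failed insert
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()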
Original article: https://www.cnblogs.com/shaozheng/p/12781104.html