Scraping the 4567 movie site with the Scrapy framework

1. Create a spider named Movie: scrapy genspider Movie www.xxx.com
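For reference, genspider must be run inside an existing Scrapy project and also takes a domain argument; a minimal sketch of the full command sequence, assuming the project is named dianying (the package name used by the imports below) and using the placeholder domain from the commented-out allowed_domains:

scrapy startproject dianying
cd dianying
scrapy genspider Movie www.xxx.com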

2. Write the spider code in the spider file:

  

# -*- coding: utf-8 -*-
import scrapy
from dianying.items import DianyingItem


class MovieSpider(scrapy.Spider):
name = 'Movie'
#allowed_domains = ['www.xxx.com']
start_urls = ['https://www.4567kan.com/index.php/vod/show/class/爱情/id/7.html']
url = 'https://www.4567kan.com/index.php/vod/show/class/爱情/id/7/page/%d.html'
pageNumber = 2 # 爬取的页码

def parse(self, response):
li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
for li in li_list:
name = li.xpath('./div/a/@title')[0].extract()
detai_url = 'https://www.4567kan.com'+li.xpath('./div/a/@href').extract_first()
item = DianyingItem()
item['name'] = name
# 利用dedail_url对每个详情页面进行爬取
yield scrapy.Request(detai_url,callback=self.parse_detai,meta={'item':item}) # meta参数的作用,给回调函数
if self.pageNumber < 5: #爬取前五页的代码数据
new_url = format(self.url%self.pageNumber)
yield scrapy.Request(new_url,callback=self.parse)

def parse_detai(self,response):
item = response.meta['item']
desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
item['desc'] = desc
yield item # 把item传给管道
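The item is handed from parse to parse_detail through the request's meta dict. On Scrapy 1.7 and later the same hand-off can also be written with cb_kwargs, which delivers the item to the callback as a normal keyword argument; a minimal sketch of how the two relevant pieces of MovieSpider would look (everything else unchanged, not part of the original post):

# inside parse(), instead of the meta-based request:
yield scrapy.Request(detail_url, callback=self.parse_detail, cb_kwargs={'item': item})

# the callback then receives the item directly:
def parse_detail(self, response, item):
    item['desc'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
    yield item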

3. Write the item definition in items.py
import scrapy


class DianyingItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    desc = scrapy.Field()
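Only fields declared with scrapy.Field() can be assigned on the item; writing to an undeclared key raises a KeyError, which is why both name and desc are listed here. An illustrative interactive check (the actor field is made up for the example):

>>> item = DianyingItem()
>>> item['name'] = 'some movie'
>>> item['actor'] = 'x'
KeyError: 'DianyingItem does not support field: actor'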


4. Write the pipeline in pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

class DianyingPipeline:
conn = None
c = None
def open_spider(self,spider):
self.conn = pymysql.Connect(user='root', password='123456', host='localhost'
, port=3306, database='xuezhijun',charset = 'utf8')
self.c = self.conn.cursor()

def process_item(self, item, spider):
name = item['name']
desc = item['desc']
try:
self.c.execute('insert into DY values (%s,%s)',
(name, desc))
except Exception as e:
print(e)
self.conn.rollback()
self.conn.commit()
return item
def close_spider(self,spider):
self.c.close()
self.conn.close()
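The pipeline assumes the xuezhijun database and a two-column DY table already exist; the original post does not show the schema. A minimal one-off setup sketch with pymysql (column names and types are assumptions chosen to match the INSERT above; note that desc is a reserved word in MySQL and must be backtick-quoted):

import pymysql

conn = pymysql.Connect(user='root', password='123456', host='localhost',
                       port=3306, database='xuezhijun', charset='utf8')
with conn.cursor() as c:
    # two columns matching item['name'] and item['desc']
    c.execute('create table if not exists DY (name varchar(255), `desc` text)')
conn.commit()
conn.close()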

5. Adjust the settings file (settings.py)
ITEM_PIPELINES = {
    'dianying.pipelines.DianyingPipeline': 300,
}
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
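With the pipeline registered and the settings above in place, the crawl is started from the project directory:

scrapy crawl Movie

Since LOG_LEVEL is 'ERROR', only errors are printed to the console; the scraped name/desc pairs are written into the DY table by the pipeline.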


Original article: https://www.cnblogs.com/KingOfCattle/p/13038892.html