爬虫---爬取豆瓣-科幻片-排行

scrapy

movie.py

# -*- coding: utf-8 -*-
import scrapy
import json
import re

from douban.items import DoubanItem


class MovieSpider(scrapy.Spider):
    """Spider that walks Douban's paged JSON chart endpoint (type=17).

    Each response is a JSON array of movie records; one ``DoubanItem`` is
    yielded per record, then the next page is requested by advancing the
    ``start`` query parameter by 20.
    """

    name = 'movie'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=0&limit=20']

    def parse(self, response):
        """Yield one item per movie on this page, then request the next page.

        :param response: response whose body is a JSON array of records
            carrying ``rank``, ``title``, ``score`` and ``vote_count`` keys.
        :returns: generator of ``DoubanItem`` objects, followed by at most
            one ``scrapy.Request`` for the following page.
        """
        datas = json.loads(response.body)
        if not datas:
            # An empty page means the chart is exhausted; stop paginating.
            return

        for data in datas:
            # Build a fresh item per record: re-yielding one shared, mutated
            # instance would leave every consumer holding the same object.
            item = DoubanItem()
            item['movie_rank'] = data['rank']
            item['movie_name'] = data['title']
            item['movie_score'] = data['score']
            item['movie_people'] = data['vote_count']
            yield item

        # The current offset is read back from the URL that was just fetched.
        # The pattern needs \d (digits); a bare "d+" only matches the letter d.
        match = re.search(r'start=(\d+)', response.url)
        if match is None:
            return
        a = int(match.group(1))
        if a < 201:
            url = 'https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=' + str(a + 20) + '&limit=20'
            self.logger.debug('next page: %s', url)
            yield scrapy.Request(url, callback=self.parse)


items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DoubanItem(scrapy.Item):
    """Scraped record for a single movie taken from the chart listing."""

    # Position of the movie in the chart (filled from the record's 'rank').
    movie_rank = scrapy.Field()
    # Movie title (filled from the record's 'title').
    movie_name = scrapy.Field()
    # Rating value (filled from the record's 'score').
    movie_score = scrapy.Field()
    # Number of votes (filled from the record's 'vote_count').
    movie_people = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class DoubanPipeline(object):
    """Item pipeline that appends each movie to ``./douban_movie.txt``."""

    def process_item(self, item, spider):
        """Append one ``rank name score`` line for *item* and pass it on.

        :param item: mapping with ``movie_rank``, ``movie_name`` and
            ``movie_score`` keys.
        :param spider: the spider that produced the item (unused).
        :returns: the same *item*, so later pipelines and feed exporters
            still receive it.
        """
        # Imported locally so this module needs no extra top-level import.
        import io

        # Rank is left-justified to three characters so the columns line up.
        # Each record ends with a newline so the file has one movie per line.
        line = u'{0} {1} {2}\n'.format(
            str(item['movie_rank']).ljust(3, ' '),
            item['movie_name'],
            item['movie_score'],
        )
        # io.open with an explicit encoding writes UTF-8 text on both
        # Python 2 and Python 3 without manual .encode() calls.
        with io.open('./douban_movie.txt', 'a', encoding='utf-8') as f:
            f.write(line)
        return item

main.py

# -*- coding:utf-8 -*-
from scrapy import cmdline
# Truncate (or create) the output file before the crawl starts; the pipeline
# opens it in append mode, so without this step old results would pile up.
with open('./douban_movie.txt', 'w+'):
    pass

# Run the 'movie' spider exactly as the command line would.
crawl_argv = 'scrapy crawl movie'.split()
cmdline.execute(crawl_argv)

保存结果(txt 文件)

1       盗梦空间             9.3
2 机器人总动员 9.3
3 星际穿越 9.2
4 楚门的世界 9.2
5 超感猎杀:完结特别篇 9.2
6 蝙蝠侠:黑暗骑士 9.1
7 攻壳机动队2:无罪 9.1

原文地址:https://www.cnblogs.com/wozuilang-mdzz/p/9740418.html