爬虫---爬取豆瓣-科幻片-排行

scrapy

movie.py

# -*- coding: utf-8 -*-
import scrapy
import json
import re

from douban.items import DoubanItem


class MovieSpider(scrapy.Spider):
    """Crawl the Douban movie chart through its paginated JSON endpoint.

    The start URL requests chart ``type=17`` in pages of 20 records; each
    response is a JSON list of movie records.  ``parse`` turns every record
    into a ``DoubanItem`` and then schedules the next page.
    """

    name = 'movie'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=0&limit=20']

    def parse(self, response):
        """Yield one item per movie record, then a request for the next page.

        :param response: response whose body is a JSON list of records with
            the keys ``rank``, ``title``, ``score`` and ``vote_count``.
        :returns: generator of ``DoubanItem`` objects, optionally followed by
            one ``scrapy.Request`` for the following page.
        """
        datas = json.loads(response.body)
        if not datas:
            # An empty page means there is nothing left to crawl.
            return

        for data in datas:
            # Create a fresh item per record: re-yielding one shared mutable
            # item can let later records overwrite earlier ones that are
            # still being processed downstream.
            item = DoubanItem()
            item['movie_rank'] = data['rank']
            item['movie_name'] = data['title']
            item['movie_score'] = data['score']
            item['movie_people'] = data['vote_count']
            yield item

        # Recover the current offset from the URL that produced this page.
        match = re.search(r'start=(\d+)', response.url)
        if match is None:
            # Without an offset the next page cannot be computed.
            return

        start = int(match.group(1))
        if start < 201:
            url = 'https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=' + str(start + 20) + '&limit=20'
            self.logger.info('Next page: %s', url)
            yield scrapy.Request(url, callback=self.parse)

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DoubanItem(scrapy.Item):
    """Container for one movie record scraped by the ``movie`` spider."""

    # Filled from the record's 'rank' key.
    movie_rank = scrapy.Field()
    # Filled from the record's 'title' key.
    movie_name = scrapy.Field()
    # Filled from the record's 'score' key.
    movie_score = scrapy.Field()
    # Filled from the record's 'vote_count' key.
    movie_people = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class DoubanPipeline(object):
    """Append every scraped movie as one text line to ./douban_movie.txt."""

    def process_item(self, item, spider):
        """Write rank, name and score of ``item`` as a single line.

        The rank is left-justified to three characters and the columns are
        separated by five spaces.

        :param item: mapping with the keys ``movie_rank``, ``movie_name``
            and ``movie_score``.
        :param spider: the spider that produced the item (unused).
        :returns: the same ``item``, so later pipelines still receive it.
        """
        # Local import keeps this block self-contained; io.open gives the
        # same text-mode, explicit-encoding behaviour on Python 2 and 3.
        import io

        line = u'{}     {}     {}\n'.format(
            str(item['movie_rank']).ljust(3, ' '),
            item['movie_name'],
            item['movie_score'],
        )
        # Encode once at the I/O boundary instead of per field.
        with io.open('./douban_movie.txt', 'a', encoding='utf-8') as f:
            f.write(line)
        return item

mian.py

# -*- coding:utf-8 -*-
from scrapy import cmdline
# Truncate (or create) the result file so every run starts empty; the
# pipeline only appends to it.  A context manager closes the handle and
# avoids binding a module-level name that shadows the builtin `file`.
with open('./douban_movie.txt', 'w+'):
    pass
# Launch the crawl as if `scrapy crawl movie` had been typed in a shell.
cmdline.execute('scrapy crawl movie'.split())

保存结果 txt文件

1       盗梦空间             9.3
2       机器人总动员       9.3
3       星际穿越             9.2
4       楚门的世界          9.2
5       超感猎杀：完结特别篇     9.2
6       蝙蝠侠：黑暗骑士     9.1
7       攻壳机动队2：无罪     9.1