Python 2.7 + Scrapy + MongoDB: Scraping Douban Movie Top 250
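
Assuming Scrapy is already installed, the project skeleton used in the steps below can be generated with Scrapy's command-line tools (the project name doubandy matches the imports in the code):

    scrapy startproject doubandy
    cd doubandy
    scrapy genspider dbdy douban.com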

  1. Configure the items.py file
      import scrapy


      class DoubandyItem(scrapy.Item):
          # define the fields for your item here like:
          # name = scrapy.Field()
          title = scrapy.Field()  # movie title
          bd = scrapy.Field()     # director / cast / year summary line
          star = scrapy.Field()   # rating score
          quote = scrapy.Field()  # one-line tagline (absent for some movies)
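
     For reference, a DoubandyItem behaves like a dict whose keys are restricted to the declared fields; a minimal sketch (not from the original post):

      from doubandy.items import DoubandyItem

      item = DoubandyItem()
      item['title'] = u'肖申克的救赎'  # declared fields assign like dict keys
      print(item.get('quote'))        # unset fields read as None via get()
      # item['year'] = 1994           # would raise KeyError: undeclared field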
  2. Configure the settings.py file
    # spoof a desktop browser User-Agent so Douban does not reject requests
    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'

    # ignore robots.txt and send no cookies to reduce the chance of being banned
    ROBOTSTXT_OBEY = False
    COOKIES_ENABLED = False

    # enable the MongoDB pipeline configured in step 4
    ITEM_PIPELINES = {
        'doubandy.pipelines.DoubandyPipeline': 300,
    }
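
    The 300 above is a pipeline priority (0-1000; lower values run first when several pipelines are enabled). Not in the original post, but a download delay is often added alongside these anti-ban settings, e.g.:

    DOWNLOAD_DELAY = 2.5  # optional: pause between requests to avoid rate limiting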
  3. Configure the spider file, spiders/dbdy.py
      # -*- coding: utf-8 -*-
      import scrapy
      from doubandy.items import DoubandyItem

      class DbdySpider(scrapy.Spider):
          name = 'dbdy'
          allowed_domains = ['douban.com']
          offset = 0
          url = 'https://movie.douban.com/top250?start='
          start_urls = [url + str(offset)]

          def parse(self, response):
              # each movie entry sits in a div with class "info"
              movies = response.xpath("//div[@class='info']")

              for each in movies:
                  # build a fresh item per movie so a field from one entry
                  # (e.g. a quote) cannot leak into the next iteration
                  item = DoubandyItem()

                  item['title'] = each.xpath(".//span[@class='title'][1]/text()").extract()[0]
                  item['bd'] = each.xpath(".//div[@class='bd']/p/text()").extract()[0]
                  item['star'] = each.xpath(".//div[@class='star']/span[@class='rating_num']/text()").extract()[0]

                  # not every movie has a one-line quote, so guard the index
                  quote = each.xpath(".//p[@class='quote']/span/text()").extract()
                  if len(quote) != 0:
                      item['quote'] = quote[0]

                  yield item

              # page through all 10 result pages: start=0, 25, ..., 225
              if self.offset < 225:
                  self.offset += 25
                  yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
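
     Once the spider is saved, the crawl can be started from the project root; dbdy is the name attribute defined above:

    scrapy crawl dbdy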

  4. Configure the pipelines.py file
      # -*- coding: utf-8 -*-

      # Define your item pipelines here
      #
      # Don't forget to add your pipeline to the ITEM_PIPELINES setting
      # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

      import pymongo


      class DoubandyPipeline(object):
          def __init__(self):
              host = '127.0.0.1'
              port = 27017
              dbname = 'douban'
              sheetname = 'doubanmovies'

              # connect to the local MongoDB instance
              client = pymongo.MongoClient(host=host, port=port)

              # the database and collection are created lazily on first insert
              mydb = client[dbname]
              self.post = mydb[sheetname]

          def process_item(self, item, spider):
              data = dict(item)
              # insert() matches the pymongo 2.x API of the original post;
              # on pymongo 3+ use insert_one() instead
              self.post.insert(data)

              return item
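
     Hardcoding the connection details works, but they can also live in settings.py and be read through Scrapy's from_crawler hook. A sketch of that pattern; the MONGODB_* setting names are assumptions, not from the original post:

      import pymongo

      class MongoSettingsPipeline(object):
          """Variant of DoubandyPipeline that reads its config from settings.py."""

          def __init__(self, host, port, dbname, sheetname):
              client = pymongo.MongoClient(host=host, port=port)
              self.post = client[dbname][sheetname]

          @classmethod
          def from_crawler(cls, crawler):
              # MONGODB_* are hypothetical setting names, not Scrapy built-ins;
              # define them in settings.py or fall back to the defaults below
              s = crawler.settings
              return cls(
                  host=s.get('MONGODB_HOST', '127.0.0.1'),
                  port=s.getint('MONGODB_PORT', 27017),
                  dbname=s.get('MONGODB_DB', 'douban'),
                  sheetname=s.get('MONGODB_COLLECTION', 'doubanmovies'),
              )

          def process_item(self, item, spider):
              self.post.insert(dict(item))  # insert_one() on pymongo 3+
              return item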

    Data preview:
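
    One way to inspect what landed in the collection is a quick pymongo query (a sketch, assuming mongod is running on the default local port):

      import pymongo

      client = pymongo.MongoClient('127.0.0.1', 27017)
      coll = client['douban']['doubanmovies']

      print(coll.count_documents({}))  # 250 after a full crawl; coll.count() on old pymongo
      for doc in coll.find().limit(3):
          print(doc['title'])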

Original post: https://www.cnblogs.com/hizf/p/8245758.html