scrapy crawl itcast -o teachers.json 爬虫案例

  1. spider.py文件配置
      # -*- coding: utf-8 -*-
      import scrapy
      from itTeachers.items import ItteachersItem


      class ItcastSpider(scrapy.Spider):
          """Spider that collects the teacher entries from the itcast.cn teacher page.

          Run it with ``scrapy crawl itcast -o teachers.json`` to export the
          scraped items as JSON.
          """

          name = 'itcast'
          allowed_domains = ['itcast.cn']
          start_urls = ['http://www.itcast.cn/channel/teacher.shtml#']

          def parse(self, response):
              """Build one item per ``<div class="li_txt">`` node in the response.

              Args:
                  response: The downloaded response for a URL in ``start_urls``.

              Returns:
                  A list of ``ItteachersItem`` objects with the ``name``,
                  ``title`` and ``info`` fields taken from the node's ``h3``,
                  ``h4`` and ``p`` text. A field is ``None`` when the
                  corresponding text node is missing.
              """
              # Debugging aid: to inspect the raw page, write response.body to a
              # file opened in binary mode ("wb").

              items = []

              for node in response.xpath('//div[@class="li_txt"]'):
                  # Wrap the extracted data in an ItteachersItem object.
                  item = ItteachersItem()

                  # extract_first() returns the first match or None. Indexing
                  # the extract() list with [0] would raise IndexError on an
                  # entry that lacks one of these tags, aborting the whole
                  # parse and discarding the items already collected.
                  item['name'] = node.xpath('h3/text()').extract_first()
                  item['title'] = node.xpath('h4/text()').extract_first()
                  item['info'] = node.xpath('p/text()').extract_first()

                  items.append(item)

              # Hand the complete list back to the caller in one go.
              return items
  2. items.py文件配置
      # -*- coding: utf-8 -*-

      # Models for the scraped items.
      #
      # See documentation in:
      # https://doc.scrapy.org/en/latest/topics/items.html

      import scrapy


      class ItteachersItem(scrapy.Item):
          """Item holding the three text fields scraped for one teacher entry."""

          # The spider listing fills these from the entry's h3, h4 and p text
          # respectively.
          name = scrapy.Field()
          title = scrapy.Field()
          info = scrapy.Field()

原文地址:https://www.cnblogs.com/hizf/p/8270008.html