Scraping consultation questions from the Human Resources and Social Security Bureau

Create the project

scrapy startproject shebao
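
For reference, startproject generates the standard Scrapy project layout (the exact file set varies slightly across Scrapy versions):

shebao/
    scrapy.cfg            # deploy configuration
    shebao/
        __init__.py
        items.py          # item definitions
        middlewares.py
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/
            __init__.py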

items.py

import scrapy


class ShebaoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()      # question title
    content = scrapy.Field()    # answer content
    url = scrapy.Field()        # page URL
    number = scrapy.Field()     # question number

Create a CrawlSpider, using the crawl template

scrapy genspider -t crawl SB www.bjrbj.gov.cn
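
The crawl template pre-fills a CrawlSpider skeleton with one sample Rule (allow=r'Items/'); it survives as the commented-out line in the rules tuple below and can simply be deleted.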

SB.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from shebao.items import ShebaoItem


class SbSpider(CrawlSpider):
    name = 'SB'
    allowed_domains = ['www.bjrbj.gov.cn']
    start_urls = ['http://www.bjrbj.gov.cn/mzhd/list_more.htm?stid=-1&ps=10&ishot=0&pn=1&ps=10']

    rules = (
        # follow the pagination links
        Rule(LinkExtractor(allow=r'&pn=\d+'), follow=True),
        # parse each question detail page
        Rule(LinkExtractor(allow=r'/mzhd/detail_\d+\.htm'), callback='parse_item'),
        # leftover sample rule from the crawl template:
        # Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = ShebaoItem()
        # question title
        item['title'] = response.xpath('//div[@class="xx_neirong"]/h1/text()').extract()[0]
        # question number
        item['number'] = response.xpath('//p[@class="jz_p1"]/text()').extract()[0]
        # answer content
        item['content'] = response.xpath('//p[@class="jz_p2"]/text()').extract()[0]
        # page URL
        item['url'] = response.url

        yield item
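
Before running the full crawl, the XPath expressions can be checked interactively in scrapy shell. The detail-page id below is a hypothetical placeholder; substitute a real detail_<id>.htm URL taken from the list page:

scrapy shell "http://www.bjrbj.gov.cn/mzhd/detail_123456.htm"
>>> response.xpath('//div[@class="xx_neirong"]/h1/text()').extract_first()
>>> response.xpath('//p[@class="jz_p1"]/text()').extract_first()
>>> response.xpath('//p[@class="jz_p2"]/text()').extract_first()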

pipelines.py

import json


class ShebaoPipeline(object):

    def __init__(self):
        self.filename = open("shebao.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # one JSON object per line, comma-terminated
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.filename.write(text)
        return item

    def close_spider(self, spider):
        self.filename.close()
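
Note that the comma-terminated lines written above do not form strictly valid JSON (no enclosing brackets, and a trailing comma on the last record). If well-formed output matters, Scrapy's built-in feed exports can replace the custom pipeline entirely:

scrapy crawl SB -o shebao.json    # a well-formed JSON array
scrapy crawl SB -o shebao.jl      # JSON Lines, one object per line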

settings.py

BOT_NAME = 'shebao'

SPIDER_MODULES = ['shebao.spiders']
NEWSPIDER_MODULE = 'shebao.spiders'


ITEM_PIPELINES = {
    'shebao.pipelines.ShebaoPipeline': 300,
}

LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"
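
Depending on your Scrapy version, the generated settings.py may also contain ROBOTSTXT_OBEY = True, which will block this crawl if the site's robots.txt disallows it. The two lines below are optional additions, not part of the original post:

ROBOTSTXT_OBEY = False   # assumption: only disable if robots.txt blocks the crawl
DOWNLOAD_DELAY = 1       # throttle requests to be polite to the site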

Run the crawl

scrapy crawl SB
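
If all goes well, the scraped records accumulate in shebao.json (via the pipeline) and the crawl log is written to dg.log (per LOG_FILE in settings.py).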

Original post: https://www.cnblogs.com/wanglinjie/p/9231519.html