Crawler Learning - Using CrawlSpider

Crawling with Scrapy's CrawlSpider class.

I had been using BaseSpider with chained callbacks, and ran into a problem: when title and date sit on one page but author and detail sit on another, how do you combine all four fields into a single item? I tried several approaches, including global variables, with no luck.
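
(For the record, the usual way to merge fields from two pages into one item is to pass the half-built item along in Request.meta. Below is a minimal sketch of that hand-off; the spider name, URLs, and all selectors are hypothetical placeholders, not from any real site.)

import scrapy

class TwoPageSpider(scrapy.Spider):
    # Sketch of the Request.meta hand-off between callbacks.
    # Name, URLs and XPaths are made-up placeholders.
    name = 'twopage'
    start_urls = ['http://example.com/list']

    def parse(self, response):
        for row in response.xpath('//div[@class="row"]'):
            # fields available on the listing page
            item = {
                'title': row.xpath('a/text()').extract_first(),
                'date': row.xpath('span/text()').extract_first(),
            }
            url = response.urljoin(row.xpath('a/@href').extract_first())
            # carry the half-built item to the detail page
            yield scrapy.Request(url, callback=self.parse_detail,
                                 meta={'item': item})

    def parse_detail(self, response):
        # fields available only on the detail page
        item = response.meta['item']
        item['author'] = response.xpath('//p[@class="author"]/text()').extract_first()
        item['detail'] = response.xpath('//div[@id="content"]//text()').extract()
        yield item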

So let's try the more advanced CrawlSpider approach.

Reference example code:

from scrapy.http import Request
# pre-1.0 import path: from scrapy.contrib.spiders import CrawlSpider
from scrapy.spiders import CrawlSpider
from scrapy.loader import ItemLoader
# SgmlLinkExtractor is deprecated; on Scrapy >= 1.0 prefer:
#   from scrapy.linkextractors import LinkExtractor
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from bbsdmoz.items import BbsdmozItem

class formSpider(CrawlSpider):
    name = 'bbsSpider'
    allowed_domains = ['bbs.sjtu.edu.cn']   # was misspelled as allow_domain
    start_urls = ['https://bbs.sjtu.edu.cn/bbsall']
    # raw-string regexes; the \w / \d escapes are easily lost when pasting
    link_extractor = {
        'page': SgmlLinkExtractor(allow=r'/bbsdoc,board,\w+\.html$'),
        'page_down': SgmlLinkExtractor(allow=r'/bbsdoc,board,\w+,page,\d+\.html$'),
        'content': SgmlLinkExtractor(allow=r'/bbscon,board,\w+,file,M\.\d+\.A\.html$'),
    }
    _x_query = {
        'page_content': '//pre/text()[2]',
        'poster': '//pre/a/text()',
        'forum': '//center/text()[2]',
    }

    def parse(self, response):
        # board index -> individual boards
        for link in self.link_extractor['page'].extract_links(response):
            yield Request(url=link.url, callback=self.parse_page)

    def parse_page(self, response):
        # follow pagination, then each post on the current page
        for link in self.link_extractor['page_down'].extract_links(response):
            yield Request(url=link.url, callback=self.parse_page)

        for link in self.link_extractor['content'].extract_links(response):
            yield Request(url=link.url, callback=self.parse_content)

    def parse_content(self, response):
        bbsItem_loader = ItemLoader(item=BbsdmozItem(), response=response)
        bbsItem_loader.add_value('url', response.url)
        bbsItem_loader.add_xpath('forum', self._x_query['forum'])
        bbsItem_loader.add_xpath('poster', self._x_query['poster'])
        bbsItem_loader.add_xpath('content', self._x_query['page_content'])
        return bbsItem_loader.load_item()
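
The loader assumes a BbsdmozItem with url, forum, poster, and content fields. A minimal bbsdmoz/items.py might look like this; only the field names come from the spider above, and the output processors are an assumption:

import scrapy
from scrapy.loader.processors import TakeFirst, Join

class BbsdmozItem(scrapy.Item):
    # field names implied by the add_value/add_xpath calls above;
    # the processors are assumptions, not from the original post
    url = scrapy.Field(output_processor=TakeFirst())
    forum = scrapy.Field(output_processor=TakeFirst())
    poster = scrapy.Field(output_processor=TakeFirst())
    content = scrapy.Field(output_processor=Join())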

After some light rework, the code becomes:

class MySpider6(CrawlSpider):
    name = 'myspider6'
    allowed_domains = ['10.60.32.179']
    start_urls = [
        'http://10.60.32.179/Site/Site1/myindex.shtml',
        # 'http://example.com/page2',
    ]
    # one extractor is enough here; raw string keeps \w and \. intact
    link_extractor = {
        'page': SgmlLinkExtractor(allow=r'/Article/\w+/\w+\.shtml$'),
    }

    _x_query = {
        'date': 'span/text()',
        'title': 'a/text()',
    }
    _y_query = {
        'detail': '/html/body/center/div/div[4]/div[1]/p[1]',
    }

    def parse(self, response):
        # article links -> content pages
        for link in self.link_extractor['page'].extract_links(response):
            yield Request(url=link.url, callback=self.parse_content)

    def parse_content(self, response):
        bbsItem_loader = ItemLoader(item=DmozItem(), response=response)
        bbsItem_loader.add_value('desc', response.url)
        bbsItem_loader.add_xpath('title', self._x_query['title'])
        bbsItem_loader.add_xpath('date', self._x_query['date'])
        bbsItem_loader.add_xpath('detail', self._y_query['detail'])
        return bbsItem_loader.load_item()
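
One thing worth noting: both spiders override parse(), which in a CrawlSpider is reserved for the rules machinery, so neither actually uses the CrawlSpider features; with no rules defined they behave like plain Spiders. Here is a sketch of the same crawl written with the rules idiom and the modern LinkExtractor. The regex comes from the spider above; the inline XPaths are adapted guesses, and the dict output stands in for the DmozItem loader:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class MySpider6Rules(CrawlSpider):
    name = 'myspider6_rules'
    allowed_domains = ['10.60.32.179']
    start_urls = ['http://10.60.32.179/Site/Site1/myindex.shtml']

    # CrawlSpider follows matching links automatically; parse() is NOT
    # overridden, the callback is named on the Rule instead
    rules = (
        Rule(LinkExtractor(allow=r'/Article/\w+/\w+\.shtml$'),
             callback='parse_content'),
    )

    def parse_content(self, response):
        # plain dict instead of DmozItem/ItemLoader, for brevity
        yield {
            'desc': response.url,
            'title': response.xpath('//a/text()').extract_first(),
            'date': response.xpath('//span/text()').extract_first(),
            'detail': response.xpath(
                '/html/body/center/div/div[4]/div[1]/p[1]//text()').extract(),
        }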

Run it: success.

D:\test-python\tutorial>Python27\Scripts\scrapy.exe crawl myspider6 -o ee.json

Original post: https://www.cnblogs.com/javajava/p/4833581.html