接触 Python 之后,想着可不可以自己爬它点数据,目的是能够过滤掉它这些令人头痛的广告,当然也可以顺带熟悉一下 Python 的 Scrapy 框架。那就开始吧。
# Create the Scrapy project skeleton named "btxxxx".
scrapy startproject btxxxx

# Generate a spider from the "crawl" (CrawlSpider) template.
# "btxxxx" is the spider name; "xxx.info" is the redacted target domain.
scrapy genspider -t crawl btxxxx xxx.info
spider的代码
# -*- coding: utf-8 -*-
"""Keyword-driven CrawlSpider: search, follow result pages, scrape detail pages.

The spider was written for Python 2; the small shims below keep that
behaviour and also let the module import under Python 3.
"""
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import urllib
from btxxxx.items import BtxxxxItem
import sys

try:
    from urllib import quote, unquote  # Python 2
except ImportError:
    from urllib.parse import quote, unquote  # Python 3

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str/unicode conversions use UTF-8 so
    # non-ASCII keywords and file names do not raise UnicodeDecodeError.
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')


class BtxxxxSpider(CrawlSpider):
    """Search the site for ``key_word`` and yield one item per detail page.

    Run with ``scrapy crawl btxxxx -a key_word=<term>``; the ``-a`` value is
    passed to ``__init__`` as ``key_word``.
    """

    name = 'btxxxx'
    # NOTE: the domain here and the host in ``__init__`` are redacted
    # placeholders. They must refer to the same site, otherwise Scrapy's
    # offsite filtering will drop the links extracted by ``rules``.
    allowed_domains = ['btxxxx.info']

    # Item fields, in the order their values appear in the cells of the
    # last table row of a detail page.
    _DETAIL_FIELDS = ('file_type', 'file_createtime', 'file_hot', 'file_size')

    rules = (
        # Search landing pages. The published snippet showed ``[sS]*``;
        # presumably the backslashes of ``[\s\S]*`` were lost in the paste.
        Rule(LinkExtractor(allow=r'/search/b-[\s\S]*\.html'),
             callback='root_url', follow=True),
        # Paginated result pages: /search/b-<token>/<n>-<n>.html
        Rule(LinkExtractor(allow=r'/search/b-[a-z,A-Z,0-9]+/[0-9]+-[0-9]+\.html'),
             callback='content_url', follow=True),
        # Detail pages: scraped, not followed further.
        Rule(LinkExtractor(allow=r'/wiki/.*\.html'),
             callback='parse_item', follow=False),
    )

    def __init__(self, key_word='', *args, **kwargs):
        """Build the single start URL from the search keyword.

        :param key_word: search term; it is URL-quoted and embedded in the
            search page path.
        """
        super(BtxxxxSpider, self).__init__(*args, **kwargs)
        self.key_words = key_word
        quote_str = quote(self.key_words)
        # The real host is intentionally not published.
        zero_url = 'http://www.xxxx.info/search/' + quote_str + '.html'
        self.start_urls = [zero_url]

    def root_url(self, response):
        """Callback for search landing pages; nothing is scraped here."""
        pass

    def content_url(self, response):
        """Callback for paginated result pages; nothing is scraped here."""
        pass

    def parse_item(self, response):
        """Scrape one detail page into a ``BtxxxxItem``.

        Fields are filled only when the corresponding nodes are present on
        the page; ``file_url`` is always set to the page URL.

        :param response: the detail page response.
        :returns: generator yielding a single item.
        """
        item = BtxxxxItem()

        # The title is emitted by inline JavaScript of the form
        #   document.write(decodeURIComponent("%E4..."+"%B8..."));
        # Strip the wrapper, the quotes and the "+" joins, then URL-decode.
        script_txt = response.xpath('//*[@id="wall"]/h2/script/text()').extract()
        if script_txt:
            encoded = (script_txt[0]
                       .replace('document.write(decodeURIComponent(', '')
                       .replace('));', '')
                       .replace('"', ''))
            item["file_name"] = unquote(str(encoded.replace('+', '')))

        file_nodes = response.xpath(
            '//*[@id="wall"]/div/table/tr[last()]/td/text()').extract()
        self.logger.debug('detail cells for %s: %r', response.url, file_nodes)
        # zip() stops at the shorter sequence, so a row with fewer than four
        # cells fills the fields it can instead of raising IndexError.
        for field, text in zip(self._DETAIL_FIELDS, file_nodes):
            item[field] = text.replace(' ', '')

        item["file_url"] = response.url

        file_link = response.xpath(
            '//*[@id="wall"]/div[1]/div[1]/div[2]/a/@href').extract()
        if file_link:
            item["file_link"] = file_link[0]

        yield item
items的代码
class BtxxxxItem(scrapy.Item):
    """Container for the data scraped from one detail page."""

    # Cells of the last table row on the detail page, spaces removed.
    file_type = scrapy.Field()
    file_createtime = scrapy.Field()
    file_hot = scrapy.Field()
    file_size = scrapy.Field()
    # Declared but not populated by the spider code shown with this item.
    file_count = scrapy.Field()
    # href of the link found on the detail page.
    file_link = scrapy.Field()
    # URL-decoded name taken from the page's inline script.
    file_name = scrapy.Field()
    # URL of the detail page itself.
    file_url = scrapy.Field()
settings 中添加
# Encoding used for exported feeds (such as the file given to `scrapy crawl -o`).
FEED_EXPORT_ENCODING = 'utf-8'
并启用 DEFAULT_REQUEST_HEADERS
执行scrapy (设置要检索的关键字和输出的文件)
# -a passes key_word to the spider's __init__; -o names the output file for scraped items.
scrapy crawl btxxxx -a key_word=xx -o xx.json
简单的爬数据而已,包含网址信息的代码我都已经在文章中删掉,只做学习使用
转载请标明出自 原文地址