scrapy基本使用及选择器-阶梯

response.text  # 输出字符串类型 (str)

response.body  # 输出字节类型 (bytes)

# Re-wrap stdout so a Windows console can display CJK text.
# gb18030 is a superset of gbk and covers Chinese/Japanese/Korean.
import sys
import os
import io  # BUG FIX: io was used below but never imported

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
windows编码

gb18030 兼容性更好。中文,日文,韩文

# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import HtmlXPathSelector


class HuabanSpider(scrapy.Spider):
    """Demo spider illustrating the basic Scrapy selector API on huaban.com.

    NOTE(review): HtmlXPathSelector is the legacy selector class; modern
    Scrapy code would call response.xpath(...) directly.
    """
    name = 'huaban'
    allowed_domains = ['huaban.com']
    start_urls = ['http://huaban.com/']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        result = hxs.select("//a[@class='top_promotion']")            # SelectorList object
        result = hxs.select("//a[@class='top_promotion']").extract()  # list of '<a>...' strings
        # BUG FIX: the Scrapy method is extract_first(), not extract_one()
        result = hxs.select("//a[@class='top_promotion']").extract_first()  # first match or None

        result = hxs.select("//a[@class='top_promotion']/@href").extract_first()  # href attribute
        result = hxs.select("//a[@class='top_promotion']/text()").extract_first()
        # text content of the tag

        # Find all tags with class="recommend-imgbox recommend-box"
        # print(response.text)
        # recommend-line top_promotion

  推荐以下方式

hxs = Selector(response=response).xpath('//a')
# print(hxs)
# hxs.xpath('//a')

  常用方式

response = HtmlResponse(url='http://example.com', body=html,encoding='utf-8')
# hxs = HtmlXPathSelector(response)
# print(hxs)
# hxs = Selector(response=response).xpath('//a')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[2]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)
 

  #当前下面去找:./   或空    或*/

#到对象中循环找用法
# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
#     v = item.xpath('./a/span')
#     # 或
#     # v = item.xpath('a/span')
#     # 或
#     # v = item.xpath('*/a/span')
#     print(v)
#拿其文本
item.xpath('./span[@class="price"]/text()')
item.xpath('./span[@class="price"]/text()').extract_first()#拿其第一个文本
item.xpath('div[@class="item_t"]/div[@class="class"]//a/@href').extract_first()   #/儿子;//子孙

  

hxs.xpath('//div[@class="recommend-imgbox recommend-box"]')  #对象
hxs.xpath('//div[@class="recommend-imgbox recommend-box"]').extract() #列表
//#默认去整个html中找

  一般操作总结

// ---子孙

/ ---儿子

特殊:

item.xpath('//')  #从根目录开始

item.xpath('./')   #从相对当前位置,儿子中找

item.xpath('.//')   #从相对当前位置,子孙中找

item.xpath('a') #从相对当前儿子中找(不加/,也不加点.)

 注意:settings.py中设置DEPTH_LIMIT = 1来指定“递归”的层数。

/@属性名

/text()

yield Request(url='xxx', callback=self.parse)

原文地址:https://www.cnblogs.com/catherine007/p/8619000.html