XPATH应用

  1 # -*- coding:utf-8 -*-
  2 '''
  3 Created on Sep 10, 2018
  4 
  5 @author: SaShuangYiBing
  6 '''
  7 from lxml import etree
  8 
  9 html='''
 10 <html>
 11     <head>
 12         <title>哈哈测试一下</title>
 13         <link type="text/css" rel="stylesheet" href="haha.css" />
 14         <link type="text/css" rel="stylesheet" href="haha1.css" />
 15         <link type="text/css" rel="stylesheet" href="haha2.css" />
 16         <script type="text/javascript" src="haha.js"></script>
 17         <script type="text/javascript" src="haha1.js"></script>
 18         <script type="text/javascript" src="haha2.js"></script>
 19     </head>
 20     <body>
 21         <div id="id1" class="class1">
 22             <div id="id2" class="class2">
 23                 <ul class="cls_ul1">
 24                     <li class="cls_li1">
 25                         <div class="cls_3">
 26                             <span>span_text1</span>
 27                             <span>span_text2</span>
 28                             <i>text_1</i>
 29                         </div>
 30                         <div>
 31                             <a href="a_1.html">a_1</a>
 32                             <a href="a_2.html">a_2</a>
 33                             <a href="a_3.html">a_3</a>
 34                         </div>
 35                         <div class="cls_4">
 36                             <a href="a_4.html">
 37                                 <img href="a_img1.jpg" />
 38                             </a>
 39                         </div>
 40                     </li>
 41                     <li class="cls_li1">
 42                         <div class="cls_3">
 43                             <span>span_text3</span>
 44                             <span>span_text4</span>
 45                             <i>text_2</i>
 46                             <i>text_22</i>
 47                         </div>
 48                         <div>
 49                             <a href="a_4.html">a_4</a>
 50                             <a href="a_5.html">a_5</a>
 51                             <a href="a_6.html">a_6</a>
 52                         </div>
 53                         <div class="cls_4">
 54                             <a href="a_5.html">
 55                                 <img href="a_img2.jpg" />
 56                             </a>
 57                         </div>
 58                     </li>
 59                 </ul>
 60             </div>
 61             <div id="id3" class="class3">
 62                 <ul class="cls_ul2">
 63                     <li class="cls_li2">
 64                         <div class="cls_5">
 65                             <span>span_text5</span>
 66                             <span>span_text6</span>
 67                             <i>text_3</i>
 68                         </div>
 69                         <div>
 70                             <a href="a_1.html">a_1</a>
 71                             <a href="a_2.html">a_2</a>
 72                             <a href="a_3.html">a_3</a>
 73                         </div>
 74                         <div class="cls_6">
 75                             <a href="a_4.html">
 76                                 <img href="a_img3.jpg" />
 77                             </a>
 78                         </div>
 79                     </li>
 80                     <li class="cls_li2">
 81                         <div class="cls_5">
 82                             <span>span_text7</span>
 83                             <span>span_text8</span>
 84                             <i>text_4</i>
 85                         </div>
 86                         <div>
 87                             <a href="a_4.html">a_4</a>
 88                             <a href="a_5.html">a_5</a>
 89                             <a href="a_6.html">a_6</a>
 90                         </div>
 91                         <div class="cls_6">
 92                             <a href="a_5.html">
 93                                 <img href="a_img4.jpg" />
 94                             </a>
 95                         </div>
 96                     </li>
 97                 </ul>
 98             </div>
 99         </div>
100     </body>
101 </html>
102 '''
103 
104 html_data = etree.HTML(html)
105 
106 # 1、从根节点开始,沿着XML路径一步一步选择节点,text()表示节点内容
107 content = html_data.xpath("/html/head/title/text()")
108 for con in content:
109     print (con)
110 print ("~~~~~~~~~这是第一个分隔线~~~~~~~~~")
111 
112 # 2、从根节点开始,沿着XML路径一步一步选择节点,text表示节点内容
113 nodes = html_data.xpath("/html/head/title")
114 for i in nodes:
115     print (i.text)
116 print ("~~~~~~~~~这是第二个分隔线~~~~~~~~~") 
117    
118 # 3、从文档中某个节点开始,不考虑此节点位置,text()表示节点内容
119 content = html_data.xpath("//title/text()")
120 for con in content:
121     print (con)
122 print ("~~~~~~~~~这是第三个分隔线~~~~~~~~~")  
123   
124 # 4、获取所有div(html/body/div/div)的id属性值
125 nodes = html_data.xpath("/html/body/div/div")
126 for i in range(len(nodes)):
127     content = nodes[i].xpath("@id")
128     for con in content:
129         print (con)
130 print ("~~~~~~~~~这是第四个分隔线~~~~~~~~~") 
131    
132 # 5、body节点下某节点的属性值
133 content = html_data.xpath("body/div/div[@id= 'id2']/ul/li[1]/div[2]/a/@href")
134 for con in content:
135     print (con)
136 print ("~~~~~~~~~这是第五个分隔线~~~~~~~~~")
137 
138 # 6、div[@id='id2']节点下某节点的属性值
139 content = html_data.xpath("//div[@id = 'id2']/ul/li[1]/div[2]/a/@href")
140 for con in content:
141     print (con)
142 print ("~~~~~~~~~这是第六个分隔线~~~~~~~~~")
143 
144 # 7、div[@id='id2']节点下某节点的内容
145 content = html_data.xpath("//div[@id= 'id2']/ul/li[1]/div[2]/a/text()")
146 for con in content:
147     print (con)
148 print ("~~~~~~~~~这是第七个分隔线~~~~~~~~~")
149     
150 # 8、用'*'来匹配任何元素
151 content = html_data.xpath("*//div[@id = 'id2']/ul/li[1]/div[2]/a/text()")
152 for con in content:
153     print (con)
154 print ("~~~~~~~~~这是第八个分隔线~~~~~~~~~")   
155 
156 # 9、选取多个节点
157 nodes = html_data.xpath("//i|//span")
158 for i in range(len(nodes)):
159     print (nodes[i].text)
160 print ("~~~~~~~~~这是第九个分隔线~~~~~~~~~")     
161 
162 # 10、选取所有li节点
163 nodes = html_data.xpath("//li")
164 for i in range(len(nodes)):
165     content = nodes[i].xpath("div/@class") # li节点下所有div节点的class属性值
166     print (i,'='*5)
167     for con in content:
168         print (con)
169 print ("~~~~~~~~~这是第十个分隔线~~~~~~~~~") 
170 
171 # 11、选取所有li节点
172 nodes = html_data.xpath("//li")
173 for i in range(len(nodes)):
174     content = nodes[i].xpath("div[last()]/@class")  # li节点下最后一个div节点的class属性值
175     print (i, '='*5)
176     for con in content:
177         print (con)
178 print ("~~~~~~~~~这是第十一个分隔线~~~~~~~~~") 
179 
180 # 12、这里应用了'..'和'@',其中'..'表示父节点,具体就是上一步(title)的父节点head;'@'表示属性,就是它后面接是属性名,在这里的意思就是属性href的内容
181 content = html_data.xpath("/html/head/title/../script/@src")
182 for con in content:
183     print (con)
184 print ("~~~~~~~~~这是第十二个分隔线~~~~~~~~~")
185 
186 # 13、div[@class='cls_3']的子节点span的兄弟节点i
187 nodes = html_data.xpath("//div[@class = 'cls_3']/span/following-sibling::i")
188 for i in range(len(nodes)):
189     content = nodes[i].xpath("./text()")  # 当前节点内容
190     for con in content:
191         print (con)
192 print ("~~~~~~~~~这是第十三个分隔线~~~~~~~~~")
193 
194 # 14、li[@class='cls_li1']后代节点里第一个div的class属性值
195 content = html_data.xpath("//li[@class = 'cls_li1']/descendant::div[1]/@class")
196 for con in content:
197     print (con)
198 print ("~~~~~~~~~这是第十四个分隔线~~~~~~~~~")
199 
200 
201 # 15、li[@class='cls_li1']后代节点里span的内容
202 content = html_data.xpath("//li[@class = 'cls_li1']/descendant::span/text()")
203 for con in content:
204     print (con)
205 print ("~~~~~~~~~这是第十五个分隔线~~~~~~~~~")
206 
207 # 16、用'*'来匹配任何元素,且不包含class属性的div节点
208 content = html_data.xpath("*//div[@id = 'id2']/ul/li[1]/div[not(@class)]/a/text()")
209 for con in content:
210     print (con)
211 print ("~~~~~~~~~这是第十六个分隔线~~~~~~~~~")   
212 
213 # 17、多个条件的情况
214 content = html_data.xpath("//div[@id= 'id2' and @class= 'class2']/ul/li[1]/div[1]/span/text()")
215 for con in content:
216     print (con)
217 print ("~~~~~~~~~这是第十七个分隔线~~~~~~~~~") 
218 
219 # 18、contains 包含的情况
220 content = html_data.xpath("//div[contains(@class,'class2')]/ul/li[2]/div[2]/a/@href")
221 for con in content:
222     print (con)
223 print ("~~~~~~~~~这是第十八个分隔线~~~~~~~~~") 
224 
输出如下:

哈哈测试一下
~~~~~~~~~这是第一个分隔线~~~~~~~~~
哈哈测试一下
~~~~~~~~~这是第二个分隔线~~~~~~~~~
哈哈测试一下
~~~~~~~~~这是第三个分隔线~~~~~~~~~
id2
id3
~~~~~~~~~这是第四个分隔线~~~~~~~~~
a_1.html
a_2.html
a_3.html
~~~~~~~~~这是第五个分隔线~~~~~~~~~
a_1.html
a_2.html
a_3.html
~~~~~~~~~这是第六个分隔线~~~~~~~~~
a_1
a_2
a_3
~~~~~~~~~这是第七个分隔线~~~~~~~~~
a_1
a_2
a_3
~~~~~~~~~这是第八个分隔线~~~~~~~~~
span_text1
span_text2
text_1
span_text3
span_text4
text_2
text_22
span_text5
span_text6
text_3
span_text7
span_text8
text_4
~~~~~~~~~这是第九个分隔线~~~~~~~~~
0 =====
cls_3
cls_4
1 =====
cls_3
cls_4
2 =====
cls_5
cls_6
3 =====
cls_5
cls_6
~~~~~~~~~这是第十个分隔线~~~~~~~~~
0 =====
cls_4
1 =====
cls_4
2 =====
cls_6
3 =====
cls_6
~~~~~~~~~这是第十一个分隔线~~~~~~~~~
haha.js
haha1.js
haha2.js
~~~~~~~~~这是第十二个分隔线~~~~~~~~~
text_1
text_2
text_22
~~~~~~~~~这是第十三个分隔线~~~~~~~~~
cls_3
cls_3
~~~~~~~~~这是第十四个分隔线~~~~~~~~~
span_text1
span_text2
span_text3
span_text4
~~~~~~~~~这是第十五个分隔线~~~~~~~~~
a_1
a_2
a_3
~~~~~~~~~这是第十六个分隔线~~~~~~~~~
span_text1
span_text2
~~~~~~~~~这是第十七个分隔线~~~~~~~~~
a_4.html
a_5.html
a_6.html
~~~~~~~~~这是第十八个分隔线~~~~~~~~~
原文地址:https://www.cnblogs.com/aziji/p/9674315.html