python解析html文件,提取标签中一个元素

1、对于本地html文件
# -*- coding: utf-8 -*-
# Parse a local HTML file with BeautifulSoup, collect the href of every <a>
# matched by a CSS selector, and write the collected values to a text file.
from bs4 import BeautifulSoup

# Read and parse the document. The `with` statement closes the file on exit,
# so no explicit close() call is needed.
with open('test.html', 'r', encoding='utf-8') as wb_data:
    soup = BeautifulSoup(wb_data, 'lxml')  # hand the open file to the parser

print(soup)  # dump the parsed document
print("!-------------- ")

# CSS path of the target elements; it can be copied straight from the
# browser's developer tools. An alternative path for a list-style layout
# would be: 'body > div > div > div > ol > li > a'
shot_name = soup.select('body > div > div > table > tbody > tr > td > a')
print(shot_name, sep=' !!--------------- ')  # show the matched tags

# Extract attribute values: get_text() returns a tag's text content, while
# get('<attr>') returns an attribute value (None when the attribute is absent).
hrefs = []
for shot in shot_name:
    href = shot.get('href')
    if href is None:
        # An <a> without an href has nothing to record; skipping it avoids
        # calling .strip() on None.
        continue
    hrefs.append(href.strip('/'))

# Write every href followed by a single space. The explicit encoding keeps the
# output independent of the platform's default locale.
with open('shot_names.txt', 'w', encoding='utf-8') as f:
    f.write(''.join(href + ' ' for href in hrefs))

2、对于网页

 # -*- coding: utf-8 -*-
# Download a web page and pull out groups of links with CSS selectors.
from bs4 import BeautifulSoup
import requests


url = 'https://hao.360.cn/?a1004'
# A timeout keeps the script from blocking forever on an unresponsive server.
wb_data = requests.get(url, timeout=10)
soup = BeautifulSoup(wb_data.text, 'lxml')  # parse the response body

# The selectors below were derived from CSS paths copied from the browser,
# with the ":nth-child(n)" part removed so that every <li> is matched.
# Example of a copied path:
#   #famous-section > ul.list.first.gclearfix > li:nth-child(7) > a
url_famous = soup.select('#famous-section > ul.list.first.gclearfix > li > a')
# Copied path: #famous-section > ul.list.last.gclearfix > li:nth-child(1) > a
# extend() keeps url_famous a flat list of tags; append() would have nested
# the whole second result list as one extra element.
url_famous.extend(
    soup.select('#famous-section > ul.list.last.gclearfix > li > a')
)
print(url_famous)

# Copied path: #focus_news > ul > li:nth-child(1) > a
url_focus = soup.select('#focus_news > ul > li > a')
print(url_focus)
原文地址:https://www.cnblogs.com/yml6/p/7595270.html