# 获取网页内容生成html,并将某些标签属性进行修改 (基于python3.6)

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import urllib.request
import os

from bs4 import BeautifulSoup

# Address of the index page that get_web() crawls.
# NOTE(review): the value was missing in the source (a bare `url =` is a
# SyntaxError); replace this placeholder with the real address.
url = ''
# Prefix spliced in front of every relative link (presumably the site's base
# URL -- confirm). NOTE(review): value was missing in the source; fill it in.
Splicing = ''


def get_web(get_url):
    """Download every relative page linked from the menu of *get_url*.

    The page at ``get_url`` is fetched and decoded as UTF-8. Every
    ``<a href>`` with non-empty text inside a ``<div class="col_menu_con">``
    is collected, except links whose href contains ``http`` (treated as
    absolute). Each collected href is appended to the module-level
    ``Splicing`` prefix, downloaded, and written to the current working
    directory as ``<segment>.html``, where ``<segment>`` is the second
    ``/``-separated piece of the href (or the only piece when the href
    contains no ``/``).

    Args:
        get_url: Address of the index page to crawl.

    Raises:
        urllib.error.URLError: If the index page or a linked page cannot be
            fetched.
        UnicodeDecodeError: If the index page is not valid UTF-8.
    """
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(get_url) as page:
        html = page.read().decode("utf-8")

    # ``html`` is already decoded text, so no ``from_encoding`` hint is
    # passed: an encoding hint is only meaningful for undecoded bytes.
    url_list = BeautifulSoup(html, 'html.parser')

    all_url = []
    for list_div in url_list.find_all('div', class_='col_menu_con'):
        for a in list_div.find_all('a', href=True):
            if not a.get_text(strip=True):
                continue
            # 'http' is a substring of 'https', so one test covers both.
            # NOTE(review): this is a substring match, so a relative path
            # that merely contains "http" is skipped as well.
            if 'http' in a['href']:
                continue
            all_url.append(a['href'])

    for want_url in all_url:
        jump_url = Splicing + want_url
        name_split = want_url.split('/')
        # An href without '/' has a single segment; use it rather than
        # raising IndexError and aborting the remaining downloads.
        stem = name_split[1] if len(name_split) > 1 else name_split[0]
        file_name = stem + '.html'

        with urllib.request.urlopen(jump_url) as down_page:
            down_html = down_page.read()
        # Raw bytes are written unchanged; ``with`` guarantees the close.
        with open(file_name, "wb") as write_html:
            write_html.write(down_html)
        print(file_name + ' ' + 'done!')


def change_web(html_file):
    """Prefix relative URLs in *html_file* with ``Splicing``, in place.

    The file is read as UTF-8 and parsed with the ``lxml`` parser. For each
    of ``<a href>``, ``<link href>``, ``<script src>``, ``<form action>``,
    ``<img src>`` and ``<img original_src>``, an attribute value that does
    not contain ``http`` is replaced by ``Splicing + value``. The modified
    document is then written back to the same path as UTF-8 bytes.

    Args:
        html_file: Path of the HTML file to rewrite.
    """
    # (tag, attribute) pairs to rewrite, in the order they are processed.
    targets = (
        ('a', 'href'),
        ('link', 'href'),
        ('script', 'src'),
        ('form', 'action'),
        ('img', 'src'),
        ('img', 'original_src'),
    )

    # ``with`` closes the read handle, which was previously left open.
    with open(html_file, 'r', encoding="utf-8") as content:
        find_content = BeautifulSoup(content.read(), 'lxml')

    for tag_name, attr in targets:
        # ``attrs={attr: True}`` matches tags that carry the attribute.
        for tag in find_content.find_all(tag_name, attrs={attr: True}):
            # 'http' is a substring of 'https', so one test covers both;
            # such values are treated as already absolute and left alone.
            if 'http' in tag[attr]:
                continue
            tag[attr] = Splicing + tag[attr]

    # The soup is an object tree: it must be converted to str and encoded
    # as UTF-8 before it can be written back to the file.
    change_content = str(find_content).encode(encoding='utf-8')
    with open(html_file, "wb") as change_html:
        change_html.write(change_content)
    print(html_file + ' ' + 'changed!')


# Download the menu pages, then rewrite the links of every .html file in the
# current working directory (this includes files that were already there).
get_web(url)
file_list = os.listdir(os.getcwd())
# Compare the extension via splitext so a file named exactly ".html"
# (extension '') is not selected.
filearray = [
    fileNAME
    for fileNAME in file_list
    if os.path.splitext(fileNAME)[1] == '.html'
]
for html_name in filearray:
    change_web(html_name)

# 原文地址:https://www.cnblogs.com/setname/p/9261396.html