Get All the Links from a Specified Web Page

A small project on GitHub that uses requests and bs4 to fetch all the links on a specified web page:

"""
    一个小程序,获取指定网页上的所有链接
"""

import requests
from bs4 import BeautifulSoup
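
# Note: requests, beautifulsoup4, and lxml are third-party packages;
# if they are missing, `pip install requests beautifulsoup4 lxml` should pull them in.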

url = input("Please enter a URL: ") # read the URL from the terminal

if ("https" or "http") in url: # 判定一下
    webData = requests.get(url) # 获取网页响应
    # print(webData)
else:
    webData = requests.get("https://" + url)

webData.raise_for_status() # raise an exception on HTTP error codes before decoding
webData.encoding = webData.apparent_encoding # use the encoding detected from the content
# webData.encoding = 'utf-8'
# print(webData.text)
htmlData = webData.text
# parse the page
# soup = BeautifulSoup(htmlData, 'html.parser') # built-in parser alternative
soup = BeautifulSoup(htmlData, 'lxml')
# print(soup)

# collect all the links on the page
allLinksFromPage = []
links = soup.find_all('a')
# print(links)
for link in links:
    getLink = link.get('href')
    if getLink: # skip <a> tags that have no href attribute
        allLinksFromPage.append(getLink)

# print(allLinksFromPage)

# save the results (the with block closes the file automatically)
with open('myLinks.txt', 'w') as saved:
    print(allLinksFromPage[0:10], file=saved) # save the first ten links
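
One caveat: href values are often relative paths (such as /about), so the saved entries may not be directly usable as URLs. A minimal sketch of how the collection loop could resolve them into absolute URLs with urllib.parse.urljoin — this is an extension, not part of the original script:

from urllib.parse import urljoin

allLinksFromPage = []
for link in links:
    getLink = link.get('href')
    if getLink:
        # webData.url is the final URL after any redirects,
        # so relative paths resolve against the page actually fetched
        allLinksFromPage.append(urljoin(webData.url, getLink))

Also note that print() writes the Python list representation into the file; saved.write('\n'.join(allLinksFromPage[0:10])) would instead give one link per line.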

  

Original post: https://www.cnblogs.com/mafu/p/15419691.html