Spider practice: crawling all page links from a static web page with BeautifulSoup

# Get the href attribute of every <a> tag on the Baidu homepage:

# import requests
# from bs4 import BeautifulSoup 

# # html = requests.get('http://en.wikipedia.org/wiki/Kevin_Bacon')
# html = requests.get('http://www.baidu.com')
# bs = BeautifulSoup(html.text, 'html.parser')
# for link in bs.find_all(lambda tag: 'href' in tag.attrs):
#     print(link.attrs['href'])
    

# import requests
# import re
# from bs4 import BeautifulSoup 

# # html = requests.get('http://en.wikipedia.org/wiki/Kevin_Bacon')
# html = requests.get('http://www.baidu.com')
# bs = BeautifulSoup(html.text, 'html.parser')
# for link in bs.find_all('', {'href':re.compile('.com')}):  # note: '.' matches any character; r'\.com' matches a literal dot
#     print(link.attrs['href'])


# import requests
# from bs4 import BeautifulSoup 

# html = requests.get('http://www.baidu.com')
# bs = BeautifulSoup(html.text, 'html.parser')
# for link in bs.find_all('a'):
#     if 'href' in link.attrs:
#         print(link.attrs['href'])
        

import requests
from bs4 import BeautifulSoup 

def geturl(url):
    html = requests.get(url)
    bs = BeautifulSoup(html.text, 'html.parser')
    return bs.find_all('a')

links=geturl('http://www.baidu.com')
for link in links:
    if 'href' in link.attrs:
        print(link.attrs['href'])
http://news.baidu.com
http://www.hao123.com
http://map.baidu.com
http://v.baidu.com
http://tieba.baidu.com
http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1
//www.baidu.com/more/
http://home.baidu.com
http://ir.baidu.com
http://www.baidu.com/duty/
http://jianyi.baidu.com/
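# Note that the output above also contains a protocol-relative link ('//www.baidu.com/more/'),
# which the later 'http://' regex filters would drop. A minimal sketch of turning such links into
# absolute URLs with the standard library's urllib.parse.urljoin (base_url and the sample hrefs
# below are illustrative, not taken from the crawl itself):
from urllib.parse import urljoin

base_url = 'http://www.baidu.com'
for href in ['//www.baidu.com/more/', '/duty/', 'http://news.baidu.com']:
    print(urljoin(base_url, href))
# http://www.baidu.com/more/
# http://www.baidu.com/duty/
# http://news.baidu.com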

import requests
import re
from bs4 import BeautifulSoup 

def geturl(url):
    html = requests.get(url)
    bs = BeautifulSoup(html.text, 'html.parser')
    # 'http://' is unanchored here, so it matches any href containing that substring
    return bs.find_all('', {'href':re.compile('http://')})

# links=geturl('http://www.baidu.com')
# print(links)
# links_in_news=set(geturl(links[1].attrs['href']))  # set(list) removes duplicates
# for link in links_in_news:
#    print(link.attrs['href'])

print('-------------------------------------------------------------------------')
    
links=geturl('http://www.baidu.com')
for link in links:
    if '//news.' in link.attrs['href']:
        links_in_news=set(geturl(link.attrs['href']))  # set(list) removes duplicates by turning the list into a set
        for link in links_in_news:
            print(link.attrs['href'])
        break
-------------------------------------------------------------------------
http://baijiahao.baidu.com/s?id=1670182176542294758
http://baijiahao.baidu.com/s?id=1670237336710694101
http://baijiahao.baidu.com/s?id=1670287125142703268
http://baijiahao.baidu.com/s?id=1670255408896313915
http://baijiahao.baidu.com/s?id=1670191066909619203
http://v.baidu.com/
http://baijiahao.baidu.com/s?id=1670253988609166598
 http://news.cctv.com/2020/06/23/ARTIHsG0yhCaD2YJUSFy7Qwt200623.shtml
http://baijiahao.baidu.com/s?id=1670286769270600802
http://news.cctv.com/2020/06/23/ARTIpnapIHyb413WeY46ShDy200623.shtml
http://m.top.cnr.cn/bdxw/20200623/t20200623_525141426.html
http://world.people.com.cn/n1/2020/0623/c1002-31756267.html
http://m.news.cctv.com/2020/06/23/ARTIDAQdwzQFMOkbW2Z0ehEk200623.shtml
http://baijiahao.baidu.com/s?id=1670245143050480742
http://m.news.cctv.com/2020/06/18/ARTIYNwiYAjjHBmGeAXpERs3200618.shtml
http://m.xinhuanet.com/yn/2020-06/23/c_139161263.htm
http://baijiahao.baidu.com/s?id=1670194818426496533
http://baijiahao.baidu.com/s?id=1670232858345398185
http://www.xinhuanet.com/2020-06/23/c_1126147531.htm
http://baijiahao.baidu.com/s?id=1670251112933488182
http://baijiahao.baidu.com/s?id=1670254276238905964
http://baijiahao.baidu.com/s?id=1670255017218969710
http://music.baidu.com/
http://m.top.cnr.cn/bdxw/20200623/t20200623_525141422.html
http://app.cctv.com/special/cportal/detail/arti/index.html?id=Arti8bFV6wkTJPYEkaZYVvoC200622&fromapp=cctvnews&version=805&allow_comment=1&allow_comment=1
http://map.baidu.com/
http://baijiahao.baidu.com/s?id=1670243226621040644
http://baijiahao.baidu.com/s?id=1670254944449236682
http://net.china.cn/chinese/index.htm
http://baijiahao.baidu.com/s?id=1670250874637091231
http://baijiahao.baidu.com/s?id=1670232858345398185
http://baijiahao.baidu.com/s?id=1670289098569528699
http://baijiahao.baidu.com/s?id=1670247580845339645
http://baijiahao.baidu.com/s?id=1670254849012760202
http://m.top.cnr.cn/bdxw/20200623/t20200623_525141424.html
http://baijiahao.baidu.com/s?id=1670246144336669257
http://baijiahao.baidu.com/s?id=1670254276238905964
http://app.cctv.com/special/cportal/detail/arti/index.html?id=ArtiLXGGutc9OLD23xo3Y3dN200622&fromapp=cctvnews&version=805&allow_comment=1&allow_comment=1
http://www.qstheory.cn/zt2019/llxjj/index.htm
http://www.cyberpolice.cn/wfjb/
http://baijiahao.baidu.com/s?id=1670250874637091231
http://baijiahao.baidu.com/s?id=1670239896280719334
http://baijiahao.baidu.com/s?id=1670248053773599893
http://image.baidu.com/
http://baijiahao.baidu.com/s?id=1670243226621040644
http://news.baidu.com/
http://tieba.baidu.com/
http://wenku.baidu.com/
http://report.12377.cn:13225/toreportinputNormal_anis.do
http://www.xinhuanet.com/politics/2020-06/23/c_1126149333.htm
http://app.cctv.com/special/cportal/detail/arti/index.html?id=ArtiA1FM8grjZNDdJ15XVvv8200623&fromapp=cctvnews&version=727
http://downpack.baidu.com/baidunews_AndroidPhone_1014720b.apk
http://www.bjjubao.org/
http://www.qstheory.cn/zt2017/xcgcdd19djs/index.htm
li=[1,2,2,3,4,3,6,4,3]
s=set(li)  # set(list) removes duplicates by converting the list to a set
print(s)
{1, 2, 3, 4, 6}
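# Note: set() removes duplicates but does not preserve the original order. If the first-seen order
# of links matters, dict.fromkeys (insertion-ordered since Python 3.7) is a small alternative sketch:
li=[1,2,2,3,4,3,6,4,3]
print(list(dict.fromkeys(li)))  # [1, 2, 3, 4, 6]  duplicates removed, first-seen order kept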
# Recursive crawl: fetch every link, then the links on each linked page, and so on:
import requests
import re
from requests import exceptions
from bs4 import BeautifulSoup 

pages=set()
def geturl(url):
    global pages
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    # catch connection errors so one bad link doesn't abort the whole crawl
    try:
        html = requests.get(url,headers=headers)
    except exceptions.ConnectionError:
        # print(url)
        print("Connection error")
    else:
        bs = BeautifulSoup(html.text, 'html.parser')
        links=set(bs.find_all('', {'href':re.compile('^(http://)|^(https://)')}))  # set() removes duplicate tags
        if links:
            for link in links:
                if link.attrs['href'] not in pages:  # skip URLs we have already visited
                    pages.add(link.attrs['href'])
                    print(link.attrs['href'])
                    geturl(link.attrs['href'])  # recurse
        else:
            print("Finished crawling!")
home_link='http://www.baidu.com'
geturl(home_link)
print('end....')

https://wenku.baidu.com
https://www.baidu.com/cache/icon/favicon.ico
Finished crawling!
https://www.baidu.com/cache/icon/favicon.svg
Finished crawling!
https://jingyan.baidu.com
https://passport.baidu.com/v2/?reg&tpl=exp&u=http%3A%2F%2Fjingyan.baidu.com%2F
https://www.baidu.com/favicon.ico
Finished crawling!
https://www.baidu.com/img/baidu.svg
Finished crawling!
https://passport.baidu.com/v2/?ucenterfeedback#reg
http://www.baidu.com/
https://passport.baidu.com/v2/?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2F&sms=5
https://passport.baidu.com/export/app/index.html
https://downpack.baidu.com/ppSecurityCenter_AndroidPhone_passport.apk
Finished crawling!
https://itunes.apple.com/cn/app/bai-du-quan-zhong-xin-shou/id695439229
https://www.apple.com.cn/iphone/
https://www.apple.com/kw/iphone/
https://www.apple.com/lae/iphone/
https://www.apple.com/gn/iphone/
https://support.apple.com/fr-gn
https://support.apple.com/ko-kr
https://support.apple.com/en-al
https://support.apple.com/fr-sn
https://support.apple.com/ru-ru
https://www.apple.com/ru/
https://www.apple.com/kr/
https://www.apple.com/la/
--- several hundred lines removed here for readability ---

KeyboardInterrupt: 
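# The crawl above only stops when interrupted by hand (hence the KeyboardInterrupt), and very deep
# recursion can also hit Python's default recursion limit (about 1000 frames). A minimal sketch of
# the same recursive idea with an explicit page budget; max_pages, the timeout and the simplified
# User-Agent are illustrative assumptions, not part of the original code:
import re
import requests
from requests import exceptions
from bs4 import BeautifulSoup

pages = set()
def geturl(url, max_pages=50):
    global pages
    if len(pages) >= max_pages:               # stop once the budget is spent
        return
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        html = requests.get(url, headers=headers, timeout=10)
    except exceptions.RequestException:       # base class of all requests errors
        print('Request failed:', url)
        return
    bs = BeautifulSoup(html.text, 'html.parser')
    for link in bs.find_all('', {'href': re.compile('^(http://)|^(https://)')}):
        href = link.attrs['href']
        if href not in pages and len(pages) < max_pages:
            pages.add(href)
            print(href)
            geturl(href, max_pages)           # recurse, still bounded by max_pages

geturl('http://www.baidu.com')
print('end....')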
# Enhance the code above: crawling nothing but links is rarely useful by itself, so let's add a few features:
# 1) the title of every linked page
      # h1-->span
# 2) the first paragraph of text
      # div#mw-content-text-->p
# 3) the edit link
      # li#ca-edit-->span-->a

import requests
from bs4 import BeautifulSoup
from requests import exceptions
import re


pages = set()
def geturl(url):
    global pages
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    # The request and the parsing both go inside the try block so the network exceptions
    # below can actually be caught; on a failed request we return early.
    try:
        html = requests.get(url,headers=headers)
        bs = BeautifulSoup(html.text, 'html.parser')
        print(bs.h1)
#         print(bs.find(id ='mw-content-text').find_all('p')[0])
#         print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing something! Continuing.')
    except exceptions.ConnectionError:
        print("Connection error")
        return
    except exceptions.HTTPError:
        print('HTTP error')
        return
    except exceptions.RetryError:      # requests raises RetryError when retries are exhausted
        print("Retries exhausted")
        return
    except exceptions.TooManyRedirects:
        print("Too many redirects")
        return
    except exceptions.ConnectTimeout:
        print("Timed out connecting to the remote server")
        return
    except exceptions.Timeout:
        print("Request timed out")
        return
    
    links=set(bs.find_all('', {'href':re.compile('^(http://)|^(https://)')}))  # set() removes duplicate tags
    if links:
        for link in links:
            if 'href' in link.attrs:
                if link.attrs['href'] not in pages:
                    newPage = link.attrs['href']
                    print('-'*20)
                    print(newPage)
                    pages.add(newPage)
                    geturl(newPage)
    else:
        print("已爬完!")

# home_link='https://baike.baidu.com/'
home_link='https://baike.hk.xileso.top/wiki/Wikipedia:首页'
geturl(home_link)
print('end....')
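# The enhanced crawler above is still recursive and unbounded, so a long crawl will eventually hit
# Python's recursion limit. The same crawl can be written iteratively with a collections.deque used
# as a FIFO queue (breadth-first traversal). This is a minimal sketch under the same assumptions;
# crawl(), max_pages and the simplified User-Agent are illustrative names, not from the original code:
import re
import requests
from collections import deque
from requests import exceptions
from bs4 import BeautifulSoup

def crawl(start_url, max_pages=20):
    headers = {'User-Agent': 'Mozilla/5.0'}
    seen = {start_url}
    queue = deque([start_url])            # FIFO queue of URLs still to fetch
    fetched = 0
    while queue and fetched < max_pages:
        url = queue.popleft()
        fetched += 1
        try:
            html = requests.get(url, headers=headers, timeout=10)
        except exceptions.RequestException:
            print('Request failed:', url)
            continue
        bs = BeautifulSoup(html.text, 'html.parser')
        print(bs.h1)                      # page title, as in the enhanced version above
        for link in bs.find_all('', {'href': re.compile('^(http://)|^(https://)')}):
            href = link.attrs['href']
            if href not in seen:
                seen.add(href)
                queue.append(href)

crawl('http://www.baidu.com')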
# The example from the book (the site is not reachable from mainland China without a proxy, so it can't be run here)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
    global pages
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    try:
        print(bs.h1.get_text())
        print(bs.find(id ='mw-content-text').find_all('p')[0])
        print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing something! Continuing.')
    
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                newPage = link.attrs['href']
                print('-'*20)
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks('') 
Original post: https://www.cnblogs.com/Collin-pxy/p/13192388.html