Crawler Example

# coding=utf8

import requests
import re
# Goal: follow every article link on the Sohu homepage and save each page that contains the word "篮球" (basketball)

# Steps:
# 1. Fetch the homepage source
html = requests.get("http://www.sohu.com")
# print(html.text)  # inspect the page source
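
# (Optional sketch) Two standard requests features make the fetch sturdier;
# shown commented out so the original flow is unchanged:
# html = requests.get("http://www.sohu.com", timeout=10)  # don't wait forever on a slow server
# html.raise_for_status()  # fail early if the server answered with an HTTP error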

# 2. Extract every URL on the page
links = re.findall(r'href="(.*?)"', html.text)  # non-greedy match inside each href="..."
# for link in links:
#     print(link)
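
# (Optional sketch, assumes the third-party beautifulsoup4 package) An HTML
# parser is more forgiving than the regex above, e.g. with single-quoted or
# unquoted href attributes:
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(html.text, "html.parser")
# links = [a["href"] for a in soup.find_all("a", href=True)]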
# 3. Filter the URLs: keep only page URLs, dropping images, CSS, JS, and the like
valid_link = []
for link in links:
    if "sohu" not in link:
        continue
    if re.search(r'jpg|png|gif|css|js|ico|tif|mailto', link):  # skip static assets and mailto links
        continue
    if link.startswith("//"):  # protocol-relative URL, so prepend a scheme
        valid_link.append("http:" + link.strip())
        # print("http:" + link.strip())
    else:
        valid_link.append(link.strip())
#         print(link)
# print(len(valid_link))
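
# (Optional sketch) The standard library's urljoin does the same scheme fix-up
# as the startswith("//") branch above, and also resolves relative paths:
# from urllib.parse import urljoin
# absolute = urljoin("http://www.sohu.com", link.strip())
# De-duplicating avoids downloading the same page twice in step 4:
# valid_link = list(dict.fromkeys(valid_link))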

# 4. Check each page for the word "篮球" and save the matches
no = 0
for link in valid_link:
    try:
        r = requests.get(link, timeout=10)  # a timeout keeps one dead host from hanging the crawl
    except requests.RequestException:
        continue  # skip links that fail to load
    r.encoding = r.apparent_encoding  # guard against mojibake when the server omits a charset
    if "篮球" in r.text:
        # raw string so the backslashes in this Windows path are not read as escapes
        with open(r"F:\workspace\API_test\Crawlers\links\%s.html" % no, "w", encoding="utf-8") as fp:
            fp.write(r.text)
            no += 1
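
# The matching pages land in the links folder as 0.html, 1.html, and so on.
# (Optional sketch) For a bigger crawl, pausing between requests keeps the
# load on the server polite:
# import time
# time.sleep(1)  # at the end of each loop iteration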