# urllib库爬取实例 (urllib crawling example)

from urllib import request
import random

def spider(url):
    """Download the page at *url* and save it to a local file.

    A User-Agent header is chosen at random from a small pool of desktop
    Chrome strings and sent with the request. The response body is decoded
    as UTF-8 and written to ``"05_" + <text after the last "/" in url>``
    (a path relative to the current working directory), overwriting any
    existing file with that name.

    Args:
        url: Address of the page to fetch. The text after its final ``/``
            is used to build the output file name.

    Returns:
        None. The result of the call is the file written to disk.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the response body is not valid UTF-8.
        OSError: If the output file cannot be written.
    """
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    ]
    # Vary the User-Agent between calls so requests do not all look alike.
    user_agent = random.choice(user_agent_list)

    print(user_agent, url)

    req = request.Request(url, headers={"User-Agent": user_agent})

    # The context manager closes the response (and its connection) even if
    # reading fails; the original left it open.
    with request.urlopen(req) as response:
        raw_body = response.read()
    html = raw_body.decode("utf-8")

    # Name the output after the last path segment, e.g. ".../index.html"
    # becomes "05_index.html".
    file_name = "05_" + url.split("/")[-1]

    with open(file_name, "w", encoding="utf-8") as f:
        f.write(html)


if __name__ == "__main__":
    # Pages to download when the file is run as a script; each URL is
    # passed to spider() in the order listed.
    url_list = [
        "http://www.langlang2017.com/index.html",
        "http://www.langlang2017.com/route.html",
        "http://www.langlang2017.com/FAQ.html",
    ]
    for page_url in url_list:
        spider(page_url)
# 原文地址 (original post): https://www.cnblogs.com/zhangboblogs/p/8542059.html