Urllib库

urllib是一个包,这个包收集了几个用于处理URLs的模块

urllib.request      用于打开和读取URLs
urllib.error        包含urllib.request抛出的异常
urllib.parse        用于分析URLs
urllib.robotparser  用于分析robots.txt格式的文件

 URLOPEN练习

import urllib.request

# Fetch the Baidu home page and print it as text. The `with` block closes
# the HTTP response (and its socket) once the body has been read.
with urllib.request.urlopen("http://www.baidu.com") as response:
    print(response.read().decode("utf-8"))
第一个爬虫
import urllib.parse
import urllib.request

# Supplying `data` turns the request into a POST. The body must be bytes,
# so the form fields are URL-encoded first and then encoded as UTF-8.
data = urllib.parse.urlencode({"word": "hello"}).encode("utf8")
with urllib.request.urlopen("http://httpbin.org/post", data=data) as response:
    print(response.read())
POST请求
import urllib.request

# `timeout` is in seconds: give up if the server does not answer within 1s.
# The `with` block closes the connection once the body has been read.
with urllib.request.urlopen("http://httpbin.org/get", timeout=1) as response:
    print(response.read())
简单超时
import socket
import urllib.error
import urllib.request

try:
    # A 0.1s timeout is deliberately too short, to provoke a timeout.
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.1)
except socket.timeout:
    # A timeout while waiting for the response is raised directly,
    # not wrapped in URLError.
    print("TIME OUT")
except urllib.error.URLError as e:
    # A timeout while connecting arrives wrapped: the original exception
    # is stored in `e.reason`.
    if isinstance(e.reason, socket.timeout):
        print("TIME OUT")
    else:
        # Report other failures instead of silently ignoring them.
        print(e.reason)
else:
    # The request unexpectedly succeeded; release the connection.
    response.close()
简单的异常

 响应练习

import urllib.request

# Show which class urlopen returns; the `with` block closes the response.
with urllib.request.urlopen("https://www.python.org") as response:
    print(type(response))
响应类型
import urllib.request

# Inspect the response metadata, then let the `with` block close it.
with urllib.request.urlopen("https://www.python.org") as response:
    print(response.status)               # HTTP status code
    print(response.getheaders())         # all headers as (name, value) pairs
    print(response.getheader("Server"))  # a single header looked up by name
获取状态码响应头
import urllib.request

# urlopen accepts a Request object as well as a plain URL string.
request = urllib.request.Request("https://www.python.org")
with urllib.request.urlopen(request) as response:
    print(response.read().decode("utf-8"))
得到响应内容

请求练习

import urllib.request

# Build the request explicitly, send it, and print the decoded body.
# The `with` block closes the response afterwards.
request = urllib.request.Request("https://python.org")
with urllib.request.urlopen(request) as response:
    print(response.read().decode("utf-8"))
简单请求
from urllib import parse, request

url = "http://httpbin.org/post"
headers = {
    "User-Agent":"Mozilla/4.0(compatible;MSIE 5.5;Windows NT)",
    "Host":"httpbin.org"
}
# Named `form_fields` rather than `dict` so the builtin is not shadowed.
form_fields = {
    "name":"Germey"
}
# The POST body must be bytes: URL-encode the fields, then encode as UTF-8.
data = parse.urlencode(form_fields).encode("utf8")
req = request.Request(url=url, data=data, headers=headers, method="POST")
# The `with` block closes the response once the body has been printed.
with request.urlopen(req) as response:
    print(response.read().decode("utf-8"))
headers
from urllib import parse, request

url = "http://httpbin.org/post"
# Named `form_fields` rather than `dict` so the builtin is not shadowed.
form_fields = {
    "name":"Germey"
}
# The POST body must be bytes: URL-encode the fields, then encode as UTF-8.
data = parse.urlencode(form_fields).encode("utf8")
req = request.Request(url=url, data=data, method="POST")
# Headers can also be attached one at a time after the Request is built.
req.add_header("User-Agent","Mozilla/4.0(compatible;MSIE 5.5;Windows NT)")
# The `with` block closes the response once the body has been printed.
with request.urlopen(req) as response:
    print(response.read().decode("utf-8"))
add_header方法

代理HANDLER

import urllib.request

# Map each URL scheme to the proxy that should carry requests of that scheme.
# NOTE(review): both entries point at a local proxy on port 9743; confirm that
# one is running before executing this snippet.
proxy_handler = urllib.request.ProxyHandler({
    "http":"http://127.0.0.1:9743",
    "https":"https://127.0.0.1:9743"
})
# An opener built with the handler sends its requests through the proxy.
opener = urllib.request.build_opener(proxy_handler)
# The `with` block closes the response once the body has been printed.
with opener.open("http://www.douyu.com") as response:
    print(response.read())
代理

cookie

import http.cookiejar
import urllib.request

# The cookie processor stores every cookie the server sets into `cookie`.
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# Only the cookies are of interest, so the response is closed immediately.
opener.open("http://www.baidu.com").close()
for item in cookie:
    # An f-string is used because a cookie's value may be None, which
    # would make string concatenation raise TypeError.
    print(f"{item.name}={item.value}")
cookie

 

异常处理

from urllib import error, request

try:
    # Only the failure path is demonstrated; if the request does succeed,
    # close the response straight away so the connection is not leaked.
    request.urlopen("http://chuiqingcai.com/index.htm").close()
except error.URLError as e:
    # `reason` describes why the request could not be completed.
    print(e.reason)
异常1

URL解析

1.URLPARSE

 urllib.parse.urlparse(urlstring,scheme="",allow_fragments=True)

from urllib.parse import urlparse

# Split a complete URL into scheme, netloc, path, params, query and fragment.
sample_url = "http://www.baidu.com/index.html;user?id=5#comment"
result = urlparse(sample_url)
result_type = type(result)
print(result_type, result)
分割url
from urllib.parse import urlparse

# The URL carries no scheme of its own, so the `scheme` argument is used.
# With no leading "//", the host name ends up in `path` and `netloc` is empty.
schemeless_url = "www.baidu.com/index.html;user?id=5#comment"
result = urlparse(schemeless_url, scheme="https")
print(result)
自动填充协议类型(默认如果有则不改变协议类型)

2.URLUNPARSE

from urllib.parse import urlunparse

# Components in the order urlunparse expects:
# scheme, netloc, path, params, query, fragment.
data = [
    "http",
    "www.baidu.com",
    "index.html",
    "user",
    "a=6",
    "comment",
]
joined_url = urlunparse(data)
print(joined_url)
拼接

3.URLJOIN

4.URLENCODE

原文地址:https://www.cnblogs.com/cangshuchirou/p/9726499.html