爬虫基础知识

请求方式

# Send a POST request with URL-encoded form data.
import urllib.parse
import urllib.request

# urlopen() requires the request body as bytes, so encode the form string.
data = urllib.parse.urlencode({"hello": "world"}).encode("utf-8")
# Passing `data` makes urlopen() issue a POST. This is how a username/password
# login form is simulated; a real login additionally needs cookies.
# The `with` block closes the underlying connection once the body is read.
with urllib.request.urlopen("https://httpbin.org/post", data=data) as response:
    print(response.read().decode("utf-8"))

# Send a GET request.
import urllib.request

# With no `data` argument urlopen() issues a plain GET.
# The `with` block closes the underlying connection once the body is read.
with urllib.request.urlopen("https://httpbin.org/get") as response:
    print(response.read().decode("utf-8"))

超时处理

# Timeout handling.
import socket
import urllib.error
import urllib.request

try:
    # Give up if no result arrives within 1 second.
    with urllib.request.urlopen("https://httpbin.org/get", timeout=1) as response:
        print(response.read().decode("utf-8"))
except urllib.error.URLError as e:
    # URLError covers every request failure (DNS errors, refused connections,
    # HTTP error statuses), so only report a timeout when the underlying
    # reason really is one.
    if isinstance(e.reason, socket.timeout):
        print("time out!")
    else:
        print("request failed:", e.reason)
except socket.timeout:
    # A timeout while reading the response is raised directly rather than
    # being wrapped in URLError.
    print("time out!")

爬取豆瓣电影信息

import urllib.request

url = "http://www.douban.com"
# Send a browser User-Agent so the request looks like it comes from a browser.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"}
# Bundle the URL and the custom headers into a single Request object.
req = urllib.request.Request(url=url, headers=headers)
# The `with` block closes the underlying connection once the body is read.
with urllib.request.urlopen(req) as response:
    print(response.read().decode("utf-8"))