Using the requests module

The requests module

  • What is the requests module: requests is a third-party Python library for sending HTTP requests; it lets you simulate a browser issuing requests (it must be installed first, as shown in the sketch right after this item).
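
Since requests is not part of the standard library, it has to be installed before any of the examples below will run. A minimal sketch (the exact install command depends on your environment):

# install with: pip install requests
import requests
print(requests.__version__)  # prints the installed version if the install worked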

requests: GET request

# GET request
import requests
# specify the URL
url = 'https://www.sogou.com/'

# send the GET request: get() returns a response object
response = requests.get(url)
if response.status_code == 200:
    with open('sogou.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
else:
    print('failed to fetch the page')
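
One caveat with response.text: requests guesses the text encoding from the response headers, and the guess can be wrong for Chinese pages. A minimal sketch of overriding it, assuming the page is actually UTF-8:

import requests

response = requests.get('https://www.sogou.com/')
# override the guessed encoding before reading response.text if the output is garbled
response.encoding = 'utf-8'
print(response.text[:200])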

Common response attributes

# GET request
import requests
# specify the URL
url = 'https://www.sogou.com/'

# send the GET request: get() returns a response object
response = requests.get(url)
if response.status_code == 200:
    # print(response.text)       # the response body as text
    print(response.status_code)  # the response status code
    print(response.content)      # the page data as raw bytes
    print(response.headers)      # the response headers
    print(response.url)          # the URL that was requested
else:
    print('failed to fetch the page')
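
A few other attributes come up often as well; a quick sketch (response.json() only works when the body is valid JSON, so it is commented out here):

import requests

response = requests.get('https://www.sogou.com/')
print(response.encoding)  # encoding used to decode response.text
print(response.cookies)   # cookies the server set on this response
print(response.elapsed)   # how long the request took
# data = response.json()  # parses a JSON body into Python objects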

GET request with parameters

  • Method 1: put the parameters directly in the URL
import requests
# specify the URL; the query parameters do not need to be URL-encoded by hand
url = 'https://www.sogou.com/web?query=周杰伦&ie=utf-8'

# send the GET request: get() returns a response object
response = requests.get(url)
if response.status_code == 200:
    with open('jay.html', 'wb') as f:
        f.write(response.content)
else:
    print('failed to fetch the page')
  • Method 2: pass the parameters through the params argument
import requests
url = 'https://www.sogou.com/web'

params = {
    'query': '周杰伦',
    'ie': 'utf-8'
}
response = requests.get(url, params=params)
if response.status_code == 200:
    with open('jay.html', 'wb') as f:
        f.write(response.content)
else:
    print('failed to fetch the page')
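
With either method, requests percent-encodes the Chinese query string for you; printing response.url after the request shows the encoded form. A small sketch:

import requests

params = {'query': '周杰伦', 'ie': 'utf-8'}
response = requests.get('https://www.sogou.com/web', params=params)
# the query value comes back percent-encoded, e.g. query=%E5%91%A8%E6%9D%B0%E4%BC%A6
print(response.url)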

Custom request headers for a GET request

# custom request headers
import requests
url = 'https://www.sogou.com/web'
# put the custom headers in this dict and pass it to the headers parameter
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}
params = {
    'query': '林宥嘉',
    'ie': 'utf-8'
}
response = requests.get(url=url, params=params, headers=headers)
print(response.status_code)
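
The reason the User-Agent matters: without a custom header, requests sends its own default User-Agent, which many sites recognize as a bot and block. A quick way to see that default:

import requests

# the default User-Agent looks like "python-requests/<version>"
print(requests.utils.default_headers()['User-Agent'])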

requests: POST request

# POST request
import requests

# specify the URL
url = 'https://github.com/session'
data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': 'IRdX8jflo9hKJAZ9mOzQBNnVnOFD7z9MfKvSYCOvrVN4uWz/LDQ81b6wWWy4d8YrvYobfiuLYS92zoK6XgH/LQ==',
    'login': '1032298871@qq.com',
    'password': '09212427zlh'
}
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}
# post() sends the form data in the request body
response = requests.post(url=url, data=data, headers=headers)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
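
Note that the authenticity_token above is a one-time CSRF token: a value copied from the browser quickly goes stale, so in practice it has to be pulled from the login page within the same session before posting. A rough sketch using a regular expression; the field names and the regex are assumptions about the current login form markup:

import re
import requests

headers = {'User-Agent': 'Mozilla/5.0'}
session = requests.Session()
# 1. fetch the login page with the same session so its cookies are kept
login_page = session.get('https://github.com/login', headers=headers)
# 2. pull the fresh CSRF token out of the form (the pattern is an assumption)
token = re.search(r'name="authenticity_token" value="([^"]+)"',
                  login_page.text).group(1)
# 3. post the form with the fresh token instead of a hard-coded one
data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': token,
    'login': 'your_account',
    'password': 'your_password',
}
response = session.post('https://github.com/session', data=data, headers=headers)
print(response.status_code)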

AJAX GET request with the requests module

# GET request to an AJAX endpoint
import requests
url = 'https://movie.douban.com/j/new_search_subjects?'
params = {
    'sort': 'U',
    'range': '0,10',
    'tags': '电影',
    'start': '40'
}
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}
# for a GET request the query parameters go in params, not data
response = requests.get(url=url, params=params, headers=headers)
# the AJAX endpoint returns a JSON string
print(response.text)
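
Because the body is a JSON string, it can also be parsed directly with response.json() instead of being printed as raw text. A small sketch reusing the same request; the structure of the parsed data is whatever the Douban endpoint returns, not something fixed by requests:

import requests

url = 'https://movie.douban.com/j/new_search_subjects'
params = {'sort': 'U', 'range': '0,10', 'tags': '电影', 'start': '40'}
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url=url, params=params, headers=headers)
movies = response.json()  # decodes the JSON body into Python dicts/lists
print(type(movies))
print(movies)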

AJAX POST request with the requests module

# POST request to an AJAX endpoint
import requests
import json
url = 'https://fanyi.baidu.com/sug'
data = {
    'kw': '西瓜'
}
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}
response = requests.post(url=url, headers=headers, data=data)
# the response body is a JSON string; parse it into a Python object
json_text = response.text
json_data = json.loads(json_text)
print(json_data)

Crawling multiple pages of data

# crawl paginated data
import requests
import os

if not os.path.exists('./page'):
    os.mkdir('./page')

url = 'https://zhihu.sogou.com/zhihu?'
word = input('What do you want to search for? ')
page_number = input('How many pages do you want to fetch? ')
for page in range(1, int(page_number) + 1):
    print(page)
    params = {
        'query': word,
        'sut': '13598',
        'lkt': '1,1546144033954,1546144033954',
        'sst0': '1546144034930',
        'page': page,
        'ie': 'utf8'
    }
    response = requests.get(url=url, params=params)
    page_text = response.text
    page_file = './page/%s%s.html' % (word, page)
    with open(page_file, 'w', encoding='utf-8') as f:
        f.write(page_text)

Advanced requests usage:

Purpose of cookies: the server uses cookies to record the client's state information, so later requests from the same client can be recognized.
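
A quick way to watch a session carry cookies between requests, separate from any real login, is httpbin (a minimal sketch; the Douban login example follows after it):

import requests

session = requests.Session()
# the first request asks the server to set a cookie; the session stores it
session.get('https://httpbin.org/cookies/set?name=value')
# the second request automatically sends the stored cookie back
print(session.get('https://httpbin.org/cookies').text)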

import requests

session = requests.Session()
# 1. send the login request: the cookies it returns are stored in the session object
login_url = 'https://accounts.douban.com/login'
data = {
    "source": "None",
    "redir": "https://www.douban.com/people/185687620/",
    "form_email": "15027900535",
    "form_password": "bobo@15027900535",
    "login": "登录",
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }
# use the session to send the POST request
login_response = session.post(url=login_url, data=data, headers=headers)

# 2. request the personal homepage with the same session (it carries the cookies)
#    and get the response page data
url = 'https://www.douban.com/people/185687620/'
response = session.get(url=url, headers=headers)
page_text = response.text

with open('./douban110.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)

Using an IP proxy with requests

# use a proxy IP to search Baidu for "ip"
import requests

url = 'http://www.baidu.com/s?ie=UTF-8&wd=ip'

# the proxy is passed as a dict: the key is the protocol, the value is ip:port
proxy = {
    'http': '115.28.209.249:3128'
}
response = requests.get(url=url, proxies=proxy)
with open('daili.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
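
Free proxy addresses die quickly, so it can help to map both schemes and add a timeout so a dead proxy fails fast. A sketch; the address is just a placeholder:

import requests

proxies = {
    'http': 'http://115.28.209.249:3128',   # placeholder proxy address
    'https': 'http://115.28.209.249:3128',  # reuse it for https URLs too
}
try:
    response = requests.get('http://www.baidu.com/s?ie=UTF-8&wd=ip',
                            proxies=proxies, timeout=5)
    print(response.status_code)
except requests.exceptions.RequestException as e:
    print('proxy request failed:', e)
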
Original article: https://www.cnblogs.com/wualin/p/10202916.html