python爬虫--基础入门1

一个最简单的爬虫

from urllib.request import urlopen

url = 'http://www.baidu.com'

# Send the request. Use a context manager so the underlying connection is
# closed deterministically — the original never closed the response object.
with urlopen(url) as response:
    # Read the raw response body (bytes).
    info = response.read()
    # Uncomment to dump the decoded page body:
    # print(info.decode())

    # Print the HTTP status code.
    print(response.getcode())
    print('*' * 50)
    # Print the final URL (after any redirects).
    print(response.geturl())
    print('*' * 50)
    # Print the response headers.
    print(response.info())

request的使用

from urllib.request import urlopen, Request
from fake_useragent import UserAgent

url = 'https://www.baidu.com'

# Spoof a Chrome User-Agent so the server does not reject the default
# "Python-urllib" client string.
ua = UserAgent()
headers = {
    "User-Agent": ua.chrome
}

request = Request(url, headers=headers)
# urllib stores header names capitalized, hence the 'User-agent' lookup key.
print(request.get_header('User-agent'))

# Context manager closes the connection even if read() raises
# (the original leaked the response object).
with urlopen(request) as response:
    info = response.read()

print(info.decode())

get请求编码转换

get请求时,往往需要在url上添加参数,但是如果参数值是中文的话会出现请求报错的情况,我们就需要先将中文转换成URL编码才行

from urllib.request import urlopen, Request
from fake_useragent import UserAgent
from urllib import parse

# Single parameter: percent-encode the Chinese value with parse.quote.
# url = 'https://www.baidu.com/s?wd={}'.format(parse.quote('火狐'))

# Multiple parameters: urlencode builds (and escapes) the whole query string.
args = {
    'wd': '火狐',
    # NOTE(review): Baidu's charset parameter is usually 'ie', not 'id' — confirm intent.
    'id': 'utf-8'
}
url = 'https://www.baidu.com/s?' + parse.urlencode(args)
# The original printed the URL twice; once is enough.
print(url)

ua = UserAgent()
headers = {
    "User-Agent": ua.chrome
}

request = Request(url, headers=headers)

# Context manager ensures the response is closed (the original leaked it).
with urlopen(request) as response:
    info = response.read()

print(info.decode())

Post请求

from urllib.request import Request, urlopen
from urllib.parse import urlencode
from fake_useragent import UserAgent
import ssl


url = 'https://www.maguangyi.top/tzsc/login.php'

# POST body fields ("from_data" in the original was a typo for form_data).
form_data = {
    'pwd': '123456'
}

headers = {
    'User-Agent': UserAgent().chrome
}

# urlencode the form, then encode to bytes: passing `data` makes urllib
# issue a POST request instead of a GET.
f_data = urlencode(form_data)

request = Request(url, data=f_data.encode(), headers=headers)

# Disable SSL certificate verification. This is insecure (vulnerable to
# man-in-the-middle) and acceptable only for throwaway demos like this one.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Context manager closes the connection (the original leaked the response).
with urlopen(request, context=ctx) as response:
    print(response.read().decode())
原文地址:https://www.cnblogs.com/maguangyi/p/14194594.html