# 简单反爬虫代码 (simple anti-crawler bypass demos with urllib)

import urllib.request

# Issue a plain GET request and inspect the response object type.
res = urllib.request.urlopen('http://www.baidu.com/')
print(type(res))

# Useful accessors on the response object:
# res.getcode()    -> HTTP status code
# print(res.getcode())

# res.geturl()     -> the URL that was actually requested
# print(res.geturl())

# res.getheaders() -> list of response header (name, value) pairs
# print(res.getheaders())

# read() returns the body as raw bytes, so it must be decoded to text:
#   encode: text  -> bytes
#   decode: bytes -> text  (common Chinese encodings: utf-8, gbk, gb2312)
# print(res.read().decode('utf-8'))

# Saving the decoded page to disk:
# with open("baidu.html","w",encoding="utf-8") as f:
#     f.write(res.read().decode("utf-8"))

# Downloading a page straight to a file:
# url = 'http://www.baidu.com/'
# urllib.request.urlretrieve(url=url,filename='baidu.html')

# Downloading an image:
# img_url = "http://b-ssl.duitang.com/uploads/item/201601/28/20160128084015_z3cUP.jpeg"
# name = img_url.rsplit("/")[-1-2]
# urllib.request.urlretrieve(url=img_url,filename=name+".jpg")


# Downloading a video:
# vedio_url = 'http://v6-default.ixigua.com/741142c2612117615b8343d7a6c12643/5cece6c6/video/m/2204a5c8c6f50be412db774c6d688b6bade1162097c100008550afaf2a34/?rc=M29wdjxrNHFxbTMzaTczM0ApQHRAbzQ5NTM7MzQzMzY3NDUzNDVvQGg1dilAZzN3KUBmM3UpZHNyZ3lrdXJneXJseHdmNzZAMnJzYW5eLV5hXy0tYS0vc3MtbyNvIzI0MC8vMS0uNDAxNTI2LTojbyM6YS1vIzpgLXAjOmB2aVxiZitgXmJmK15xbDojMy5e&vfrom=xgplayer'
# urllib.request.urlretrieve(url=vedio_url,filename="toutiao.mp4")


# Same request as above, but sent with a browser User-Agent header so the
# server treats us like a normal browser instead of urllib's default agent.
url = 'http://www.baidu.com/'
ua_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
}

# Wrap the URL and headers in a Request object, then open it.
baidu_request = urllib.request.Request(url=url, headers=ua_headers)
res = urllib.request.urlopen(baidu_request)
# Report the HTTP status code of the response.
print(res.getcode())



# Build a Baidu search URL from user input and fetch it.
# NOTE: `urllib.parse` is now imported explicitly — the original relied on
# `import urllib.request` happening to load the `parse` submodule as a side
# effect, which is an implementation detail, not a guarantee.
import urllib
import urllib.parse
import urllib.request

url1 = 'http://www.baidu.com/s?'
name = input('请输入要查询的内容:')
# Query parameters; urlencode() percent-encodes non-ASCII values (e.g. Chinese).
source = {
    'wd': name
}
url2 = urllib.parse.urlencode(source)
url3 = url1 + url2
print(url3)

# Fetch the search page and report the HTTP status code.
res = urllib.request.urlopen(url3)
print(res.getcode())


# http://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3
# %e7%be%8e%e5%a5%b3



# 百度贴吧 (crawl Baidu Tieba forum pages)
# Crawl N listing pages of a given Baidu Tieba forum and print each status code.
# Fixes: the for-loop body was not indented (IndentationError), and
# `urllib.parse` is imported explicitly instead of relying on the side effect
# of importing `urllib.request`.
import urllib
import urllib.parse
import urllib.request

# Fixed base URL of the forum listing endpoint.
url1 = 'http://tieba.baidu.com/f?'

name = input('请输入要查询的贴吧名称:')
pge = int(input('请输入要查询的页数:'))
# 'kw' is the forum-name query parameter.
source = {
    'kw': name
}
# Percent-encode the (possibly non-ASCII) forum name.
url2 = urllib.parse.urlencode(source)

# Join base URL and encoded query.
url3 = url1 + url2
# 'pn' advances in steps of 50 posts: page 1 -> 0, page 2 -> 50, ...
for page in range(1, pge + 1):
    pn = (page - 1) * 50
    full_url = url3 + "&pn=%s" % pn
    # Fetch the page
    res = urllib.request.urlopen(full_url)
    # and report its HTTP status code.
    print(res.getcode())



# import urllib
# # import urllib.request
# #
# # url='https://tieba.baidu.com/f?'

# # name = input('请输入要查询的贴吧:')
# # page = int(input('请输入要查询的页数:'))
# #
# # headers = {
# # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36",
# # "Host": "www.baidu.com",
# # }
# # source = {
# # 'kw':name
# # }
# # new_url = urllib.parse.urlencode(source)
# # url2 = url + new_url
# #
# # for i in range(1,page+1):
# # pn = (i-1)*50
# # full_url = url2 + "&pn=%s"%pn
# # #构造请求
# # req = urllib.request.Request(url=full_url,headers=headers)
# # #获取响应
# # res = urllib.request.urlopen(req)
# # print(res.getcode())
# # # urllib.request.urlretrieve(url=full_url,filename='第%s页.html'%i)



# Crawl Tieba pages with a browser User-Agent and save each page as an HTML
# file. Fixes: the for-loop body was not indented (IndentationError), and
# `urllib.parse` is imported explicitly.
import urllib
import urllib.parse
import urllib.request

# kw: forum name to search
# pn: page offset in steps of 50; the first page is 0
url = "https://tieba.baidu.com/f?"
name = input("请输入你要搜索的贴吧:")
page = int(input("请输入要爬取的页数:"))
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Host": "tieba.baidu.com"
}
source = {
    "kw": name
}
url1 = urllib.parse.urlencode(source)
url2 = url + url1
for i in range(1, page + 1):
    pn = (i - 1) * 50
    full_url = url2 + "&pn=%s" % pn
    req = urllib.request.Request(url=full_url, headers=headers)
    res = urllib.request.urlopen(req)
    # print(res.getcode())
    # NOTE(review): urlretrieve sends its own request and ignores `headers`,
    # so the saved page may differ from the urlopen() response above.
    urllib.request.urlretrieve(url=full_url, filename="第%s页.html" % i)

# 百度翻译 (Baidu Translate suggestion endpoint, POST request)
# Query Baidu Translate's suggestion endpoint and print the first translation.
from urllib import request, parse
import json

# POST submission address. The presence of a `data` argument on Request is
# what turns a GET into a POST.
url = "https://fanyi.baidu.com/sug"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
}
name = input("请输入要翻译的内容:")
# Form data for the POST body.
form = {
    "kw": name
}
# urlencode() yields a string; POST bodies must be bytes, hence bytes(...).
payload = bytes(parse.urlencode(form), encoding="utf-8")
req = request.Request(url=url, data=payload, headers=headers)
# Send the request and decode the response body.
content = request.urlopen(req).read().decode("utf-8")
# print(content)
# The body is a JSON string; parse it into a dict.
res = json.loads(content)
print(res)
# First suggestion's translation text.
print(res['data'][0]['v'])




# 豆瓣电影 (Douban movie top-chart API)
# Fetch one page of Douban's movie top chart and print each title and cast.
# Fix: the for-loop body was not indented (IndentationError).
from urllib import request, parse
import json

url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=40&limit=20"

headers = {
    "User-Agent": "Safari5.0:Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_7) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4"
}
# start/limit are sent in the POST body (they also appear in the query string).
form = {
    "start": "40",
    "limit": "20"
}
forms = parse.urlencode(form)
req = request.Request(url=url, data=bytes(forms, encoding="utf-8"), headers=headers)
response = request.urlopen(req)
content = response.read().decode("utf-8")
# The endpoint returns a JSON array of movie records.
res = json.loads(content)
print(res)
for i in res:
    title = i['title']
    actors = i['actors']
    print(title)
    print(actors)






# from urllib import request,parse
# import json
#
# url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action="
#
# headers = {
# "User-Agent": "Safari5.0:Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_7) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4"
# }
#
#
# form = {
# "start":"10",
# "limit":"40"
# }
#
# forms = parse.urlencode(form)
#
# req = request.Request(url=url,data = bytes(forms,encoding="utf-8"),headers=headers)
#
# response = request.urlopen(req)
#
# conten = response.read().decode("utf-8")
#
# res = json.loads(conten)
#
# # print(res)
#
# for var in res:
# title = var["title"]
# actors = var["actors"]
#
# print(title)
# print(actors)



# 模拟登陆 (simulated login by replaying a session cookie)
# Fetch a Qzone profile page by replaying a logged-in session cookie, then
# save the HTML to disk. Fix: the `with` body was not indented
# (IndentationError).
from urllib import request

url = "https://user.qzone.qq.com/2862346891"
headers = {
    "User-Agent": "Safari5.0:Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_7) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
    # Session cookie captured from a logged-in browser; expires server-side.
    "cookie": "tvfe_boss_uuid=58a76f49bc6ae610; pgv_pvid=7834310198; _qpsvr_localtk=0.6362081929899539; pgv_pvi=9847598080; pgv_si=s2316292096; pgv_info=ssid=s461279900; uin=o2862346891; skey=@VHWWL050N; ptisp=ctc; RK=hRqQfr4Nwh; ptcz=fc6eb6560e4aa5836aa5f0ac644eede30edc9c95840ea9126bb7462bcc1e13c6; p_uin=o2862346891; pt4_token=uvlbuVIP21ETttWCMi*1wnd2sjHOOsnfKyW3A0vMbqI_; p_skey=ynMqbG7hjPmWuwpOwlccdBS*I36xq9z3jBf83sJq7Ik_; Loading=Yes; qz_screen=1280x720; 2862346891_todaycount=0; 2862346891_totalcount=29429; QZ_FE_WEBP_SUPPORT=1; __Q_w_s_hat_seed=1; rv2=80336DBCB68C00892BB825EA0FACDE3AD70C89C10A1903D221; property20=63A613D7498074A6D1571F37A4941C048B9F458DA01089D6A40BF0D8EB5D330F6A4905911FFF9094; cpu_performance_v8=11; v6uin=2862346891|qzone_player"
}
req = request.Request(url=url, headers=headers)
response = request.urlopen(req)
content = response.read().decode("utf-8")
with open("qq.html", "w", encoding="utf-8") as f:
    f.write(content)


# 肯德基店铺位置 (KFC store-location API, POST request)
# Query KFC's store-list API and print the address and name of each store.
# Fix: the for-loop body was not indented (IndentationError).
from urllib import request, parse
import json

url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"

}
size = int(input("请输入要查询多少条数据:"))
# POST form: city name, page index and the user-chosen page size.
form = {
    "cname": "北京",
    "pid": "",
    "pageIndex": 1,
    "pageSize": size,
}
forms = parse.urlencode(form)
req = request.Request(url=url, data=bytes(forms, encoding="utf-8"), headers=headers)
response = request.urlopen(req)
result = response.read().decode("utf-8")
# Response JSON holds the store records under the 'Table1' key.
res = json.loads(result)
print(res)
for i in res['Table1']:
    addressDetail = i["addressDetail"]
    storeName = i["storeName"]
    print("addressDetail:", addressDetail)
    print("storeName:", storeName)












# 原文地址 (original article): https://www.cnblogs.com/wyf2019/p/10946334.html