1 Web Scraping: Baidu / Sogou / Douban / Drug Administration (药监局)

Crawl, crawl, crawl --

Two tools:
Anaconda  -- bundles Jupyter, used as the notebook environment for running the code
Fiddler4  -- a proxy tool for capturing and inspecting HTTP(S) traffic
Example 1: fetch an entire page (Sogou)
import requests
url = 'https://www.sogou.com/'  # 1. specify the URL
res = requests.get(url=url)     # 2. send the request and get the response object
page_text = res.text            # 3. the text attribute returns the response body as a string
with open('./sg.html','w',encoding='utf-8') as f:    # 4. persist the data
    f.write(page_text)
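A slightly hardened variant of the same fetch, as a sketch (the timeout and the apparent_encoding fallback are my additions, not part of the original post): failing loudly on an HTTP error beats silently saving an error page.

import requests

url = 'https://www.sogou.com/'
res = requests.get(url=url, timeout=10)   # timeout keeps a hung connection from blocking forever
res.raise_for_status()                    # raise requests.HTTPError on a 4xx/5xx status
res.encoding = res.apparent_encoding      # guess the charset from the body, not just the headers
with open('./sg.html', 'w', encoding='utf-8') as f:
    f.write(res.text)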
Example 2: Sogou search results page
# UA detection: the fix is to add a User-Agent (browser identifier) to the request headers (done in the updated Example 2 below)
import requests
url = 'https://www.sogou.com/web'
wd = input('What do you want to search: ')
param = {'query':wd}
res = requests.get(url=url,params=param)  # carry the query parameters
# print(res.encoding)  # ISO-8859-1  -- inspect the response's declared encoding
res.encoding = 'utf-8' # override the encoding
page_text = res.text
name = wd + '.html'
with open(name,'w',encoding='utf-8') as f:
    f.write(page_text)
    print(name,'scrape finished!')
Example 2 (updated): add the User-Agent key/value pair to the request headers
import requests
url = 'https://www.sogou.com/web'
wd = input('What do you want to search: ')
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
param = {
    'query':wd
}
res = requests.get(url=url,params=param,headers=headers)  # params plus headers; the UA header defeats the site's UA-detection anti-scraping check
# print(res.encoding)  # ISO-8859-1  -- inspect the response's declared encoding
res.encoding = 'utf-8' # override the encoding
page_text = res.text
name = wd + '.html'
with open(name,'w',encoding='utf-8') as f:
    f.write(page_text)
    print(name,'scrape finished!')
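When several requests share the same headers, a requests.Session can carry them once instead of repeating the dict each call. A minimal sketch of the same search using a session (my addition; behavior is otherwise identical to the script above):

import requests

session = requests.Session()
session.headers.update({'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'})

wd = input('What do you want to search: ')
res = session.get('https://www.sogou.com/web', params={'query': wd})  # UA header applied automatically
res.encoding = 'utf-8'
with open(wd + '.html', 'w', encoding='utf-8') as f:
    f.write(res.text)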
Example 3
# Fetch result data from Baidu Translate
# The page may contain dynamically loaded data; this suggestion endpoint is requested via an AJAX POST
import requests
url = 'https://fanyi.baidu.com/sug'
wd = input('enter a word: ')
data = {'kw':wd}
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
res = requests.post(url=url,data=data,headers=headers) # POST request
obj_json = res.json()  # json() deserializes the JSON response into a Python object
for i in obj_json['data']:
    print(i['k'],' ',i['v'])
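The loop above shows the endpoint returns JSON shaped like {'data': [{'k': ..., 'v': ...}, ...]}. A defensive wrapper, as a sketch under that same assumption (the function name baidu_sug is mine):

import requests

def baidu_sug(word):
    # Query the Baidu Translate suggestion endpoint; return a list of (k, v) pairs.
    url = 'https://fanyi.baidu.com/sug'
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
    res = requests.post(url=url, data={'kw': word}, headers=headers, timeout=10)
    res.raise_for_status()  # surface HTTP errors instead of parsing an error page
    return [(item['k'], item['v']) for item in res.json().get('data', [])]

for k, v in baidu_sug('dog'):
    print(k, v)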
Example 4
# Douban movie detail data
# Some pages contain dynamically loaded data: as you scroll the mouse wheel down, more data keeps loading
import requests
url = 'https://movie.douban.com/j/chart/top_list'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
param = {
    "type": "5",
    "interval_id": "100:90",
    "action": "",
    "start": "0",
    "limit": "50",
}
obj_json = requests.get(url=url,params=param,headers=headers).json()  # GET request with params
# print(obj_json)
print(len(obj_json))
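The start and limit parameters suggest the endpoint pages through the ranked list. A paging sketch, assuming an empty JSON list marks the end of the data (not verified against the live site):

import requests

url = 'https://movie.douban.com/j/chart/top_list'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
movies = []
start = 0
while True:
    param = {"type": "5", "interval_id": "100:90", "action": "",
             "start": str(start), "limit": "50"}
    batch = requests.get(url=url, params=param, headers=headers).json()
    if not batch:  # assume an empty list means we ran past the last page
        break
    movies.extend(batch)
    start += 50
print(len(movies))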
Example 5
# Drug Administration (药监局) cosmetics company data  http://125.35.6.84:81/xk/
import requests
post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
all_data = []
IDs = []
for page in range(1,3):
    data = {
        "on": "true",
        "page": str(page),
        "pageSize": "15",
        "productName": "",
        "conditionType": "1",
        "applyname": "",
        "applysn": "",
    }
    # response data from the home page's AJAX request
    json_obj = requests.post(url=post_url,data=data,headers=headers).json()
    for dic in json_obj["list"]:
        IDs.append(dic['ID'])
print(len(IDs))
for id in IDs:
    detail_post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    data = {'id':id}
    detail_dic = requests.post(url=detail_post_url,data=data,headers=headers).json()
    all_data.append(detail_dic)
print(all_data[0])
print(len(all_data))
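The script only prints all_data; a short sketch of persisting it with the standard-library json module (the filename is my choice), appended after the loop above:

import json

# all_data is the list of detail dicts built by the Example 5 script above
with open('./company_details.json', 'w', encoding='utf-8') as f:
    json.dump(all_data, f, ensure_ascii=False, indent=2)  # ensure_ascii=False keeps Chinese names readable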

# Below: a PM2.5 (air-quality) monitoring site

# Requirement: scrape all the city names on the page https://www.aqistudy.cn/historydata/
import requests
from lxml import etree
url = 'https://www.aqistudy.cn/historydata/'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
page_text = requests.get(url=url,headers=headers).text

tree = etree.HTML(page_text)
# hot cities: //div[@class="bottom"]/ul/li/a/text()
# all cities: //div[@class="bottom"]/ul/div[2]/li/a/text()
all_city_names = tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')
print(all_city_names,len(all_city_names))
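The | in that expression is XPath's union operator, combining both node sets in one query. The same list can be built with two separate queries, which reads more explicitly (equivalent as long as the page keeps this structure):

import requests
from lxml import etree

url = 'https://www.aqistudy.cn/historydata/'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
tree = etree.HTML(requests.get(url=url, headers=headers).text)
hot_cities = tree.xpath('//div[@class="bottom"]/ul/li/a/text()')
other_cities = tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text()')
all_city_names = hot_cities + other_cities  # same result as the | union query
print(len(all_city_names))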
import requests
from lxml import etree

url='http://industry.nbd.com.cn'
headers={
     'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}

page_text=requests.get(url=url,headers=headers).text
tree=etree.HTML(page_text)
lis_all=tree.xpath('//ul[@class="m-columnnews-list"]/li')
all_list=[]
for i in lis_all:
    title = i.xpath('./a/img/@alt')[0]  # the headline text is stored in the image's alt attribute
    all_list.append(title)
print(all_list)
NBD (每经网) hot featured topics, scraped with lxml etree and XPath
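One fragility in the loop above: i.xpath('./a/img/@alt')[0] raises IndexError for any li without an a/img child. A tolerant sketch of the same scrape (same URL and class names; the guard is my addition):

import requests
from lxml import etree

url = 'http://industry.nbd.com.cn'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
tree = etree.HTML(requests.get(url=url, headers=headers).text)
titles = []
for li in tree.xpath('//ul[@class="m-columnnews-list"]/li'):
    alt = li.xpath('./a/img/@alt')  # empty list when the item has no image
    if alt:
        titles.append(alt[0])
print(titles)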
Original post: https://www.cnblogs.com/zhangchen-sx/p/10792461.html