Python: fetching pages behind a submitted login form (i.e. password-protected URLs), using a finance website as the example
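In outline: create a requests.Session, POST the login form once, and reuse that session for every page behind the login wall, because the session keeps whatever cookies the server sets. A minimal sketch of the pattern (the URLs and field names here are placeholders, not the real endpoints):

import requests

s = requests.Session()
# the session stores the cookies set by the login response
s.post("http://example.com/login", data={"username": "u", "password": "p"})
# later requests ride on those cookies and reach the protected page
page = s.get("http://example.com/protected")
print(page.status_code)

The finance site below complicates this with CAS single sign-on and manual redirect handling, but the idea is the same.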

import requests
import re
from functools import reduce
import json
import base64
import time
import os


-------- Python packages to install ---------------


pwd = base64.b64decode("")    # Base64-decode the stored password (fill in the encoded string)

loginMeta = {"username": '', "pwd": pwd}  # username and password

# Log in through the CAS endpoint; follow the redirect chain by hand so the
# intermediate Set-Cookie headers can be captured
msession = requests.Session()
ret = msession.post("http://uis.shou.edu.cn/cas/login?isLoginService=11&service=http://ecampus.shou.edu.cn/c/portal/login",
                    {
                        "username": loginMeta['username'],
                        "password": loginMeta['pwd'],
                        "submit": ""
                    }, allow_redirects=False)
ret = msession.get(ret.headers['Location'], allow_redirects=False)
# Set-Cookie looks like "JSESSIONID=<value>; Path=/; ...": keep only the value
Jsession = ret.headers['Set-Cookie'].split(';')[0].split('=', 1)[1]
ret = msession.get(ret.headers['Location'], allow_redirects=False)

Jheaders = {'Cookie': 'COOKIE_SUPPORT=true; JSESSIONID=%s; GUEST_LANGUAGE_ID=zh_CN' % Jsession}

getASessionUrl = ''  # URL that issues the ASP.NET session (left blank in the source)
ret = msession.get(getASessionUrl, headers=Jheaders, allow_redirects=False)
while 'Location' in ret.headers:
    ret = msession.get(ret.headers['Location'])

fwUrl = ""  # entry URL of the finance system (left blank in the source)
ret = msession.get(fwUrl, headers=Jheaders, allow_redirects=False)
while 'Location' in ret.headers:
    ret = msession.get(ret.headers['Location'])
ACookies = requests.utils.dict_from_cookiejar(msession.cookies)

ret = msession.get('')  # two further page fetches (URLs left blank in the source)

ret = msession.get('')


------------ Login section: keep the cookies for the long term -------------------------------------
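The divider above mentions keeping the cookies for the long term, but the script itself re-logs-in on every run. A sketch of persisting the session cookies to disk with pickle (the file name cookies.pkl is an assumption, not from the original post):

import pickle

def save_cookies(session, path="cookies.pkl"):
    # dump the cookie jar as a plain dict so it can be reloaded later
    with open(path, "wb") as f:
        pickle.dump(requests.utils.dict_from_cookiejar(session.cookies), f)

def load_cookies(session, path="cookies.pkl"):
    # restore the saved cookies into a fresh session, skipping the CAS login
    with open(path, "rb") as f:
        session.cookies = requests.utils.cookiejar_from_dict(pickle.load(f))

Server-side sessions still expire, so a stored cookie only postpones the next login rather than replacing it.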
def parseOrderInfo(content):
    # collapse the markup onto one line so the regexes below can scan it
    content = content.replace("\r", '').replace('\n', '').replace('&nbsp;', ' ')
    eles = re.findall(r'<tr.{0,4}orderno="\d+".{900}', content, re.I)
    orders = []
    for ele in eles:
        # re.subn returns (new_string, count); keep each cell text, space-separated
        p = re.subn(r"<td.*?>(.*?)</td>", r" \1", ele)
        p = re.subn(r"<input.*?/>", "", p[0])
        p = p[0]
        # keep only tokens longer than one character
        p = [x for x in p.split(' ') if len(x.strip()) > 1]
        print(p)
        if len(p) > 6:
            cinfo = {}
            cinfo["orderId"] = p[1].strip()
            cinfo["project"] = p[2].strip()
            cinfo["reason"] = p[3].strip()
            cinfo["pay"] = p[4].strip()
            cinfo["date"] = p[6].strip()
            orders += [cinfo]
        else:
            raise Exception("too few fields parsed; the order description may be too long")

    return orders
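The regex above is tied to the exact page layout (the fixed 900-character window and the field positions). As a more tolerant sketch, the standard library's html.parser can collect the <td> texts of each order row directly; the cell order is still an assumption about the live page:

from html.parser import HTMLParser

class TDCollector(HTMLParser):
    # collects the text of every <td>, grouped per <tr orderno="..."> row
    def __init__(self):
        super().__init__()
        self.rows = []
        self._cells = None
        self._in_td = False

    def handle_starttag(self, tag, attrs):
        if tag == 'tr' and any(k == 'orderno' for k, v in attrs):
            self._cells = []
        elif tag == 'td' and self._cells is not None:
            self._in_td = True
            self._cells.append('')

    def handle_endtag(self, tag):
        if tag == 'td':
            self._in_td = False
        elif tag == 'tr' and self._cells is not None:
            self.rows.append(self._cells)
            self._cells = None

    def handle_data(self, data):
        if self._in_td:
            self._cells[-1] += data.strip()

Feeding the response body in via TDCollector().feed(seaContent) leaves one list of cell texts per order row in .rows.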


# Fetch page 1 of the order list by submitting the search form
ret = msession.post(url='http://cwc1.shou.edu.cn:82/SFP_ClaimsSelf/OrderQuery/OrderShow',
                    data={'DepartProject': '',
                          'Depart': '',
                          'depname': '',
                          'Object': '',
                          'projectname': '',
                          'OrderStartTime': '',
                          'OrderEndTime': '',
                          'OrderNo': '',
                          'OrderState': '1,2,3,4,5,8,-1',
                          'ExpenBusinessType': '',
                          'currentPageIndex': '1',
                          'num': '1',
                          'isture': 'false',
                          'ProxyPerson': '',
                          'OrderRemark': ''},
                    headers={'Cookie': 'ASP.NET_SessionId=%s; SFP_Verify_Cookie=%s'
                                       % (ACookies["ASP.NET_SessionId"],
                                          ACookies["SFP_Verify_Cookie"]),
                             'Referer': 'http://cwc1.shou.edu.cn:82/SFP_ClaimsSelf/OrderQuery/OrderIndex',
                             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                             'X-Requested-With': 'XMLHttpRequest'})
time.sleep(2)
seaContent = ret.content.decode()
orderInfo = parseOrderInfo(seaContent)
orders = re.findall(r"SFP_ClaimsSelf/OrderQuery/PrintOrder\?OrderNo=(\d+)", seaContent)
# pages=2
pages = int(re.findall(r"pagecount: '(\d*)'", seaContent)[0])
if pages > 1:
    # page 1 was already fetched above, so only request pages 2..pages
    for i in range(2, pages + 1):
        ret = msession.post(url='http://cwc1.shou.edu.cn:82/SFP_ClaimsSelf/OrderQuery/OrderShow',  # submit the POST form
                            data={'DepartProject': '',
                                  'Depart': '',
                                  'depname': '',
                                  'Object': '',
                                  'projectname': '',
                                  'OrderStartTime': '',
                                  'OrderEndTime': '',
                                  'OrderNo': '',
                                  'OrderState': '1,2,3,4,5,8,-1',
                                  'ExpenBusinessType': '',
                                  'currentPageIndex': '%d' % i,
                                  'num': '2',
                                  'isture': 'false',
                                  'ProxyPerson': '',
                                  'OrderRemark': ''},
                            headers={'Cookie': 'ASP.NET_SessionId=%s; SFP_Verify_Cookie=%s'
                                               % (ACookies["ASP.NET_SessionId"],
                                                  ACookies["SFP_Verify_Cookie"]),
                                     'Referer': 'http://cwc1.shou.edu.cn:82/SFP_ClaimsSelf/OrderQuery/OrderIndex',
                                     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                                     'X-Requested-With': 'XMLHttpRequest'})
        seaContent = ret.content.decode()
        orders += re.findall(r"SFP_ClaimsSelf/OrderQuery/PrintOrder\?OrderNo=(\d+)", seaContent)
        orderInfo += parseOrderInfo(seaContent)

time.sleep(1)
# Download the printable PDF of every order
for orderId in orders:
    Url = 'http://cwc1.shou.edu.cn:82/SFP_ClaimsSelf/OrderQuery/PrintOrder?OrderNo='
    printUrl = Url + orderId

    # print(printUrl)
    result = requests.get(url=printUrl,
                          headers={'Cookie': 'ASP.NET_SessionId=%s; SFP_Verify_Cookie=%s'
                                             % (ACookies["ASP.NET_SessionId"],
                                                ACookies["SFP_Verify_Cookie"]),
                                   'Referer': ret.url,
                                   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                                   })
    address = "E:/totally/FinancePDF/"  # directory the PDF files are saved to
    fileName = orderId + ".pdf"
    if os.path.isfile(address + fileName):
        print(fileName + ' already exists')
    else:
        with open(address + fileName, "wb") as f:
            f.write(result.content)


sumInfo = {"detail": orders}

print(json.dumps(sumInfo, indent=4))

------------------ Form-submission section (no form field may be omitted, or the crawl stops) --------------------------------------
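Since the server stops returning results when any form field is missing, a helper like the following (a sketch, not part of the original post) keeps the payload complete while only the page index and the num flag vary:

def order_show_payload(page_index, num):
    # every field the OrderShow endpoint expects, empty unless overridden
    payload = {k: '' for k in ('DepartProject', 'Depart', 'depname', 'Object',
                               'projectname', 'OrderStartTime', 'OrderEndTime',
                               'OrderNo', 'ExpenBusinessType', 'ProxyPerson',
                               'OrderRemark')}
    payload.update({'OrderState': '1,2,3,4,5,8,-1',
                    'currentPageIndex': str(page_index),
                    'num': str(num),
                    'isture': 'false'})
    return payload

With it, each paginated request above reduces to msession.post(url, data=order_show_payload(i, 2), headers=...).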


Original post: https://www.cnblogs.com/setname/p/8417737.html