Downloading Flow Data with Python

The script below logs in to an ASP.NET site (cookie handling via http.cookiejar, hidden __VIEWSTATE/__EVENTVALIDATION fields scraped from the login page), posts the product-flow search form for a given date range, and writes the tab-separated result to an Excel file with xlwt.

#-*- coding:utf-8 -*-
import gzip
import re
import http.cookiejar
import urllib.request
import urllib.parse
import xlwt
import time, os


def saveexcel(flow, filename, coding='gbk'):
    # flow: the data to write to Excel, as a nested (two-level) list of rows
    # coding: encoding of the Excel sheet
    try:
        workbook = xlwt.Workbook(encoding=coding)
        sheet = workbook.add_sheet('Sheet1')
        for row, rowdata in enumerate(flow):
            for col, val in enumerate(rowdata):
                sheet.write(row, col, val.strip(), style=xlwt.Style.default_style)
        excelname = '%s.xls' % filename
        workbook.save(excelname)
        return excelname

    except Exception as e:
        if hasattr(e, "code"):
            print('Failed to write Excel, reason: ' + str(e.code))
        if hasattr(e, "reason"):
            print('Failed to write Excel, reason: ' + str(e.reason))
        return None

# Extract the value of a form field from the given page source
def getParm(data, parm):
    cer = re.compile('name="' + parm + '".* value="(.*?)"', flags=0)
    strlist = cer.findall(data)

    if strlist:
        return strlist[0]
    else:
        return None

def getOpener():
    # Build an opener that handles cookies automatically
    print('Setting up cookies...')
    cj = http.cookiejar.CookieJar()
    pro = urllib.request.HTTPCookieProcessor(cj)
    opener = urllib.request.build_opener(pro, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    print('Cookies set up successfully')
    return opener


header = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
}

url = 'http://115.231.58.130:8021/Default.aspx'
header['Referer'] = 'http://115.231.58.130:8021/'
# 1. Set up cookies
opener = getOpener()


## 2. Fetch the login page and extract its form parameters
request = urllib.request.Request(url)
try:
    html = urllib.request.urlopen(request).read()
    # Extract the hidden form fields
    EVENTVALIDATION = getParm(html.decode('gbk'), '__EVENTVALIDATION')
    VIEWSTATEGENERATOR = getParm(html.decode('gbk'), '__VIEWSTATEGENERATOR')
    VIEWSTATE = getParm(html.decode('gbk'), '__VIEWSTATE')
    btnsubmit = getParm(html.decode('gbk'), 'sbtnSubmit')

except urllib.request.URLError as e:
    if hasattr(e, "code"):
        print('Failed to request the page, check your network settings, reason: ' + str(e.code))
    if hasattr(e, "reason"):
        print('Failed to request the page, check your network settings, reason: ' + str(e.reason))
# End of form parameter extraction


id = '***'
password = '***'
postDict = {
    'LoginID': id,
    'Pwd': password,
    '__EVENTVALIDATION': EVENTVALIDATION,
    '__VIEWSTATEGENERATOR': VIEWSTATEGENERATOR,
    '__VIEWSTATE': VIEWSTATE,
    'btnSubmit': btnsubmit
}
postData = urllib.parse.urlencode(postDict).encode(encoding='UTF8')

## 3. Log in
request = urllib.request.Request(url, postData, headers=header)
try:
    response = urllib.request.urlopen(request)
    data = response.read()
except urllib.request.URLError as e:
    if hasattr(e, "code"):
        print('Failed to load the page, check your network and account settings, reason: ' + str(e.code))
    if hasattr(e, "reason"):
        print('Failed to load the page, check your network and account settings, reason: ' + str(e.reason))

# Login finished
print('login:', data.decode('gbk'))

## 4. Open the product search page and extract the flow data

posturl = 'http://115.231.58.130:8021/Search/ProductFlow.aspx'
request = urllib.request.Request(posturl)
try:
    html = urllib.request.urlopen(request).read()
    # Extract the hidden form fields
    EVENTVALIDATION = getParm(html.decode('gbk'), '__EVENTVALIDATION')
    VIEWSTATEGENERATOR = getParm(html.decode('gbk'), '__VIEWSTATEGENERATOR')
    VIEWSTATE = getParm(html.decode('gbk'), '__VIEWSTATE')
    #btnsearch = getParm(html.decode('gbk'), 'btnSearcht')
except urllib.request.URLError as e:
    if hasattr(e, "code"):
        print('Failed to request the page, check your network settings, reason: ' + str(e.code))
    if hasattr(e, "reason"):
        print('Failed to request the page, check your network settings, reason: ' + str(e.reason))


postDict['__EVENTTARGET'] = ''
postDict['__EVENTARGUMENT'] = ''
postDict['__EVENTVALIDATION'] = EVENTVALIDATION
postDict['__VIEWSTATEGENERATOR'] = VIEWSTATEGENERATOR
postDict['__VIEWSTATE'] = VIEWSTATE
postDict['PName'] = ''
postDict['PID'] = ''
postDict['txtStartDate'] = '2016-01-01'
postDict['txtEndDate'] = '2016-01-31'
postDict['ConvertToExcel.x'] = '6'
postDict['ConvertToExcel.y'] = '9'
postDict['btnSearch'] = ''
postData = urllib.parse.urlencode(postDict).encode(encoding='UTF8')

print('Search page fetched successfully, downloading flow data...')


### Submit the search form
request = urllib.request.Request(posturl, postData, headers=header)
try:
    response = urllib.request.urlopen(request)
    data = response.readlines()
except urllib.request.URLError as e:
    if hasattr(e, "code"):
        print('Failed to load the page, check your network and account settings, reason: ' + str(e.code))
    if hasattr(e, "reason"):
        print('Failed to load the page, check your network and account settings, reason: ' + str(e.reason))
print('Flow data downloaded, saving as Excel...')
print('search:', data)

## 5. Save as Excel
workbook = xlwt.Workbook(encoding='gbk')
sheet = workbook.add_sheet('Sheet1')
for row, rowdata in enumerate(data):
    # Each line of the response is a tab-separated row of flow data
    rowdata_list = rowdata.decode('gbk').split('\t')
    for col, val in enumerate(rowdata_list):
        sheet.write(row, col, val, style=xlwt.Style.default_style)

ntime = time.strftime('%Y%m%d%H%M%S')
excelname = ntime + '%s.xls' % '宁波宝瑞达'
workbook.save(excelname)
print('Excel exported successfully, see the file %s in the script directory.' % excelname)
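
Note that the saveexcel helper defined at the top of the script is never actually called; step 5 repeats the same xlwt logic inline. A minimal sketch of how the final step could reuse the helper instead, assuming data still holds the GBK-encoded, tab-separated lines returned by the search request (the filename prefix is just the same one used above):

# Sketch only: reuse the saveexcel helper instead of the inline xlwt code in step 5.
# Assumes `data` is the list of GBK-encoded, tab-separated byte lines from readlines().
rows = [line.decode('gbk').split('\t') for line in data]
ntime = time.strftime('%Y%m%d%H%M%S')
excelname = saveexcel(rows, ntime + '宁波宝瑞达')
if excelname:
    print('Excel exported successfully, see the file %s in the script directory.' % excelname)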
Original article: https://www.cnblogs.com/lrzy/p/5555077.html