<双十一特辑> 模拟登录学校教务处爬取全校女生资料和头像


  1 #-*- coding=utf-8 -*- 
  2 import requests
  3 import re
  4 import json
  5 import time
  6 from PIL import Image
  7 import cStringIO
  8 import cookielib  
  9 import urllib
 10 import os
 11 import xlrd
 12 
 13 from requests.packages.urllib3.exceptions import InsecureRequestWarning,InsecurePlatformWarning
 14 requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
 15 requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)
 16 
 17 data=xlrd.open_workbook('1.xlsx')
 18 table=data.sheet_by_name(u'Sheet1')
 19 
 20 message_url='https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentProfile:DEFAULT_EVENT'
 21 login_url='https://matrix.dean.swust.edu.cn/cas/login'
 22 topic_url=''
 23 flag=0
 24 temp=''
 25 pic_count=1
 26 
 27 student = {}
 28 student = {
 29     '学号':'',
 30     '姓名':'',
 31     '性别':'',
 32     '生日':'',
 33     'pic':'',
 34     '民族':'',
 35     '行政班':'',
 36     '专业':'',
 37     }
 38 
 39 headers={
 40 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
 41 }
 42 
 43 session=requests.Session()
 44 session.headers=headers    
 45 session.cookies = cookielib.LWPCookieJar(filename='cookies') 
 46 # try:  
 47 #     session.cookies.load(ignore_discard=True)  
 48 # except:  
 49 #     print u"未登陆过,需先登录"  
 50 
 51 
 52 def get_lt(url="https://matrix.dean.swust.edu.cn/cas/login"):  
 53     '''''_lt 是一个动态变化的参数'''  
 54     global session
 55     index_url =  url
 56     index_page = session.get(index_url,verify=False)  
 57     html = index_page.content  
 58     pattern = r'name="lt" type="hidden" value="(.*?)"'  
 59     lt = re.findall(pattern, html)
 60     return lt[0]
 61 
 62 def login(username,password):
 63     global session
 64     global topic_url
 65     global flag
 66     data={
 67     'lt':get_lt(),
 68     'username':username,
 69     'password':password,
 70     'service':'https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentPortal:DEFAULT_EVENT',
 71     }
 72     loginurl=login_url
 73     try:
 74         login_page=session.post(loginurl,data=data)
 75         login_code=login_page.content
 76         pattern=r'<a class="btn btn-primary" href="(.*?)"'
 77         real_url=re.findall(pattern, login_code)
 78         topic_url=real_url[0]
 79         flag=1
 80     except:
 81         pass
 82     session.cookies.save()
 83 
 84 def error_clean(error_temp):
 85     global student
 86     global temp
 87     if(error_temp==temp):
 88         session.cookies.clear()
 89         student = {
 90         '学号':'',
 91         '姓名':'',
 92         '性别':'',
 93         '生日':'',
 94         'pic':'',
 95         '民族':'',
 96         '行政班':'',
 97         '专业':'',
 98         }
 99         flag=0
100         topic_url=''
101     else:
102         pass
103 
104 
105 
106 
107 def isLogin():  
108     global session
109     url = "https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentProfile:DEFAULT_EVENT"  
110     login_code = session.get(url, allow_redirects=False).status_code  
111     if int(x=login_code) == 200:  
112         return True  
113     else:  
114         return False 
115 
116 def get_message():
117     global session
118     global topic_url
119     global message_url
120     global student
121 
122     html=session.get(topic_url)
123     html=session.get(message_url).text
124 
125     pattern_ming=r'<td>(.*?)</td>'
126     pattern_id=r'<span class="number">(.*?)</span>'
127     pattern_pic=r'<td style="padding:0;" width="135" height="180" valign="middle" align="center" rowspan="6"><img width="135" height="180" align="middle" src="(.*?)" /></td>'
128     message_name=re.findall(pattern_ming, html)
129     message_pic=re.findall(pattern_pic, html)
130     try:
131         student['学号']=re.findall(r'<span class="number">(d*?)</span>', message_name[2])[0]
132         student['姓名']=message_name[4]
133         student['性别']=message_name[6]
134         student['专业']=message_name[37]
135         student['行政班']=message_name[27]
136         student['pic']='https://matrix.dean.swust.edu.cn/acadmicManager/student/profile/'+student['学号']+'.jpg'
137 
138     except:
139         pass
140 
141     #student['生日']=re.findall(r'<span class="number">(.*?)</span>', message_name[8])[0]
142     #student['民族']=message_name[10]
143 
144 
145 def download():
146     global student
147     global session
148     global temp
149     global pic_count
150     basepath=os.path.abspath('.')
151     savepath=os.path.join(basepath,student['专业'])
152     if not os.path.exists(savepath):
153         os.mkdir(savepath)
154     try:
155         picpath=os.path.join(savepath,student['姓名']+student['学号']+'.jpg')
156         r=session.get(student['pic'])
157         with open(picpath, "wb") as pic:
158             pic.write(r.content)
159         print u'>>>>>>>>>成功抓取>>>>>>>>>>>>>>>>>>>>'+student['姓名']
160         temp=student['姓名']
161         session.cookies.clear()
162     except Exception, e:
163         pass
164     
165 
166 if __name__ == '__main__':
167     count=table.nrows
168     i=5000
169     while(count>0):
170         if(table.col_values(3)[i]==u'' and table.col_values(2)[i]!=u'王珀会'):
171             try:
172                 login(str(int(table.col_values(1)[i])), str(table.col_values(13)[i])[11:17])
173             except:
174                 pass
175         if(flag==1):
176             flag=0
177             get_message()
178             download()
179         count=count-1
180         i=i+1
181         session.cookies.clear()

总结:
python处理excel>>  http://www.cnblogs.com/lhj588/archive/2012/01/06/2314181.html
session释放>>    
http://stackoverflow.com/questions/23816139/clear-cookies-from-requests-pytho
注明:
  1.xlsx为提供学生资料的excel
  异常处理之间的妥协关系需要事先计划好
原文地址:https://www.cnblogs.com/vincebye/p/6049465.html