Python crawler

#! /usr/bin/env python
#coding=utf-8

import requests
import re, json
import sys, os
import Queue, threading
from bs4 import BeautifulSoup
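# Python 2 idiom: reload(sys) restores setdefaultencoding(), which the
# interpreter deletes from the sys module at startup, so the default
# str<->unicode codec can be switched from ascii to utf8 for the Chinese
# page content handled below.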
reload(sys)
sys.setdefaultencoding("utf8")

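# Single GET helper: every request sends the same browser-like headers,
# including a captured session Cookie. The cookie (and the JSESSIONID
# inside it) expires, so it presumably has to be refreshed before the
# crawler is pointed at the site again.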
def http_req_get(siteurl):
    headers = {
        "Host": "www.xuebang.com.cn",
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cookie": "__cfduid=da7335f4b0e760976f98697b651fc10041447572288; pgv_pvi=7944819712; deptNumOf11=140; deptNumOf89=60; deptNumOf711=60; commentNumOf11215=1074; deptNumOf1411=56; JSESSIONID=abcqLyMOLKEVDbynTTtev; a2666_pages=1; a2666_times=4; pgv_si=s4040530944; Hm_lvt_8147cdaed425fa804276ea12cd523210=1447572289,1447678990,1447734730; Hm_lpvt_8147cdaed425fa804276ea12cd523210=1447734730; CNZZDATA5928106=cnzz_eid%3D1168227404-1447570407-%26ntime%3D1447729389; Hm_lvt_863e19f68502f1ae0f9af1286bb12475=1447572289,1447678990,1447734730; Hm_lpvt_863e19f68502f1ae0f9af1286bb12475=1447734730; _ga=GA1.3.122575526.1447572289; _gat=1"}
    try:
        urlobj = requests.get(
            url=siteurl,
            headers=headers,
            timeout=30,  # don't let a stalled request hang a worker thread
        )
        return urlobj
    except Exception, e:
        print 'request failed:', siteurl, e
        return None  # callers must cope with a failed request

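# LinksParser wraps one fetched page; each method pulls a different piece
# out of the HTML.  Typical use, mirroring __main__ below:
#     parser = LinksParser(http_req_get(url))
#     path = parser.createDaXueDir()   # folder named after the school
#     parser.xueyuan(path)             # save the department list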
class LinksParser(object):
    def __init__(self, urlobj):
        self.urlobj = urlobj
        self.soup = BeautifulSoup(self.urlobj.text, "html.parser")

    # Create a directory for the university, named after the page title
    def createDaXueDir(self):
        # Locate the script's own directory; create the folder if missing
        current_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        # gb18030 keeps the Chinese folder name readable on a Windows file system
        real_path = os.path.join(current_dir, self.soup.title.text.encode('gb18030'))
        if not os.path.exists(real_path):
            try:
                os.mkdir(real_path)
            except OSError:
                pass
        return real_path

    # Fetch the departments and write them to a file in the school's folder
    def xueyuan(self, path):
        try:
            fh = open(path + '/xueyuan.txt', 'wb')
            for line in self.soup.find_all('a', {'class': 'yxcologe'}):
                fh.writelines(line.text.encode('gb18030').strip() + '\n')
            fh.close()
        except IOError:
            pass

    # Collect the URL of each department's teacher roster
    def teacher(self, path):
        lst = []
        for anchor in self.soup.find_all('a', {'class': 'yxcologe'}):
            lst.append(anchor['href'].encode('gb18030'))
        return lst

    # Collect every teacher on one roster page into the shared global lists
    def teacher_lst(self):
        # The department name is the last breadcrumb segment on the page
        yuanxi = str(self.soup.find('span', {'class': 't_dqwz'}))[-40:]
        yuanxi = yuanxi.split('»')[1].split('<')[0]
        for item in self.soup.find('span', {'class': 'TJszlist'}).find_all('li'):
            anchor = item.find('a')
            teacher_lst.append({'department': yuanxi, 'name': anchor['title']})
            teacher_comment_url.append(anchor['href'].encode('gb18030'))

    # Collect a teacher's name plus every comment left about that teacher
    def comment_teacher(self):
        infos = self.soup.find_all('span', {'class': 'TJR_info'})
        if not infos:
            return 'no comments'
        teacher_name = self.soup.find(color='#0088cc').text
        for info in infos:
            teacher_content = info.find('p', {'class': 'TJlycon'}).text
            teacher_time = info.find('span').string
            teacher_all_comment.append({'teacher_id': teacher_name,
                                        'comment': teacher_content,
                                        'time': teacher_time})
        # encoding= is a Python 2 json.dumps argument; ensure_ascii=False
        # keeps the Chinese text readable instead of \uXXXX escapes
        return json.dumps(teacher_all_comment, encoding="UTF-8", ensure_ascii=False)

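# Threading pattern: every URL is loaded into a Queue up front, then a fixed
# pool of workers drains it with get_nowait() and exits once it runs dry.
# The workers only append to the shared module-level lists; list.append is
# atomic under CPython's GIL, which is why no explicit locking is used here.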
class myThreads(threading.Thread):
    # Worker that drains department URLs and fills the shared teacher lists
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            if self.queue.empty():
                break
            try:
                url = self.queue.get_nowait()
                res_obj = LinksParser(http_req_get(url))
                res_obj.teacher_lst()
            except Exception, e:
                # Queue.Empty (lost the empty()/get_nowait() race) or a
                # failed request/parse: either way this worker retires
                break


class commentThreads(threading.Thread):
    # Worker that drains teacher URLs and collects their comments
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            if self.queue.empty():
                break
            try:
                url = self.queue.get_nowait()
                res_obj = LinksParser(http_req_get(url))
                test = res_obj.comment_teacher()
                # append ('ab'), not overwrite ('wb'): every worker writes
                # to the same file, and 'wb' would keep truncating it
                fh = open(real_path + '/teacher_comment_lst.txt', 'ab')
                fh.write(test)
                fh.close()
            except Exception, e:
                break

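# Per-school pipeline: fetch the department list and save it, then run two
# thread pools back to back: myThreads gathers every teacher from every
# department, commentThreads gathers every comment on every teacher.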
if __name__ == '__main__':
    # School IDs to crawl (they could also be taken from sys.argv[1])
    idlist = [11, 129, 70, 71]
    for i in idlist:
        i = str(i)
        thread_number = 50
        url = 'http://www.xuebang.com.cn/' + i + '/deptlist'
        try:
            urlobj = http_req_get(url)
            # Parse the school's department-list page
            response_obj = LinksParser(urlobj)
            # Folder that this school's files are saved into
            real_path = response_obj.createDaXueDir()
            response_obj.xueyuan(real_path)

            # Teacher-roster URL for each department of this school
            xi_to_teacher = response_obj.teacher(real_path)

            # Shared lists the worker threads append into
            teacher_lst = []          # every teacher, tagged with a department
            teacher_comment_url = []  # one comment-page URL per teacher

            # Stage 1: a thread pool walks the department rosters
            queue = Queue.Queue()
            for line in xi_to_teacher:
                queue.put(line)
            threads = []
            for _ in xrange(thread_number):
                threads.append(myThreads(queue))
            for t in threads:
                t.start()
            for t in threads:
                t.join()
            teacher_lst = json.dumps(teacher_lst, encoding="UTF-8", ensure_ascii=False)

            # Write the teacher list to a file
            try:
                fh = open(real_path + '/teacher_lst.txt', 'wb')
                fh.write(teacher_lst)
                fh.close()
            except IOError:
                pass

            # Stage 2: a thread pool walks each teacher's comment page
            teacher_all_comment = []
            comment_queue = Queue.Queue()
            for line_url in teacher_comment_url:
                comment_queue.put(line_url)

            comments = []
            for _ in xrange(thread_number):
                comments.append(commentThreads(comment_queue))
            for t in comments:
                t.start()
            for t in comments:
                t.join()
        except Exception:
            pass  # skip this school on any unrecoverable error
Original article: https://www.cnblogs.com/jsq16/p/6018396.html