Python小爬虫

  自己琢磨写了一个Python的小爬虫,用来爬学校的招聘信息,以下是代码(基于 Python 2,使用 urllib2 和 re 模块)。

 1 __author__ = 'WCQ'
 2 # -*- coding: utf-8 -*-
 3 
 4 import urllib2
 5 import urllib
 6 import re
 7 import thread
 8 import time
 9 
10 
11 #----------- 加载招聘信息 -----------
class Spider_Model:
    """Fetch job-listing pages from job.ustc.edu.cn and print the postings.

    Pages are requested one by one, starting at ``page`` and stopping
    before ``endPage`` (exclusive). With the defaults only page 1 is
    processed.
    """

    # Site root; hrefs captured from the listing page are appended to it.
    _BASE_URL = "http://www.job.ustc.edu.cn/"

    # Patterns are compiled once. re.S lets '.' match newlines so that a
    # match may span several lines of HTML.
    _LIST_BLOCK_RE = re.compile('<div class="Joplistone">(.*?)</div>', re.S)
    _LIST_ITEM_RE = re.compile('<li><a href="(.*?)" style="color:#">(.*?)</a><span class="zhiwei">(.*?)</span><span class="zhuanye">(.*?)</span></li>', re.S)
    _DETAIL_RE = re.compile('<div class="textone">(.*?)</div>', re.S)

    def __init__(self, page=1, endPage=2):
        """Set the crawl range.

        page    -- first listing page to process (default 1).
        endPage -- page number at which to stop, not processed itself
                   (default 2).
        """
        self.page = page
        # Set to True by Start(); the crawl loop runs only while it is True.
        self.enable = False
        self.endPage = endPage

    def GetHTML(self, myUrl):
        """Download ``myUrl`` and return its body decoded from GBK.

        Errors raised by the URL library or by the decode step propagate
        to the caller.
        """
        # Local import with a fallback: the module is called urllib2 on
        # Python 2 and urllib.request on Python 3; both expose Request and
        # urlopen.
        try:
            import urllib2 as url_lib
        except ImportError:
            import urllib.request as url_lib

        headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        req = url_lib.Request(myUrl, headers=headers)
        myResponse = url_lib.urlopen(req)
        try:
            myPage = myResponse.read()
        finally:
            # Release the connection even when read() fails.
            myResponse.close()
        # decode() turns the raw GBK bytes into a unicode string.
        return myPage.decode("GBK")

    def GetPage(self, page):
        """Return the postings found on listing page number ``page``.

        Each posting is a list ``[link text, absolute URL, text of the
        "zhiwei" span, text of the "zhuanye" span]``. An empty list is
        returned when the page has no "Joplistone" container.

        NOTE(review): the original comments called these fields company,
        link, position and publish date; that meaning cannot be confirmed
        from the markup alone.
        """
        myUrl = self._BASE_URL + "list.php?trans=7&page=" + str(page) + "&MenuID=002002"
        unicodePage = self.GetHTML(myUrl)
        # Only the first "Joplistone" container is inspected.
        listBlock = self._LIST_BLOCK_RE.search(unicodePage)
        if listBlock is None:
            return []
        return [
            [title, self._BASE_URL + href, zhiwei, zhuanye]
            for href, title, zhiwei, zhuanye
            in self._LIST_ITEM_RE.findall(listBlock.group(1))
        ]

    def getJobDetail(self, joburl):
        """Return the contents of every "textone" div on the page at ``joburl``.

        The result is a list of strings and is empty when nothing matches.
        """
        return self._DETAIL_RE.findall(self.GetHTML(joburl))

    def getJobDetailList(self, jobs):
        """Return a copy of each posting in ``jobs`` with its detail appended.

        The detail is fetched from the posting's URL (element 1) and added
        as a fifth element.
        """
        return [
            [job[0], job[1], job[2], job[3], self.getJobDetail(job[1])]
            for job in jobs
        ]

    def showJob(self, page):
        """Print every field of every posting on listing page ``page``."""
        for jobDetail in self.getJobDetailList(self.GetPage(page)):
            for item in jobDetail:
                # Call form with a single argument prints the same text on
                # Python 2 and Python 3.
                print(item)

    def Start(self):
        """Process pages from ``self.page`` up to, not including, ``self.endPage``."""
        self.enable = True
        page = self.page
        # Logical 'and' (the original used bitwise '&' on two booleans).
        while self.enable and page < self.endPage:
            self.showJob(page)
            page += 1
77 
78 
# Script entry: print a heading, then crawl with the default page range.
# The call form of print with one argument is valid on Python 2 and 3 alike.
print(u'招聘内容:')
myModel = Spider_Model()
myModel.Start()

参考资料:[Python]网络爬虫(八):糗事百科的网络爬虫(v0.3)源码及解析(简化更新)

原文地址:https://www.cnblogs.com/focusonepoint/p/5701093.html