Python爬虫(八)

源码:

 1 import requests
 2 import re
 3 from my_mysql import MysqlConnect
 4 import time,random
 5 
 6 
 7 # 获取招聘详情链接
 8 def get_urls(page, headers):
 9     url = 'https://hr.tencent.com/position.php?lid=&tid=&keywords=python&start=page'.format(page)
10     response = requests.get(url, headers=headers)
11     pat = r'href="(position_detail.*?)">'
12     url_list_bytes = re.findall(pat.encode('utf-8'), response.content)
13     return url_list_bytes
14 
# Return the first capture group of `pattern` in `html_bytes`, decoded as UTF-8.
def _search_text(pattern, html_bytes):
    # The page is searched as raw bytes, so the str pattern is encoded first.
    # ``re.search`` returns None when nothing matches, in which case the
    # ``.group`` call raises AttributeError.
    return re.search(pattern.encode('utf-8'), html_bytes).group(1).decode('utf-8')


# Return the <li> texts of the "squareli" list that follows `heading`, one per line.
def _list_items(heading, html_bytes):
    list_html = _search_text(heading + r'.*?<ul class="squareli">(.*?)</ul>', html_bytes)
    return '\n'.join(re.findall(r'<li>(.*?)</li>', list_html))


# Fetch one job-detail page and extract its fields.
def get_info(url, headers):
    """Download a job-detail page and extract six text fields from its HTML.

    Args:
        url: Absolute URL of the detail page.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        A tuple ``(title, address, types, counts, duty, requires)`` of
        ``str``. ``duty`` and ``requires`` contain the text of every
        ``<li>`` item in their list, joined with newline characters.

    Raises:
        AttributeError: If any of the expected fields is not found in the
            page, because ``re.search`` then returns ``None``.
    """
    html_bytes = requests.get(url, headers=headers).content

    # Fields are extracted in the same order as they are returned, so the
    # first missing field is the one that raises.
    title = _search_text(r'id="sharetitle">(.*?)</td>', html_bytes)
    address = _search_text(r'工作地点:</span>(.*?)</td>', html_bytes)
    types = _search_text(r'职位类别:</span>(.*?)</td>', html_bytes)
    counts = _search_text(r'招聘人数:</span>(.*?)</td>', html_bytes)
    duty = _list_items(r'工作职责', html_bytes)
    requires = _list_items(r'工作要求', html_bytes)
    return title, address, types, counts, duty, requires
52 
53 
if __name__ == '__main__':
    # Walk the search-result pages, scrape every linked job posting and hand
    # each record to the database helper.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    sql = "insert into tencentzp(title,address,types,counts,duty,requires) values(%s,%s,%s,%s,%s,%s)"
    # NOTE(review): MysqlConnect comes from the project-local my_mysql module;
    # the arguments look like host, user, password, database - confirm there.
    mc = MysqlConnect('127.0.0.1', 'root', '123456', 'homework')

    # ``page`` takes the values 0, 10, ..., 190.
    for page in range(0, 200, 10):
        for link in get_urls(page, headers):
            # get_urls yields relative links as bytes; build the absolute URL.
            detail_url = 'https://hr.tencent.com/' + link.decode('utf-8')
            info = get_info(detail_url, headers)
            print(info)
            # The six-element tuple supplies the six %s placeholders in ``sql``.
            mc.exec_data(sql, info)
            # Pause for a random interval below five seconds between postings.
            time.sleep(random.random() * 5)
原文地址:https://www.cnblogs.com/zhxd-python/p/9501321.html