使用python获取房价信息

从贝壳网获取房价信息。

基本步骤与我的这篇博文相同:https://www.cnblogs.com/mrlayfolk/p/12319414.html ,不熟悉的读者可以先参考一下。

下面的代码共抓取100页搜索结果,用于获取约3000个样本。

 1 # encoding:utf-8
 2 
 3 '''
 4 目的:从贝壳找房中爬取房价信息。网址:https://cd.ke.com/ershoufang/qingyang/l2/
 5 环境:python 3.7.3
 6 所需的库:requests、BeautifulSoup、xlwt
 7 '''
 8 
 9 import logging
10 import xlwt
11 import requests
12 import string
13 from bs4 import BeautifulSoup
14 
# HTTP request headers sent with every page fetch: a desktop Chrome
# User-Agent string plus an explicit Host header for cd.ke.com.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/79.0.3945.130 Safari/537.36"
    ),
    "Host": "cd.ke.com",
}
19 
20 # 将获取的信息保存到表格中
def save_info(content, path='./house_info.xls'):
    """Write the scraped listing rows to an Excel (.xls) workbook.

    The sheet is named 'house info'. Row 0 holds the five column headings;
    each record in ``content`` is written to the following rows in order.

    Args:
        content: Iterable of records. Each record must be indexable and hold
            at least five items, in the order: title, location, house
            details, total price, unit price.
        path: Destination file. Defaults to ``./house_info.xls``, the
            location the script has always written to.

    Raises:
        IndexError: If a record has fewer than five items.
    """
    column_titles = ('名称', '位置', '房屋信息', '总价(万)', '单价(元/平方米)')

    # NOTE(review): the workbook encoding is believed to matter only when
    # byte strings are written; utf-8 is declared because the headings and
    # data are Chinese text - confirm against the xlwt documentation.
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('house info')

    for col, heading in enumerate(column_titles):
        worksheet.write(0, col, heading)

    # Data starts on row 1, directly below the heading row.
    for row_no, record in enumerate(content, start=1):
        for col in range(len(column_titles)):
            worksheet.write(row_no, col, record[col])

    workbook.save(path)
41 
42 
43 # 获取房屋相关的信息
 44 # 主要包括:title position houseinfo totalprice unitprice
def _parse_page(html, page):
    """Extract the listing records contained in one result page.

    Args:
        html: Raw HTML text of the result page.
        page: Page number, used only in the warning message.

    Returns:
        A list of ``[title, position, house_info, total_price, unit_price]``
        lists of strings, one per listing found on the page.
    """
    soup = BeautifulSoup(html, 'lxml')

    titles = [
        node.div.a.text.strip()
        for node in soup.find_all('div', {'class': 'info clear'})
    ]
    positions = [
        node.a.text.strip()
        for node in soup.find_all('div', {'class': 'positionInfo'})
    ]
    # Drop every newline and space so the details collapse into one token.
    houses = [
        node.text.strip().replace('\n', ' ').replace(' ', '')
        for node in soup.find_all('div', {'class': 'houseInfo'})
    ]
    total_prices = [
        node.span.text.strip()
        for node in soup.find_all('div', {'class': 'totalPrice'})
    ]
    # Keep only the number: strip the "单价" label and the "元/平米" unit.
    unit_prices = [
        node.span.text.strip().replace('单价', '').replace('元/平米', '')
        for node in soup.find_all('div', {'class': 'unitPrice'})
    ]

    columns = (titles, positions, houses, total_prices, unit_prices)
    counts = [len(column) for column in columns]
    if len(set(counts)) > 1:
        # The fields are matched purely by position, so unequal counts mean
        # some records on this page cannot be paired reliably.
        logging.warning(
            'page %d: field counts differ %s; unmatched entries are dropped',
            page, counts,
        )
    # zip() stops at the shortest column, so a missing field can no longer
    # raise IndexError or shift the records of later pages.
    return [list(record) for record in zip(*columns)]


def get_info(pages=100):
    """Scrape second-hand housing listings from cd.ke.com (Qingyang, filter l2).

    Fetches result pages 1..``pages`` and collects, for every listing, its
    title, position, house details, total price and unit price.

    Args:
        pages: Number of result pages to fetch, starting at page 1.
            Defaults to 100.

    Returns:
        A list of ``[title, position, house_info, total_price, unit_price]``
        lists of strings, in page order.

    Raises:
        Exceptions raised by ``requests.get`` (for example on timeout) are
        not caught and propagate to the caller.
    """
    all_info = []

    # Page numbers in the URL are 1-based, matching the progress output.
    for page in range(1, pages + 1):
        link = 'https://cd.ke.com/ershoufang/qingyang/pg%dl2/' % page
        r = requests.get(link, headers=headers, timeout=10)
        print(page, 'status_code: ', r.status_code)
        if r.status_code != 200:
            # An error page holds no usable listings; skip it explicitly.
            logging.warning('page %d skipped: HTTP %s', page, r.status_code)
            continue
        all_info.extend(_parse_page(r.text, page))

    print(len(all_info))
    return all_info
88 
89 
if __name__ == "__main__":
    # Scrape all pages first, then write the collected rows to the workbook.
    save_info(get_info())
原文地址:https://www.cnblogs.com/mrlayfolk/p/12329049.html