Python爬取小猪短租全网数据

爬取时需要进行的操作:

1:输入你是要爬取国内的还是海外的,1表示国内,2表示海外;
2:然后输入你要爬取的城市名称,就可以了;

每个函数的功能:

choose_area函数根据你输入的是国内还是海外,输出不同的区域名称
url_list函数根据你输入的城市名称,查出该城市有多少房源,从而判断有几页数据;因为小猪短租网最多只显示13页数据,房源超过13页时也只显示13页的数据,所以需要做个判断
get_url函数根据你输入的城市名称和页码,来构建你输入的城市每页的url

get_html函数就是获取每页的html数据
get_zf_url函数根据每页的html数据来爬取每个租房的url链接
get_zf_message函数,通过传入的租房url链接来获取每个租房的信息

'''
获取每个城市的url
'''
import  re
import requests

url = 'http://jci.xiaozhustatic1.com/e17061601/xzjs?k=Front_Search&httphost=bj.xiaozhu.com'     # link the city names are fetched from

ser = input('输入你要查找的地区(1:国内;2:海外):')            # ask whether to look up domestic ('1') or overseas ('2') short-term rentals
html = requests.get(url).text       # download the text behind the URL above (runs at import time); choose_area() parses it

def choose_area(page_html=None, region=None):
    """Yield one ``{city_name: [city_pinyin, listing_count]}`` dict per city.

    The city records are read from ``citys[N]=new Array(...);`` statements.
    A record counts as overseas when it contains an ``H:MM``-like token
    (presumably a time-zone offset -- TODO confirm) and as domestic otherwise.

    Args:
        page_html: Text holding the ``citys[...]`` statements. Defaults to the
            module-level ``html`` downloaded at import time.
        region: ``'1'`` for domestic cities, ``'2'`` for overseas cities.
            Defaults to the module-level ``ser`` typed in by the user. Any
            other value yields nothing.

    Yields:
        dict: a single-entry dict mapping the city's Chinese name to
        ``[pinyin, listing_count]``; both list items are strings.

    Raises:
        IndexError: if a selected record lacks a Chinese name, a second
            lowercase word, or a number.
    """
    if page_html is None:
        page_html = html
    if region is None:
        region = ser
    if region not in ('1', '2'):
        return
    want_overseas = region == '2'

    # Raw strings keep the backslash escapes intact; without them the
    # patterns match literal "d"/"w"/"u" characters instead of digits, word
    # characters and the CJK range.
    record_re = re.compile(r'citys\[[0-9]\d*\]=new Array\((.*?)\);')
    time_re = re.compile(r'[0-9]\d*:[0-9]\d*')
    name_re = re.compile(r'[\u4E00-\u9FA5]+')
    word_re = re.compile(r'[a-z]\w*')
    number_re = re.compile(r'[0-9]\d*')

    # The first 29 records are skipped; presumably they are not city
    # entries -- TODO confirm against the live script.
    for record in record_re.findall(page_html)[29:]:
        is_overseas = time_re.search(record) is not None
        if is_overseas != want_overseas:
            continue
        city = name_re.findall(record)[0]        # Chinese city name
        city_jc = word_re.findall(record)[1]     # second lowercase word: pinyin
        city_zf = number_re.findall(record)[0]   # first number: listing count
        yield {city: [city_jc, city_zf]}

def get_url(city_jc, page):
    """Build the search-result URL for one page of one city.

    Args:
        city_jc: City pinyin used as the sub-domain.
        page: Page number placed in the ``p{}`` path segment.

    Returns:
        The formatted URL string.
    """
    template = 'http://{}.xiaozhu.com/search-duanzufang-p{}-0/'
    return template.format(city_jc, page)

# Work out how many result pages a city has from its listing count. The site
# shows at most 13 pages per city, so the page count is capped at 13.
def url_list(city_name, page_size=24, max_pages=13):
    """Yield the search-result page URLs for ``city_name``.

    Args:
        city_name: Chinese city name, matched against the keys of the dicts
            produced by ``choose_area()``.
        page_size: Number of listings per result page (24 by default).
        max_pages: Highest page number to yield (13 by default).

    Yields:
        str: one URL per result page, starting at page 1. Nothing is yielded
        when the city is not found or has no listings.
    """
    for city in choose_area():
        if city_name not in city:
            continue
        city_jc, listing_count = city[city_name]
        # Ceiling division: a partly filled last page is still a page, and a
        # city with fewer than ``page_size`` listings still has page 1.
        total_pages = -(-int(listing_count) // page_size)
        for page in range(1, min(total_pages, max_pages) + 1):
            yield get_url(city_jc, page)
import requests
from lxml import etree
import re

# Request headers passed to requests.get() by get_html(); only a desktop
# Chrome user-agent string is set.
headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36'
}

def get_html(url):
    """Fetch ``url`` and return the page text.

    The request is sent with the module-level ``headers``. When the status
    code is not 200 a message is printed and ``None`` is returned. Otherwise
    the response is decoded with its detected (apparent) encoding.
    """
    resp = requests.get(url, headers=headers)
    if resp.status_code != 200:
        print('没有获取到HTML')
        return None
    # Decode with the encoding detected from the body.
    resp.encoding = resp.apparent_encoding
    return resp.text

def get_zf_url(url):
    """Return the detail-page links found on one search-result page.

    Args:
        url: URL of a search-result page.

    Returns:
        list: the ``href`` values of the links under the ``page_list``
        element; an empty list when the page could not be fetched or parsed.
    """
    html = get_html(url)
    if not html:
        # get_html() returns None for a non-200 response; treat that (and an
        # empty body) as "no listings" instead of letting the parser raise.
        return []
    tree = etree.HTML(html)
    if tree is None:
        # NOTE(review): the parser may hand back None for content with no
        # usable markup -- confirm against the lxml version in use.
        return []
    return tree.xpath('//*[@id="page_list"]/ul/li/a/@href')

def get_zf_message(zf_url):
    """Scrape one listing's detail page into a dict.

    Args:
        zf_url: URL of the listing's detail page.

    Returns:
        dict: with the keys '标题' (title), '价格' (price), '地址' (address),
        '图片' (main image URL), '房东' (landlord name) and '房东链接'
        (landlord profile link).

    Raises:
        IndexError: if any of the expected elements is missing from the page.
        Exception: whatever ``etree.HTML`` raises when ``get_html`` returned
            no document.
    """
    html = get_html(zf_url)
    # Parse the document once and reuse the tree for every XPath query.
    tree = etree.HTML(html)
    area = tree.xpath('//div[@class="pho_info"]/p/@title')[0]
    h_image = tree.xpath('//*[@id="curBigImage"]/@src')[0]
    # The landlord anchor is read with a regex because an earlier XPath
    # attempt returned nothing.
    # NOTE(review): that attempt used a[class="lorder_name"] without the "@",
    # which may be why it found nothing -- confirm before switching to XPath.
    # One pattern captures both the link (group 1) and the name (group 2).
    landlord_re = re.compile(
        '<a class="lorder_name" href="(.*?)" title="(.*?)" target="_blank">.*?</a>'
    )
    fd_link, fd_name = landlord_re.findall(html)[0]
    zf_price = tree.xpath('//*[@id="pricePart"]/div[1]/span/text()')[0]
    zf_title = tree.xpath('//div[@class="pho_info"]/h4/em/text()')[0]
    return {
        '标题': zf_title,
        '价格': zf_price,
        '地址': area,
        '图片': h_image,
        '房东': fd_name,
        '房东链接': fd_link,
    }
from spider_ziaozu import *
from get_url import *


if __name__ == '__main__':
    # Ask for a city, then walk every search-result page of that city.
    target_city = input('输入你想爬取的城市名称:')
    for page_url in url_list(target_city):
        print(page_url)
        # Each result page gives the detail-page URLs of its listings.
        for listing_url in get_zf_url(page_url):
            print(listing_url)
            try:
                print(get_zf_message(listing_url))
            except Exception as err:
                # Best effort: report the failed listing and carry on.
                print(err)

下面是城市的数据:

domestic_list = [
{'北京': ['beijing', '8221']},
{'上海': ['shanghai', '6996']},
{'广州': ['guangzhou', '2727']},
{'成都': ['chengdu', '5369']},
{'深圳': ['shenzhen', '2522']},
{'西安': ['xian', '2562']},
{'南京': ['nanjing', '1675']},
{'杭州': ['hangzhou', '2455']},
{'重庆': ['chongqing', '3171']},
{'武汉': ['wuhan', '1901']},
{'苏州': ['suzhou', '1603']},
{'无锡': ['wuxi', '240']},
{'青岛': ['qingdao', '3712']},
{'厦门': ['xiamen', '1548']},
{'三亚': ['sanya', '2384']},
{'大连': ['dalian', '1034']},
{'哈尔滨': ['haerbin', '790']},
{'秦皇岛': ['qinhuangdao', '1924']},
{'天津': ['tianjin', '485']},
{'昆明': ['kunming', '819']},
{'香港': ['xianggang', '164']},
{'长春': ['changchun', '350']},
{'沈阳': ['shenyang', '562']},
{'合肥': ['hefei', '533']},
{'郑州': ['zhengzhou', '370']},
{'太原': ['taiyuan', '470']},
{'威海': ['weihai', '821']},
{'丽江': ['lijiang', '632']},
{'大理': ['dali', '576']},
{'桂林': ['guilin', '518']},
{'澳门': ['aomen', '33']},
{'福州': ['fuzhou', '410']},
{'宁波': ['ningbo', '233']},
{'珠海': ['zhuhai', '595']},
{'长沙': ['changsha', '727']},
{'石家庄': ['shijiazhuang', '288']},
{'拉萨': ['lasa', '47']},
{'常州': ['changzhou', '137']},
{'扬州': ['yangzhou', '224']},
{'东莞': ['dongguan', '70']},
{'海口': ['haikou', '518']},
{'兰州': ['lanzhou', '198']},
{'洛阳': ['luoyang', '195']},
{'乌鲁木齐': ['wulumuqi', '213']},
{'徐州': ['xuzhou', '51']},
{'贵阳': ['guiyang', '503']},
{'呼和浩特': ['huhehaote', '82']},
{'济南': ['jinan', '375']},
{'唐山': ['tangshan', '102']},
{'保定': ['baoding', '83']},
{'南昌': ['nanchang', '206']},
{'邯郸': ['handan', '12']},
{'南宁': ['nanning', '168']},
{'潍坊': ['weifang', '65']},
{'锦州': ['jinzhou', '54']},
{'日照': ['rizhao', '508']},
{'临沂': ['linyi', '41']},
{'鞍山': ['anshan', '23']},
{'廊坊': ['langfang', '101']},
{'大庆': ['daqing', '29']},
{'北海': ['beihai', '436']},
{'中山': ['zhongshan', '70']},
{'西宁': ['xining', '362']},
{'金华': ['jinhua', '71']},
{'丹东': ['dandong', '181']},
{'承德': ['chengde', '437']},
{'盘锦': ['panjin', '35']},
{'淄博': ['zibo', '30']},
{'株洲': ['zhuzhou', '17']},
{'佛山': ['foshan', '127']},
{'吉林': ['jilinshi', '50']},
{'邢台': ['xingtai', '9']},
{'齐齐哈尔': ['qiqihaer', '8']},
{'宜昌': ['yichang', '42']},
{'大同': ['datong', '83']},
{'烟台': ['yantai', '803']},
{'银川': ['yinchuan', '76']},
{'温州': ['wenzhou', '52']},
{'淮安': ['huaian', '37']},
{'绵阳': ['mianyang', '121']},
{'包头': ['baotou', '40']},
{'抚顺': ['fushun', '5']},
{'泰安': ['taian', '103']},
{'济宁': ['jining', '11']},
{'连云港': ['lianyungang', '33']},
{'泉州': ['quanzhou', '95']},
{'安阳': ['anyang', '24']},
{'惠州': ['huizhou', '537']},
{'葫芦岛': ['huludao', '595']},
{'嘉兴': ['jiaxing', '405']},
{'南通': ['nantong', '143']},
{'攀枝花': ['panzhihua', '15']},
{'柳州': ['liuzhou', '19']},
{'东营': ['dongying', '1']},
{'佳木斯': ['jiamusi', '5']},
{'通辽': ['tongliao', '5']},
{'德州': ['dezhou', '22']},
{'赣州': ['ganzhou', '6']},
{'滨州': ['binzhou', '3']},
{'咸阳': ['xianyang', '23']},
{'江门': ['jiangmen', '17']},
{'漳州': ['zhangzhou', '84']},
{'新乡': ['xinxiang', '8']},
{'襄樊': ['xiangfan', '4']},
{'南充': ['nanchong', '29']},
{'聊城': ['liaocheng', '17']},
{'张家口': ['zhangjiakou', '196']},
{'沧州': ['cangzhou', '22']},
{'石河子': ['shihezi', '4']},
{'宝鸡': ['baoji', '5']},
{'赤峰': ['chifeng', '22']},
{'湛江': ['zhanjiang', '41']},
{'商丘': ['shangqiu', '5']},
{'平顶山': ['pingdingshan', '4']},
{'信阳': ['xinyang', '13']},
{'九江': ['jiujiang', '29']},
{'营口': ['yingkou', '500']},
{'本溪': ['benxi', '6']},
{'钦州': ['qinzhou', '2']},
{'衡阳': ['hengyang', '19']},
{'汕头': ['shantou', '63']},
{'芜湖': ['wuhu', '18']},
{'呼伦贝尔': ['hulunbeier', '124']},
{'湘潭': ['xiangtan', '11']},
{'朝阳市': ['chaoyang', '2']},
{'清远': ['qingyuan', '137']},
{'遂宁': ['suining', '6']},
{'泰州': ['jstaizhou', '6']},
{'莆田': ['putian', '3']},
{'枣庄': ['zaozhuang', '14']},
{'泸州': ['luzhou', '52']},
{'舟山': ['zhoushan', '304']},
{'镇江': ['zhenjiang', '19']},
{'开封': ['kaifeng', '130']},
{'鄂尔多斯': ['eerduosi', '3']},
{'十堰': ['shiyan', '14']},
{'延边': ['yanbian', '75']},
{'淮北': ['huaibei', '5']},
{'临汾': ['linfen', '21']},
{'常德': ['changde', '5']},
{'荆州': ['jingzhou', '3']},
{'郴州': ['chenzhou', '46']},
{'德阳': ['deyang', '13']},
{'绍兴': ['shaoxing', '33']},
{'南阳': ['nanyang', '7']},
{'菏泽': ['heze', '1']},
{'台州': ['zjtaizhou', '36']},
{'遵义': ['zunyi', '7']},
{'阜新': ['fuxin', '2']},
{'盐城': ['yancheng', '6']},
{'宿迁': ['suqian', '2']},
{'焦作': ['jiaozuo', '16']},
{'长治': ['changzhi', '26']},
{'吉安': ['jian', '14']},
{'驻马店': ['zhumadian', '1']},
{'汉中': ['hanzhong', '28']},
{'河源': ['heyuan', '11']},
{'铁岭': ['tieling', '2']},
{'晋中': ['jinzhong', '63']},
{'安康': ['ankang', '4']},
{'岳阳': ['yueyang', '13']},
{'肇庆': ['zhaoqing', '15']},
{'衡水': ['hengshui', '21']},
{'牡丹江': ['mudanjiang', '24']},
{'安庆': ['anqing', '11']},
{'黄冈': ['huanggang', '2']},
{'娄底': ['loudi', '3']},
{'乐山': ['leshan', '187']},
{'蚌埠': ['bengbu', '14']},
{'昌吉': ['changji', '1']},
{'韶关': ['shaoguan', '28']},
{'阳江': ['yangjiang', '87']},
{'潮州': ['chaozhou', '15']},
{'张家界': ['zhangjiajie', '171']},
{'怀化': ['huaihua', '2']},
{'西双版纳': ['xishuangbanna', '141']},
{'三明': ['sanming', '9']},
{'运城': ['yuncheng', '15']},
{'眉山': ['meishan', '17']},
{'许昌': ['xuchang', '11']},
{'防城港': ['fangchenggang', '16']},
{'永州': ['yongzhou', '1']},
{'益阳': ['yiyang', '5']},
{'上饶': ['shangrao', '45']},
{'衢州': ['quzhou', '1']},
{'六盘水': ['liupanshui', '10']},
{'白山': ['baishan', '37']},
{'六安': ['luan', '1']},
{'铜陵': ['tongling', '1']},
{'池州': ['chizhou', '5']},
{'晋城': ['jincheng', '3']},
{'黄石': ['huangshi', '10']},
{'湘西': ['xiangxi', '24']},
{'宜春': ['jxyichun', '18']},
{'茂名': ['maoming', '1']},
{'梅州': ['meizhou', '2']},
{'凉山': ['liangshan', '330']},
{'宜宾': ['yibin', '22']},
{'湖州': ['huzhou', '296']},
{'海拉尔': ['hailaer', '2']},
{'延安': ['yanan', '5']},
{'内江': ['neijiang', '9']},
{'南平': ['nanping', '11']},
{'三门峡': ['sanmenxia', '2']},
{'松原': ['songyuan', '5']},
{'阜阳': ['fuyang', '3']},
{'黄山': ['huangshan', '105']},
{'巴彦淖尔': ['bayannaoer', '1']},
{'渭南': ['weinan', '8']},
{'咸宁': ['xianning', '12']},
{'恩施': ['enshi', '29']},
{'抚州': ['jxfuzhou', '5']},
{'龙岩': ['longyan', '13']},
{'通化': ['tonghua', '18']},
{'莱芜': ['laiwu', '1']},
{'宣城': ['xuancheng', '8']},
{'锡林郭勒': ['xilinguole', '18']},
{'景德镇': ['jingdezhen', '21']},
{'曲靖': ['qujing', '3']},
{'广元': ['guangyuan', '9']},
{'巴中': ['bazhong', '5']},
{'济源': ['jiyuan', '3']},
{'鹤岗': ['hegang', '2']},
{'黑河': ['heihe', '6']},
{'吕梁': ['lvliang', '3']},
{'天水': ['tianshui', '11']},
{'榆林': ['sxyulin', '4']},
{'萍乡': ['pingxiang', '4']},
{'哈密': ['hami', '7']},
{'自贡': ['zigong', '23']},
{'阿坝': ['aba', '260']},
{'宁德': ['ningde', '20']},
{'马鞍山': ['maanshan', '1']},
{'阿拉善': ['alashan', '12']},
{'阳泉': ['yangquan', '2']},
{'新余': ['xinyu', '1']},
{'喀什': ['kashi', '2']},
{'黔西南': ['qianxinan', '14']},
{'鸡西': ['jixi', '1']},
{'伊春': ['hljyichun', '24']},
{'大兴安岭': ['daxinganling', '5']},
{'宿州': ['ahsuzhou', '2']},
{'梧州': ['wuzhou', '2']},
{'阿克苏': ['akesu', '1']},
{'汕尾': ['shanwei', '12']},
{'广安': ['guangan', '4']},
{'资阳': ['ziyang', '3']},
{'安顺': ['anshun', '7']},
{'黔东南': ['qiandongnan', '15']},
{'七台河': ['qitaihe', '1']},
{'河池': ['hechi', '4']},
{'张掖': ['zhangye', '27']},
{'酒泉': ['jiuquan', '113']},
{'陇南': ['longnan', '4']},
{'神农架': ['shennongjia', '12']},
{'克拉玛依': ['kelamayi', '4']},
{'伊犁': ['yili', '19']},
{'雅安': ['yaan', '8']},
{'甘孜': ['ganzi', '94']},
{'丽水': ['lishui', '39']},
{'瓦房店': ['wafangdian', '2']},
{'武夷山': ['wuyishan', '34']},
{'亳州': ['bozhou', '1']},
{'贺州': ['hezhou', '6']},
{'石嘴山': ['shizuishan', '1']},
{'中卫': ['zhongwei', '20']},
{'平凉': ['pingliang', '1']},
{'铜川': ['tongchuan', '3']},
{'昭通': ['zhaotong', '1']},
{'巴音郭楞': ['bayinguoleng', '2']},
{'日喀则': ['rikaze', '4']},
{'铜仁': ['tongren', '6']},
{'忻州': ['xinzhou', '15']},
{'吴忠': ['wuzhong', '1']},
{'玉树': ['yushu', '1']},
{'海西': ['haixi', '11']},
{'玉溪': ['yuxi', '11']},
{'红河': ['honghe', '7']},
{'德宏': ['dehong', '8']},
{'吐鲁番': ['tulufan', '2']},
{'黔南': ['qiannan', '9']},
{'张北': ['zhangbei', '3']},
{'鹤壁': ['hebi', '1']},
{'五指山': ['wuzhishan', '4']},
{'兴安': ['xingan', '6']},
{'嘉峪关': ['jiayuguan', '20']},
{'商洛': ['shangluo', '1']},
{'海东': ['haidong', '5']},
{'海北': ['haibei', '17']},
{'随州': ['suizhou', '1']},
{'保山': ['baoshan', '25']},
{'楚雄': ['chuxiong', '2']},
{'普洱': ['puer', '3']},
{'文山': ['wenshan', '1']},
{'迪庆': ['diqing', '14']},
{'和田': ['hetian', '1']},
{'阿拉尔': ['alaer', '1']},
{'文昌': ['wenchang', '39']},
{'琼海': ['qionghai', '30']},
{'儋州': ['danzhou', '1']},
{'万宁': ['wanning', '17']},
{'东方': ['dongfang', '5']},
{'安定': ['anding', '1']},
{'澄迈': ['chengmai', '7']},
{'临高': ['lingao', '1']},
{'白沙': ['baisha', '2']},
{'昌江': ['changjiang', '10']},
{'乐东': ['ledong', '5']},
{'陵水': ['lingshui', '60']},
{'保亭': ['baoting', '2']},
{'琼中': ['qiongzhong', '1']},
{'长白山': ['changbaishan', '113']},
{'台北': ['taibei', '14']},
{'新北': ['xinbei', '4']},
{'台中': ['taizhong', '9']},
{'高雄': ['gaoxiong', '2']},
{'新竹': ['xinzhu', '1']},
{'嘉义': ['jiayi', '1']},
{'花莲乡': ['hualianxiang', '20']},
{'台东县': ['taidongxian', '1']},
{'澎湖县': ['penghuxian', '1']},
]

overseas_list = [
{'墨尔本': ['moerben', '4']},
{'悉尼': ['xini', '3']},
{'维多利亚': ['weiduoliya', '2']},
{'巴黎': ['bali', '856']},
{'巴厘岛': ['balidaobalidao', '1']},
{'佛罗伦萨': ['foluolunsa', '1']},
{'名古屋': ['nagoya', '2']},
{'福冈': ['fugang', '1']},
{'吉隆坡': ['jilongpo', '19']},
{'马累': ['malei', '1']},
{'奥克兰': ['aokelan', '4']},
{'圣彼得堡': ['shengbidebao', '1']},
{'普吉岛': ['phuket', '5']},
{'芭堤雅': ['badiya', '10']},
{'伦敦': ['lundun', '1']},
{'旧金山': ['jiujinshan', '1']},
{'拉斯维加斯': ['lasiweijiasi', '1']},
{'文莱': ['wenlai', '4']},
{'弗雷德里顿': ['fuleidelidun', '1']},
{'杜塞尔多夫': ['dusaierduofu', '3']},
{'雅加达': ['yajiada', '1']},
{'埼玉': ['qiyu', '2']},
{'广岛': ['guangdao', '1']},
{'千叶': ['qianye', '2']},
{'': ['jie', '1']},
{'相模原': ['xiangmoyuan', '1']},
{'船桥': ['chuanqiao', '1']},
{'东大阪': ['dongdaban', '12']},
{'暹粒': ['xianli', '3']},
{'哥打基纳巴鲁': ['gedajinabalu', '4']},
{'奥兰多': ['aolanduo', '3']},
{'圣何塞': ['shenghs', '1']},
{'立川': ['lichuan', '1']},
{'调布': ['diaobu', '1']},
{'日野': ['riye', '1']},
{'马塔兰': ['mataram', '1']},
]
View Code

 因为每个城市的房源时刻都在变化,所以以上数据不一定准确

原文地址:https://www.cnblogs.com/114811yayi/p/7061674.html