# Python: scraping Lianjia (链家) housing listings

import json

import  requests
from lxml import etree
from time import sleep


# Entry page; it is fetched once here only to read the pagination metadata.
url = "https://sz.lianjia.com/ershoufang/rs/"
headers = {
    # NOTE(review): the User-Agent is blank in this file; presumably a real
    # browser UA string is meant to be filled in here -- confirm.
    "User-Agent": "",
    # The standard HTTP request header is spelled "Referer"; the previous key
    # "Refer" is not a header servers recognize.
    "Referer": "https://sz.lianjia.com/ershoufang/pg2/",
}
# A timeout keeps the script from blocking forever on an unresponsive server.
resp = requests.get(url, headers=headers, timeout=10)

# Template for the numbered result pages; `{}` receives the page number.
base_url = "https://sz.lianjia.com/ershoufang/pg{}/"
html = etree.HTML(resp.text)
# The pagination <div> carries a JSON string in its `page-data` attribute.
data = html.xpath('//*[@id="content"]//div[@class="page-box fr"]/div/@page-data')
if not data:
    # Fail with an explicit message instead of a bare IndexError on data[0].
    raise RuntimeError("pagination 'page-data' attribute not found at " + url)
data = json.loads(data[0])
totalPage = data['totalPage']
curPage = data['curPage']

def _first(values):
    """Return the first element of *values*, or None when it is empty."""
    return values[0] if values else None


def get_data(url):
    """Fetch one listing page and return its entries as a list of dicts.

    Every ``<li>`` under ``ul.sellListContent`` yields one dict with the keys
    ``face``, ``title``, ``position``, ``house_info``, ``follow_info``,
    ``price`` and ``unit_price``. The ``tag`` key is added only when the
    entry has at least one tag ``<span>``. A field whose XPath matches
    nothing is stored as ``None`` rather than raising ``IndexError``.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of dicts, one per ``<li>``, in document order.
    """
    # (output key, XPath relative to the <li>) for the always-present keys.
    field_xpaths = (
        ("face", './a/img/@src'),
        ("title", './/div[@class="title"]/a/text()'),
        ("position", './/div[@class="positionInfo"]/a/text()'),
        ("house_info", './/div[@class="houseInfo"]/text()'),
        ("follow_info", './/div[@class="followInfo"]/text()'),
        ("price", './/div[@class="priceInfo"]/div[@class="totalPrice"]/span/text()'),
        ("unit_price", './/div[@class="priceInfo"]/div[@class="unitPrice"]/span/text()'),
    )

    # A timeout keeps the crawl from hanging forever on an unresponsive server.
    resp = requests.get(url, headers=headers, timeout=10)
    html = etree.HTML(resp.text)

    listings = []
    for li in html.xpath('.//ul[@class="sellListContent"]/li'):
        content = {key: _first(li.xpath(path)) for key, path in field_xpaths}
        # Only the first tag is kept; the key is omitted when there is none.
        tag = li.xpath('.//div[@class="tag"]//span/text()')
        if tag:
            content['tag'] = tag[0]
        listings.append(content)
    return listings

# Crawl every result page in order and accumulate all listings.
totalList = []
for i in range(1, totalPage + 1):
    url = base_url.format(i)
    print("crawl url  " + url)
    cur_list = get_data(url)
    print(cur_list)
    # Extend in place: rebuilding the list with `+` re-copies all earlier
    # results on every page, which is quadratic in the number of listings.
    totalList.extend(cur_list)

# Leaves `url` pointing at the first page, as before; the only statement
# after this one does not read it.
url = base_url.format(1)

print(totalList)

# Original article: https://www.cnblogs.com/brady-wang/p/12491105.html