python抓取链家房源信息

闲着没事就抓取了下链家网的房源信息,抓取的是北京二手房的信息情况,然后通过网址进行分析,有100页,并且每页的url都是类似的

url = 'https://bj.lianjia.com/ershoufang/pg' + 页数,然后请求是 get 请求,所以是静态页面,然后依次来进行分析,并且存储在 MongoDB 中,每次插入的时候还是要先把字符串转换成 JSON 格式再进行插入,页面的解析用的是 BeautifulSoup,解析很方便,代码用的是单进程,耗时大致 66s,因为怕 IP 被封,所以在每次页面请求之后都要 sleep 1 秒。

#-*-coding:utf-8-*-
import urllib
import urllib2
import re
import requests
import json
import lxml
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient

from lxml import etree
# MongoDB connection: scraped documents are inserted into the `House`
# collection of the local `test` database.
client = MongoClient('localhost',27017)
db = client.test
House = db.House
# Browser-like request headers so Lianjia serves the normal HTML page.
# NOTE(review): the Cookie value is a placeholder ('......') — a real
# session cookie must be filled in before running, or the site may
# reject/redirect the requests.
headers = {
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding':'gzip, deflate, br',
    'Accept-Language':'zh-CN,zh;q=0.9',
    'Cache-Control':'max-age=0',
    'Connection':'keep-alive',
    'Cookie':'......',
    'Host':'bj.lianjia.com',
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}
# Base URL of the Beijing second-hand-house listing; the page number
# (1..100) is appended to form each page URL.
URL = 'https://bj.lianjia.com/ershoufang/pg'
def download(url):
    """Fetch *url* with the shared crawler headers and return the body text.

    Makes up to two attempts; returns None when both fail.

    Bug fixed: the original caught ``urllib2.URLError``, which
    ``requests.get`` never raises, so network errors escaped uncaught and
    the retry loop was dead code. We now catch
    ``requests.exceptions.RequestException`` (the base class for all
    requests failures) and pass a timeout so a stalled connection cannot
    hang the crawl forever.
    """
    num_try = 2
    while num_try > 0:
        num_try -= 1
        try:
            # timeout: fail fast instead of blocking indefinitely
            response = requests.get(url, headers=headers, timeout=10)
            # treat HTTP error statuses (403/500/...) as failures too,
            # so they also get a retry instead of returning an error page
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print('Download error: %s' % e)

    return None


def get_message(url):
    """Parse one listing page and insert one document per house into MongoDB.

    For each listing the total price (from the ``priceInfo`` div) and the
    '|'-separated ``houseInfo`` fields (address, house type, area,
    orientation, decoration, and an optional elevator field) are combined
    into a dict and inserted into the ``House`` collection.

    Bugs fixed versus the original:
    - the hand-built JSON string literal had unescaped inner quotes (a
      syntax error), and string-interpolated JSON would break on any field
      containing a quote — we now build the dict directly, no json round-trip;
    - fields were indexed up to ``List[4]`` *before* checking the list
      length, raising IndexError on rows with fewer fields — missing
      trailing fields are now padded with the string "None";
    - a failed download (``download`` returning None) crashed
      BeautifulSoup — the page is now skipped instead.
    """
    html = download(url)
    if html is None:
        # download already retried and reported the error; skip this page
        return

    soup = BeautifulSoup(html, 'html.parser')
    total_price = [div.span.string for div in soup.find_all('div', 'priceInfo')]

    houses = []
    for info in soup.find_all('div', attrs={'class': 'houseInfo'}):
        fields = [part.strip() for part in info.get_text().split('|')]
        # the elevator field (and occasionally others) may be absent;
        # pad to the 6 expected fields with the literal string "None"
        while len(fields) < 6:
            fields.append("None")
        houses.append(fields[:6])

    for fields, price in zip(houses, total_price):
        addres, house_type, area, toward, decorate, elevate = fields
        message = {
            "Address": addres,
            "House_type": house_type,
            "Area": area,
            "Price": price,
            "Toward": toward,
            "Decorate": decorate,
            # NOTE(review): key kept misspelled ("Elevete") for
            # compatibility with existing documents in the collection
            "Elevete": elevate,
        }
        print(message)
        House.insert(message)

if __name__ == '__main__':
    # Crawl all 100 result pages sequentially and report the time spent
    # actually scraping (total wall time minus the politeness sleeps).
    start = time.time()
    print(start)
    page = 1
    while page <= 100:

        page_url = URL + str(page)
        print(page_url)
        get_message(page_url)
        # be polite: one-second pause between requests to avoid an IP ban
        time.sleep(1)
        page += 1
    finish = time.time()
    print('Total time:')
    # subtract the 100 seconds spent sleeping (1s per page)
    print(finish - start - 100)
原文地址:https://www.cnblogs.com/chenyang920/p/7842495.html