爬虫小程序之爬取链家二手房

利用Python相关工具爬取链家二手房中的各房源信息,包含小区名称、户型、面积、装修状况和总价

要求:

  1、使用工具:urllib.request(请求)、re(解析)、csv(存储)

  2、编程范式:面向对象

  3、反反爬机制:利用time和random控制请求频次、伪装请求头User-Agent

代码如下:

  

from urllib import request
import re
import csv
import time
import random


class LianjiaSpider(object):
    """Crawl Lianjia second-hand housing list pages into a CSV file.

    For each page the spider downloads the HTML, extracts the listings with
    a regular expression and appends one CSV row per listing holding:
    community name, layout, area, decoration status and total price.

    Args:
        pages: Number of list pages to crawl, starting from page 1.
        output_file: Path of the CSV file that rows are appended to.
        timeout: Seconds to wait on each HTTP request before giving up, so
            a stalled connection cannot hang the crawl indefinitely.
    """

    # Captures (community name, remaining house-info text, total price).
    # re.S lets '.' match newlines so one match may span several lines.
    # Compiled once here instead of on every parsed page.
    HOUSE_PATTERN = re.compile(
        r'<div class="houseInfo".*?data-el="region">(.*?)</a>(.*?)</div>.*?<div class="totalPrice".*?<span>(.*?)</span>',
        re.S)

    def __init__(self, pages=1, output_file='lianjia_ershoufang.csv',
                 timeout=30):
        self.base_url = 'https://hf.lianjia.com/ershoufang/pg{}'
        # A browser-like User-Agent is sent with every request.
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.pages = pages
        self.output_file = output_file
        self.timeout = timeout

    def get_page(self, url):
        """Download *url*, decode it as UTF-8 and hand the HTML to parse_page.

        Raises:
            OSError: Propagated from urllib on network/HTTP errors or when
                the request exceeds ``self.timeout``.
        """
        req = request.Request(url, headers=self.headers)
        # The context manager closes the connection even if read() fails.
        with request.urlopen(req, timeout=self.timeout) as res:
            html = res.read().decode('utf-8')
        self.parse_page(html)

    def parse_page(self, html):
        """Extract the raw listing tuples from *html* and save them."""
        house_list = self.HOUSE_PATTERN.findall(html)
        for house in house_list:
            # Echo the raw '|'-separated fields of each listing.
            print(house[1].split('|'))
        self.save_csv(house_list)

    @staticmethod
    def _clean_house(house):
        """Turn one raw regex match into a CSV row.

        Args:
            house: Tuple of (community name, house-info text, total price).
                The house-info text is split on '|'; layout, area and
                decoration status are read from indexes 1, 2 and 4.

        Returns:
            A tuple (name, layout, area, decoration, total price) with
            surrounding whitespace removed, or None when the house-info
            text has fewer than five '|'-separated fields.
        """
        fields = [field.strip() for field in house[1].split('|')]
        if len(fields) < 5:
            return None
        area = fields[2]
        # Drop the unit suffix only when it is really there, so an area
        # without it does not lose its last two characters.
        if area.endswith('平米'):
            area = area[:-2]
        return (house[0].strip(), fields[1], area, fields[4],
                house[2].strip())

    def save_csv(self, house_list):
        """Append the cleaned rows of *house_list* to ``self.output_file``.

        Listings that cannot be cleaned are skipped (and counted in a
        printed notice) rather than aborting the whole crawl.
        """
        cleaned = [self._clean_house(house) for house in house_list]
        rows = [row for row in cleaned if row is not None]
        skipped = len(cleaned) - len(rows)
        if skipped:
            print('skipped %d malformed listing(s)' % skipped)

        with open(self.output_file, 'a+', encoding='utf-8', newline='') as f:
            csv.writer(f).writerows(rows)

    def main(self):
        """Crawl pages 1..``self.pages``, pausing randomly between requests."""
        for page in range(1, self.pages + 1):
            self.get_page(self.base_url.format(page))
            print('第%d页下载成功' % page)
            # Random 1-10 s pause to limit request frequency; pointless
            # after the final page, so it is skipped there.
            if page < self.pages:
                time.sleep(random.randint(1, 10))


if __name__ == '__main__':
    # The CSV is opened in append mode so rows from earlier runs are kept.
    # Write the header only when the file is still empty; otherwise every
    # run would insert another header row in the middle of the data.
    with open('lianjia_ershoufang.csv', 'a+', encoding='utf-8', newline='') as f:
        # seek(0, 2) moves to the end of the file and returns that position,
        # which is 0 only for a new or empty file.
        if f.seek(0, 2) == 0:
            writer = csv.writer(f)
            writer.writerow(['小区名称', '户型', '面积', '装修状况', '总价'])
    # Crawl the first 100 list pages.
    spider = LianjiaSpider(100)
    spider.main()

  

原文地址:https://www.cnblogs.com/yuxiangyang/p/11093471.html