Scraping and Analyzing Guangzhou Property Listings: Analyzing the Problems

The code in the previous post still has some real problems:

1. It runs sequentially, so crawling is slow; 2. it cannot resume after an interruption.

So what can we do about them?

For problem 1, the crawler can be restructured with the producer-consumer pattern; the rewritten code is below.

# -*- coding: utf-8 -*-
#######################################################################
# Copyright (C) 2005-2016 UC Mobile Limited. All Rights Reserved
# File          : first_sale_spider.py
#
# Creation      : 2016/2/23 19:41
# Author        : shufeng.lsf@ucweb.com
#######################################################################
import random
from threading import Thread

import requests
import re

import time
from pyquery import PyQuery as pq
from Queue import Queue
import MySQLdb
import uniout  # makes Chinese strings print readably under Python 2
import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 hack to avoid UnicodeDecodeError with Chinese text

HOST = "127.0.0.1"
USER = "root"
PASSWD = ""
DB = "house_analysis"
PORT = 3306

queue = Queue(10)  # bounded queue: put() blocks once 10 URLs are waiting, throttling the producer

class DBOperate(object):
    def __init__(self, host, user, passwd, db, port, charset="utf8"):
        self.host = host
        self.user = user
        self.passwd = passwd
        self.db = db
        self.port = port
        self.conn = MySQLdb.connect(host=self.host, user=self.user, passwd=self.passwd, db=self.db, port=self.port, charset=charset)
        self.cur = self.conn.cursor()

    def insertSql(self, sql, params=None):
        # Let the driver escape the values so quotes in scraped text cannot break the statement
        self.cur.execute(sql, params)
        self.conn.commit()

    def __del__(self):
        self.cur.close()
        self.conn.close()


def requestByGet(url):
    r = requests.get(url)
    return r.content

def getNextPage(content):
    # The anchor text 下一页 means "next page"; it must stay in Chinese to match the site's HTML
    m = re.findall(r'<a href="(.+?)" class="next-page next-link">下一页</a>', content)
    if len(m)>0:
        next_url = m[0]
    else:
        next_url = ''
    return next_url

def getCommunityList(content):
    # Collect the detail-page links of the developments listed on this page
    community_urls = re.findall(r'data-link="(http://gz.fang.anjuke.com/loupan/\d+?\.html)"', content)
    print "collected:", community_urls
    # findall returns [] when nothing matches, so callers can iterate safely
    return community_urls


def getHouseInfo(url):
    # PyQuery fetches and parses the detail page itself when given a URL
    p = pq(url)
    name = p('h1').text().strip()
    style = p('.house-item').text().split(",")[0].strip()
    price = p('.sp-price').text().strip()
    l = p('.lpAddr-text').text()
    location = re.split('[ | ]', l)
    area = location[-2].split('-')[0].strip()
    zone = location[-2].split('-')[1].strip()
    detail_location = location[-1].strip()
    result = {
        "name": name,
        "area": area,
        "location": zone,
        "detail_location": detail_location,
        "house_style": style,
        "price": price
    }
    return result


def detailPageHandler(cur, detail_url):
    result = getHouseInfo(detail_url)
    print "result:", result
    # Placeholders let MySQLdb escape the values (see insertSql above)
    cur.insertSql(
        "insert into first_sale (name,area,location,detail_location,house_style,price) VALUES (%s,%s,%s,%s,%s,%s)",
        (
            result['name'],
            result['area'],
            result['location'],
            result['detail_location'],
            result['house_style'],
            result['price'],
        )
    )

class UrlProducer(Thread):
    def __init__(self, start_url):
        Thread.__init__(self)
        self.start_url = start_url

    def run(self):
        global queue
        while True:
            content = requestByGet(self.start_url)
            next_url = getNextPage(content)
            community_urls = getCommunityList(content)
            for url in community_urls:
                queue.put(url)  # blocks while the queue is full
                time.sleep(random.random())
                print "enqueued url:", url
            if next_url != '':
                self.start_url = next_url
            else:
                break

class GetHouseInfo(Thread):
    def __init__(self, cur):
        Thread.__init__(self)
        self.cur = cur

    def run(self):
        global queue
        while True:
            url = queue.get()  # blocks until the producer enqueues a URL
            detailPageHandler(self.cur, url)
            queue.task_done()
            time.sleep(random.random())
            print "finished url:", url


def main():
    cur = DBOperate(host=HOST, user=USER, passwd=PASSWD, db=DB, port=PORT)
    UrlProducer("http://gz.fang.anjuke.com/loupan/?from=navigation").start()  # producer: walks the list pages
    GetHouseInfo(cur).start()  # consumer: fetches each detail page and writes it to MySQL


if __name__ == '__main__':
    main()

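One caveat: main() above starts a single consumer, so detail pages are still processed one at a time and the pattern gains little. The usual fix is to drain the queue with several consumers. Here is a minimal sketch of that variation; the thread count of 4 is an arbitrary choice, and since a MySQLdb connection should not be shared across threads, each consumer opens its own DBOperate:

def main():
    UrlProducer("http://gz.fang.anjuke.com/loupan/?from=navigation").start()
    # Several consumers drain the queue concurrently; 4 is an arbitrary choice.
    # MySQLdb connections are not safe to share across threads,
    # so each consumer gets its own DBOperate connection.
    for _ in range(4):
        GetHouseInfo(DBOperate(host=HOST, user=USER, passwd=PASSWD, db=DB, port=PORT)).start()
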
2. For the problem of not being able to resume after an interruption, we can catch exceptions, save the URL that was being processed to a file, and on the next run read it back from that file and continue from there. A sketch of the idea follows.
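
A minimal sketch of that checkpoint idea, assuming a plain-text file (the filename "checkpoint.txt" and the wrapper name process_with_checkpoint are just illustrative):

import os

CHECKPOINT_FILE = "checkpoint.txt"  # illustrative path

def load_checkpoint():
    # Returns the URL saved by an interrupted run, or '' on a fresh start
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE) as f:
            return f.read().strip()
    return ''

def save_checkpoint(url):
    with open(CHECKPOINT_FILE, "w") as f:
        f.write(url)

def process_with_checkpoint(cur, url):
    try:
        detailPageHandler(cur, url)
    except Exception:
        # Persist the failing URL so the next run can resume from it
        save_checkpoint(url)
        raise

At startup, main() would call load_checkpoint() and, if it returns a URL, start from there instead of the first page; deleting the file after a successful retry prevents the same URL from being replayed forever.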

Original post: https://www.cnblogs.com/alexkn/p/5225744.html