简单的大众点评爬虫

一个很简单的爬虫，爬取中山大学（中大）周边地点的点评信息。

# -*- coding: utf-8 -*-
import requests
import re
import time

def placeSplider(name, star, url):
    """Scrape the reviews of one shop page and save them to a text file.

    Pauses five seconds, downloads ``http://www.dianping.com`` + ``url``,
    extracts every (star, review) pair from the HTML, prints each pair and
    writes it to ``<name>.txt`` in the root of drive ``D:`` (hard-coded).

    Args:
        name: Shop title; used as the output file name.
        star: Overall star rating of the shop, written as the file header.
        url: Site-relative path of the shop page.

    Returns:
        None. When the page has no reviews a notice is printed and no file
        is created.
    """
    # Local import so this function does not depend on the file's import
    # header being edited.
    import io

    # Pause before every request (presumably to avoid hammering the site).
    time.sleep(5)
    page = requests.get('http://www.dianping.com' + url).text

    # Full-length review bodies; used to expand truncated previews below.
    full_review_re = re.compile(r'<p class="desc J-desc">(.*?)</p>', re.DOTALL)
    full_reviews = full_review_re.findall(page)

    # Each match is a (star, review preview) 2-tuple.
    review_re = re.compile(
        r'sml-rank-stars sml-str(.*?)".*?<p class="desc">(.*?)</p>',
        re.DOTALL)
    reviews = review_re.findall(page)

    # No match at all, or a first "review" that is really the review-count
    # label, is treated as "this shop has no reviews".
    if not reviews or u'人点评' in reviews[0][1]:
        print(u'没有点评\n')
        return

    # io.open with an explicit encoding behaves the same on Python 2 and 3,
    # and the with-block closes the file even if an exception is raised.
    with io.open('D:\\%s.txt' % name, 'w', encoding='utf-8') as out:
        out.write(u'place star %s\n' % star)
        for rating, review in reviews:
            # An embedded <span> or a "仅售" price tag marks an advertisement;
            # everything from the first advertisement on is skipped.
            if '<span' in review or u'仅售' in review:
                print('')
                break

            # A preview ending in six dots is truncated: swap in the first
            # full-length review that contains it. If none matches, the
            # preview is kept without the trailing dots.
            if review.endswith(u'......'):
                review = review[:-6]
                for full in full_reviews:
                    if review in full:
                        review = full
                        break

            # Strip the HTML remnants left inside the review text.
            for remnant in ('<br/>', '<br>', '&nbsp;'):
                review = review.replace(remnant, '')

            print(u'%s %s' % (rating, review))
            out.write(u'%s\n' % rating)
            out.write(u'%s\n' % review)

# Crawl result pages 1-5 of the keyword search and scrape every shop listed.
# The percent-encoded keyword in the URL decodes to "中山大学"
# (Sun Yat-sen University).
# Each match is a (shop title, site-relative shop URL, star rating) tuple.
# The pattern is compiled once because it does not change between pages.
href_re = re.compile(
    r'data-hippo-type="shop" title="(.*?)" target="_blank" href="(.*?)"'
    r'.*?sml-rank-stars sml-str(.*?)"',
    re.DOTALL)

for page in range(1, 6):
    res = requests.get('http://www.dianping.com/search/keyword/206/0_%E4%B8%AD%E5%B1%B1%E5%A4%A7%E5%AD%A6/p' + str(page))
    text = res.text
    for name, url, star in href_re.findall(text):
        # Single-argument print() behaves the same on Python 2 and 3.
        print(u'%s %s %s' % (name, star, url))
        placeSplider(name, star, url)
    # Pause between result pages (presumably to avoid hammering the site).
    time.sleep(5)
原文地址:https://www.cnblogs.com/instant7/p/4160448.html