爬虫实例:饿了么爬虫

饿了么外卖网站是一个ajax动态加载的网站

Version1:直接页面提取

from lxml import etree
import requests
import sys
import time

# Python 2 only: reload(sys) re-exposes sys.setdefaultencoding so that
# implicit str/unicode conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')


url = 'https://www.ele.me/place/ws101hcw982?latitude=22.52721&longitude=113.95232'
response = requests.get(url)
print(response.status_code)

# NOTE: requests does not execute JavaScript, so this pause cannot make any
# AJAX-loaded content appear in the response that was already downloaded.
time.sleep(10)
html = response.content
selector = etree.HTML(html)
rez = selector.xpath('//*[@class="place-rstbox clearfix"]')
print('haha %s' % (rez,))  # prints "haha []" when no container matched

# Start from empty lists so the code after the loop still runs when no
# container matched (previously `Name` was undefined in that case).
Name = []
msales = []
tip = []
stime = []
for i in rez:
    # The leading "." keeps each query relative to the current container;
    # a path starting with "//" searches the whole document instead.
    titles = i.xpath('.//*[@class="rstblock-title"]/text()')
    print(titles)
    Name.extend(titles)
    msales.extend(i.xpath('.//*[@class="rstblock-monthsales"]/text()'))
    tip.extend(i.xpath('.//*[@class="rstblock-cost"]/text()'))
    stime.extend(i.xpath('.//*[@class="rstblock-logo"]/span/text()'))

print(u'店名')
# Show only the first shop name, if any was found.
if Name:
    print(Name[0])

问题:XPath表达式//*[@class="place-rstbox clearfix"]本身没有问题,但是rez输出为空。原因是商家列表由ajax动态加载,requests拿到的初始HTML中并不包含这些节点。

Version2:通过接口提取

geohash=ws101hcw982&latitude=22.52721&longitude=113.95232:位置信息参数及参数值

terminal=web:渠道信息

extras[]=activities和offset=0未知

import requests
import json

# Call the restaurant-list endpoint directly; all parameters are carried in
# the query string of the URL.
url = 'https://www.ele.me/restapi/shopping/restaurants?extras[]=activities&geohash=ws101hcw982&latitude=22.52721&limit=30&longitude=113.95232&offset=0&terminal=web'
resp = requests.get(url)
print(resp.status_code)

Jdata = json.loads(resp.text)

# Keys read from every record, in the order they are unpacked below.
fields = ('name', 'recent_order_num', 'order_lead_time', 'description', 'phone')

# The loop treats the decoded payload as a sequence of dict-like records.
for shop in Jdata:
    name, msales, stime, tip, phone = [shop[key] for key in fields]
    print(name)

输出:原以为把limit改为100就可以一次提取100条商家信息,然而接口每次最多只返回30条。

Version3:通过selenium提取

from selenium import webdriver
import selenium.webdriver.support.ui as ui
import time

# Raw string with explicit backslashes: without them "C:Python27phantomjs.exe"
# is a drive-relative file name, not the path C:\Python27\phantomjs.exe.
driver = webdriver.PhantomJS(executable_path=r"C:\Python27\phantomjs.exe")
#driver = webdriver.Chrome()
try:
    driver.get('https://www.ele.me/place/ws101hcw982?latitude=22.52721&longitude=113.95232')
    time.sleep(10)
    # Raw string so the backslash is never interpreted as an escape sequence.
    driver.get_screenshot_as_file(r"E:\Elm_ok.jpg")

    # Wait up to 10 seconds for the restaurant container to be present.
    wait = ui.WebDriverWait(driver,10)
    wait.until(lambda driver: driver.find_element_by_xpath('//div[@class="place-rstbox clearfix"]'))

    # find_element_by_xpath returns a single element, so each value below
    # is the text of one element only.
    name = driver.find_element_by_xpath('//*[@class="rstblock-title"]').text
    msales = driver.find_element_by_xpath('//*[@class="rstblock-monthsales"]').text
    tip = driver.find_element_by_xpath('//*[@class="rstblock-cost"]').text
    stime = driver.find_element_by_xpath('//*[@class="rstblock-logo"]/span').text
finally:
    # Always shut the browser down so no PhantomJS process is left behind,
    # even when loading or extraction fails.
    driver.quit()

print(name)  # sample output from the original run: 乐凯撒比萨(生态园店)

注:find_element_by_xpath只返回第一个匹配的元素,因此这里只提取到一家商家;要提取全部匹配元素需使用find_elements_by_xpath

改进版

#coding=utf-8
from selenium import webdriver
import selenium.webdriver.support.ui as ui
import time


# Raw string with explicit backslashes: without them "C:Python27phantomjs.exe"
# is a drive-relative file name, not the path C:\Python27\phantomjs.exe.
driver = webdriver.PhantomJS(executable_path=r"C:\Python27\phantomjs.exe")
#driver = webdriver.Chrome()
driver.get('https://www.ele.me/place/ws101hcw982?latitude=22.52721&longitude=113.95232')
time.sleep(10)
#driver.get_screenshot_as_file(r"E:\Elm_ok.jpg")

# Wait up to 10 seconds for the restaurant container to be present before
# any scrolling or extraction is attempted.
wait = ui.WebDriverWait(driver,10)
wait.until(lambda driver: driver.find_element_by_xpath('//div[@class="place-rstbox clearfix"]'))
#driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  #滚动至底部页面
def execute_times(times):
    """Scroll the page to the bottom repeatedly, pausing after each scroll.

    The scroll is performed ``times + 1`` times through the module-level
    ``driver``, with a 5 second sleep after every scroll.
    """
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    for _ in range(times + 1):
        driver.execute_script(scroll_to_bottom)
        time.sleep(5)
try:
    # Scroll down repeatedly before the elements are collected.
    execute_times(20)

    # find_elements_by_xpath returns a list of every matching element.
    name = driver.find_elements_by_xpath('//*[@class="rstblock-title"]')
    msales = driver.find_elements_by_xpath('//*[@class="rstblock-monthsales"]')
    tip = driver.find_elements_by_xpath('//*[@class="rstblock-cost"]')
    stime = driver.find_elements_by_xpath('//*[@class="rstblock-logo"]/span')

    # Printing the lists directly only shows WebElement reprs, e.g.
    # [<selenium.webdriver.remote.webelement.WebElement (session="c941cfb0-a428-11e7-affa-f38716880ab3",...]
    print(type(tip))  # <type 'list'>
    print(len(name))  # 120 in the original author's run

    # Read .text while the browser session is still open.
    for i in name:
        print(i.text)
finally:
    # Always shut the browser down so no PhantomJS process is left behind,
    # even when scrolling or extraction fails.
    driver.quit()

说明:通过execute_times函数,滚动条每下移一次,休息5s,从而使页面加载更多的商家信息

输出:

原文地址:https://www.cnblogs.com/Ryana/p/7276997.html