requests爬取数据与aiohttp爬取数据对比

# 同步

from datetime import datetime

import requests
from lxml import etree

# Request headers used by the HTTP calls in this script: a desktop Chrome
# User-Agent string (presumably so the site answers as it would to a browser).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/72.0.3626.121 Safari/537.36"
    ),
}


def get_movie_url(timeout=10):
    """Download the Douban movie chart page and return the links found in it.

    Args:
        timeout: Seconds handed to ``requests.get`` as its ``timeout``.
            Without a timeout a stalled connection would block the script
            indefinitely.

    Returns:
        The list produced by the XPath query: the ``href`` attribute of the
        ``<a>`` elements inside the chart table under ``#content``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    req_url = "https://movie.douban.com/chart"
    response = requests.get(url=req_url, headers=headers, timeout=timeout)
    # NOTE: the HTTP status is not checked; an error page simply yields
    # an empty list because the XPath matches nothing.
    html = etree.HTML(response.text)
    return html.xpath(
        "//*[@id='content']/div/div[1]/div/div/table/tr/td/a/@href")


def get_movie_content(movie_url, timeout=10):
    """Download one movie page and extract its title and "author" text.

    Args:
        movie_url: URL of the movie detail page to fetch.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``.
            Without a timeout a stalled connection would block the script
            indefinitely.

    Returns:
        A dict with two keys, each holding the list of text nodes matched
        by its XPath query:
        ``"name"``   - text under the first ``<span>`` of the ``<h1>`` in
        ``#content``;
        ``"author"`` - text under the second inner ``<span>`` of the first
        ``<span>`` in ``#info`` (the sample output recorded in this file
        shows director names there, with ``' / '`` separators).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    response = requests.get(movie_url, headers=headers, timeout=timeout)
    page = etree.HTML(response.text)
    return {
        "name": page.xpath('//*[@id="content"]/h1/span[1]//text()'),
        "author": page.xpath('//*[@id="info"]/span[1]/span[2]//text()'),
    }


if __name__ == '__main__':
    # Sequential crawl: fetch the chart, then fetch every movie page one
    # after another, and report how long the whole run took.
    began_at = datetime.now()
    # Maps each movie URL to the dict extracted from its page.
    movies = {link: get_movie_content(link) for link in get_movie_url()}
    print(movies)
    elapsed = datetime.now() - began_at
    print("同步用时为:{}".format(elapsed))

# 看一下同步的结果(注意:上面的代码打印的是以 url 为键的字典,下面的示例只列出了其中的值):
#
# E:\venv\spider\Scripts\python.exe E:/python_project/filetest/douban.py
# [{'name': ['小丑 Joker'], 'author': ['托德·菲利普斯']},
# {'name': ['好莱坞往事 Once Upon a Time... in Hollywood'], 'author': ['昆汀·塔伦蒂诺']},
# {'name': ['爱尔兰人 The Irishman'], 'author': ['马丁·斯科塞斯']},
# {'name': ['准备好了没 Ready or Not'], 'author': ['马特·贝蒂内利-奥尔平', ' / ', '泰勒·吉勒特']},
# {'name': ['82年生的金智英 82년생 김지영'], 'author': ['金度英']},
# {'name': ['克劳斯:圣诞节的秘密 Klaus'], 'author': ['塞尔希奥·巴勃罗斯', ' / ', '卡洛斯·马丁内斯·洛佩斯']},
# {'name': ['寄生虫 기생충'], 'author': ['奉俊昊']},
# {'name': ['骡子 The Mule'], 'author': ['克林特·伊斯特伍德']},
# {'name': ['别告诉她 The Farewell'], 'author': ['王子逸']},
# {'name': ['犯罪现场 犯罪現場'], 'author': ['冯志强']}]
# 同步用时为:0:00:08.765342
# Process finished with exit code 0

# 异步
# 异步也很简单,关于异步的文章我还在整理,因为涉及到太多的东西了。先看这个爬虫代码:

import asyncio
from datetime import datetime

import aiohttp
from lxml import etree

# Request headers used by the HTTP calls in this script: a desktop Chrome
# User-Agent, assembled from its space-separated product tokens.
headers = {
    "User-Agent": " ".join((
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/72.0.3626.121",
        "Safari/537.36",
    )),
}


async def get_movie_url():
    """Asynchronously download the Douban movie chart and return its links.

    Opens its own ``aiohttp.ClientSession`` for the single request.

    Returns:
        The list produced by the XPath query: the ``href`` attribute of the
        ``<a>`` elements inside the chart table under ``#content``.
    """
    chart_url = "https://movie.douban.com/chart"
    link_query = "//*[@id='content']/div/div[1]/div/div/table/tr/td/a/@href"
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url=chart_url, headers=headers) as response:
            # The body must be read while the response is still open.
            page = etree.HTML(await response.text())
        return page.xpath(link_query)


async def get_movie_content(movie_url):
    """Asynchronously download one movie page and extract title and "author".

    Opens its own ``aiohttp.ClientSession`` for the single request.

    Args:
        movie_url: URL of the movie detail page to fetch.

    Returns:
        A dict with keys ``"name"`` and ``"author"``, each holding the list
        of text nodes matched by the corresponding XPath query.
    """
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url=movie_url, headers=headers) as response:
            # The body must be read while the response is still open.
            page = etree.HTML(await response.text())
        details = {
            "name": page.xpath('//*[@id="content"]/h1/span[1]//text()'),
            "author": page.xpath('//*[@id="info"]/span[1]/span[2]//text()'),
        }
    return details


if __name__ == '__main__':
    async def _crawl():
        """Fetch the chart, then fetch every movie page concurrently."""
        movie_url_list = await get_movie_url()
        # gather() runs all page fetches concurrently and returns their
        # results in the same order as the URLs.
        return await asyncio.gather(
            *(get_movie_content(url) for url in movie_url_list))

    start = datetime.now()
    # asyncio.run() creates a fresh event loop, runs the coroutine and closes
    # the loop afterwards. It replaces asyncio.get_event_loop(), whose use
    # without a running loop is deprecated in current Python versions.
    movies = asyncio.run(_crawl())
    print(movies)
    print("异步用时为:{}".format(datetime.now() - start))

# 看一下结果,你就知道差距了:
#
# E:\venv\spider\Scripts\python.exe E:/python_project/filetest/aio_douban.py
# [{'name': ['小丑 Joker'], 'author': ['托德·菲利普斯']},
# {'name': ['好莱坞往事 Once Upon a Time... in Hollywood'], 'author': ['昆汀·塔伦蒂诺']},
# {'name': ['爱尔兰人 The Irishman'], 'author': ['马丁·斯科塞斯']},
# {'name': ['准备好了没 Ready or Not'], 'author': ['马特·贝蒂内利-奥尔平', ' / ', '泰勒·吉勒特']},
# {'name': ['82年生的金智英 82년생 김지영'], 'author': ['金度英']},
# {'name': ['克劳斯:圣诞节的秘密 Klaus'], 'author': ['塞尔希奥·巴勃罗斯', ' / ', '卡洛斯·马丁内斯·洛佩斯']},
# {'name': ['寄生虫 기생충'], 'author': ['奉俊昊']},
# {'name': ['骡子 The Mule'], 'author': ['克林特·伊斯特伍德']},
# {'name': ['别告诉她 The Farewell'], 'author': ['王子逸']},
# {'name': ['犯罪现场 犯罪現場'], 'author': ['冯志强']}]
# 异步用时为:0:00:02.230956
抟扶摇而上者九万里
原文地址:https://www.cnblogs.com/fengting0913/p/15392590.html