# 爬取genome的网页和图片 (Crawl the genome.jp web page and its image)

# -*- coding: utf-8 -*-
# @Time    : 2018/03/08 10:32
# @Author  : cxa
# @File    : gethtmlandimg.py
# @Software: PyCharm

import requests
from fake_useragent import UserAgent as UA
from lxml import html
import traceback
import os

# Pathway page to download.
url = "http://www.genome.jp/kegg-bin/show_pathway?1520394169137283/hsa01100.args"
# Output files go in the current working directory and are named after the
# last URL segment, with "args" swapped for the target extension.
html_path = os.path.join(os.getcwd(), url.split("/")[-1].replace("args", "html"))
img_path = os.path.join(os.getcwd(), url.split("/")[-1].replace("args", "png"))
# Browser-like request headers. The User-Agent is taken from fake_useragent
# once, when this module is loaded.
# NOTE: HTTP field names are tokens and must not contain spaces; the key was
# previously spelled 'Accept - Encoding', which is not a valid header name.
headers = {'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
           'Accept-Encoding': 'gzip, deflate',
           'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
           'Connection': 'Keep-Alive',
           'User-Agent': UA().random}
# XPath selecting the src attribute of the <img name="pathwayimage"> element
# nested in div.map > div.image. (The misspelled variable name is kept because
# the rest of the script refers to it.)
img_xapth = "//div[@class='map']/div[@class='image']/img[@name='pathwayimage']/@src"
# Site root that the image's src value is appended to.
main_url = "http://www.genome.jp"


def get_img(buff, path=None):
    """Write raw image bytes to disk.

    Args:
        buff: Bytes of the image to save.
        path: Destination file. Defaults to the module-level ``img_path``
            so existing ``get_img(buff)`` calls keep working.
    """
    # Resolve the default lazily so the global is only needed when no
    # explicit destination is supplied.
    target = img_path if path is None else path
    # Binary mode: the payload is written exactly as given, replacing any
    # existing file at the destination.
    with open(target, "wb") as image_file:
        image_file.write(buff)


# Download the pathway page, save a copy of its HTML that points at a local
# image file, then download that image next to it. Any failure is reported as
# a printed traceback rather than aborting with an unhandled exception.
try:
    # The request is made inside the try so that timeouts and connection
    # errors are reported the same way as every other failure.
    req = requests.get(url, timeout=20, headers=headers)
    if req.status_code == requests.codes.ok:
        get_html = req.text
        root = html.fromstring(get_html)

        # Evaluate the XPath once, and fail with a clear message when the
        # page does not contain the expected pathway image.
        img_srcs = root.xpath(img_xapth)
        if not img_srcs:
            raise ValueError("pathway image not found in {}".format(url))
        img_src = img_srcs[0]
        imgurl = main_url + img_src

        # Point the saved page at the file the image is actually written to.
        local_img_ref = "./{}".format(os.path.basename(img_path))
        # Explicit UTF-8: the locale default encoding (e.g. cp936 on Windows)
        # can raise UnicodeEncodeError on characters it cannot represent.
        with open(html_path, "w", encoding="utf-8") as fs:
            fs.write(get_html.replace(img_src, local_img_ref))

        # Without a timeout this request could block forever.
        img_req = requests.get(imgurl, timeout=20, headers=headers)
        if img_req.status_code == requests.codes.ok:
            buff = img_req.content
            get_img(buff)
        else:
            img_req.raise_for_status()
    else:
        req.raise_for_status()
except Exception:
    # Exception (not a bare except) so Ctrl-C and SystemExit still propagate.
    print(traceback.format_exc())

  

# 原文地址 (original article): https://www.cnblogs.com/c-x-a/p/8526679.html