# Use the user agent and the URL to decide whether a page may be crawled.
from urllib import robotparser
# Build a parser for the site's robots.txt and download its rules.
# read() fetches the file over the network, so this step needs connectivity.
rb = robotparser.RobotFileParser()
rb.set_url("https://www.jd.com/robots.txt")
rb.read()

url = "https://www.jd.com"
user_agent = "HuihuiSpider"

# Print each verdict so that running this file as a script shows the result;
# a bare expression is only echoed inside an interactive session.
# The trailing comments record what the original session displayed; the
# live robots.txt may have changed since then.
print(rb.can_fetch(user_agent, url))  # originally observed: False
print(rb.can_fetch("sougou", url))  # originally observed: True