The Yangguang Gaokao (Sunshine Gaokao) scraping problem

import requests
import time
from lxml import etree

def get_html(url):  # fetch one page
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        res = requests.get(url, headers=headers)
        res.encoding = res.apparent_encoding
        if res.status_code == 200:
            html = res.text
            return html
        else:
            # non-200 response: wait briefly, then retry the same URL
            time.sleep(0.1)
            return get_html(url)
    except Exception as e:  # `except BaseException` also works; e carries the reason for the failure
        print("Request failed:", e)

def parse(html):
    # print(html)
    r = etree.HTML(html)

    # holds the detail-page URLs
    list_detail = []

    # base URL used to join the relative links
    base_url = 'https://gaokao.chsi.com.cn'

    # all the text on one listing page (collected but not used below)
    ppp_ = r.xpath("//div[@class='yxk-table']//text()")

    # grab the relative (partial) URLs of every school
    list_url = r.xpath("//div[@class='yxk-table']//td[@class='js-yxk-yxmc']/a/@href")

    # join them into absolute URLs
    for url in list_url:
        detail_url = base_url + url
        list_detail.append(detail_url)

    # return all detail URLs
    return list_detail
def url_join():
    url_start = 'https://gaokao.chsi.com.cn/sch/search--ss-on,searchType-1,option-qg,start-'
    url_end = '.dhtml'
    url_list = []
    for i in range(1, 139):
        url_num = 20 * i - 20  # the start parameter advances by 20 per listing page
        url = url_start + str(url_num) + url_end
        url_list.append(url)

    return url_list
if __name__ == '__main__':
    # url_list = url_join()
    # print(url_list)
    #
    # for url in url_list:
    #     # fetch each listing page
    #     html = get_html(url)
    #     parse(html)

    url = 'https://gaokao.chsi.com.cn/sch/search--ss-on,searchType-1,option-qg,start-0.dhtml'
    html = get_html(url)
    url_list = parse(html)
    print(url_list)
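
The `__main__` block above only fetches the first listing page; the loop over all pages is left commented out. Below is a minimal sketch of how the pieces could be combined, reusing `url_join`, `get_html`, and `parse` from the script above. The `crawl_all` name, the 1-second delay, and the single flat result list are my own assumptions for illustration, not part of the original post.

def crawl_all():
    # iterate over every listing page, fetch it, and collect the detail URLs
    all_detail_urls = []
    for listing_url in url_join():
        html = get_html(listing_url)
        if html:  # get_html returns None when the request fails
            all_detail_urls.extend(parse(html))
        time.sleep(1)  # assumed delay, to avoid hammering the server
    return all_detail_urls

detail_urls = crawl_all()
print(len(detail_urls), "detail URLs collected")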
Original post: https://www.cnblogs.com/yuanjia8888/p/11113859.html