爬虫练习:使用 XPath 下载站长之家简历封面模板

# -*- coding: utf-8 -*-
# @Time : 2020/9/21 11:13
# @Author : aqiong
# @Site : 
# @File : 站长之家简历爬取.py
# @Software: PyCharm
import requests
from lxml import etree
import random
import os

##
#获得用户代理
#
def getheaders():
    """Return one randomly chosen browser User-Agent string.

    Despite the name, the result is a single string (the value for a
    ``user-agent`` header), not a complete headers dictionary.

    The pool holds ten entries; the Chrome OS entry appears twice, so it is
    picked twice as often as any other.
    """
    ua_pool = (
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
    )
    chosen = random.choice(ua_pool)
    return chosen

if __name__ == '__main__':
    # Script entry point: fetch the resume-cover listing page, visit every
    # template detail page linked from it, and save the first download link
    # of each detail page into ./jl/.

    # Local import keeps this script block self-contained. urljoin resolves
    # relative or protocol-relative hrefs against the page they came from and
    # leaves absolute URLs untouched.
    from urllib.parse import urljoin

    # Seconds to wait on any single request, so a stalled server cannot hang
    # the whole run (requests waits indefinitely when no timeout is given).
    REQUEST_TIMEOUT = 10

    # exist_ok avoids the separate exists() check and its race window.
    os.makedirs('./jl', exist_ok=True)

    url = 'http://sc.chinaz.com/jianli/fengmian.html'
    headers = {
        'user-agent': getheaders()
    }

    list_response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on a 4xx/5xx instead of silently parsing an error page.
    list_response.raise_for_status()
    # Force UTF-8 decoding; otherwise the scraped text comes back garbled.
    list_response.encoding = 'utf-8'
    page_html = etree.HTML(list_response.text)

    # One href per template card on the listing page.
    a_href_list = page_html.xpath('//div[@class="main_list jl_main"]/div[@class="box col3 ws_block"]/a/@href')

    for a_url in a_href_list:
        a_url = urljoin(url, a_url)

        try:
            # allow_redirects=False works around
            # "requests.exceptions.TooManyRedirects: Exceeded 30 redirects."
            detail_response = requests.get(url=a_url, headers=headers,
                                           allow_redirects=False,
                                           timeout=REQUEST_TIMEOUT)
            detail_response.raise_for_status()
        except requests.RequestException as err:
            print('skip ' + a_url + ': ' + str(err))
            continue

        # Same site as the listing page, so apply the same UTF-8 override.
        detail_response.encoding = 'utf-8'
        jl_page_text = detail_response.text

        # Redirects are not followed, so the body may be empty or lack the
        # download list; guard both instead of crashing on rar_list[0].
        jl_html = etree.HTML(jl_page_text) if jl_page_text.strip() else None
        if jl_html is None:
            rar_list = []
        else:
            # First entry of the download list on the detail page.
            rar_list = jl_html.xpath('//div[@class="clearfix mt20 downlist"]//ul[@class="clearfix"]/li[1]/a/@href')
        if not rar_list:
            print('skip ' + a_url + ': no download link found')
            continue

        rar_url = urljoin(a_url, rar_list[0])
        try:
            rar_response = requests.get(url=rar_url, headers=headers, timeout=REQUEST_TIMEOUT)
            # Do not save an HTTP error page under an archive file name.
            rar_response.raise_for_status()
        except requests.RequestException as err:
            print('skip ' + rar_url + ': ' + str(err))
            continue

        # Name the local file after the last path segment of the download URL.
        fileName = './jl/' + rar_url.split('/')[-1]

        with open(fileName, 'wb') as fp:
            fp.write(rar_response.content)
        # Report success only after the file has been written and closed.
        print(fileName + '保存成功')

原文地址:https://www.cnblogs.com/aqiong/p/13715334.html