lxml Scraping Experiments

1. Douban

Scraping data from a single page

import requests
from lxml import etree
#import os


url = "https://movie.douban.com/cinema/nowplaying/yongzhou/"
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
req = requests.get(url=url,headers=headers)
text = req.text
dics = []
#Extract the fields we need from the fetched page using XPath rules
html = etree.HTML(text)
ul = html.xpath("//ul[@class='lists']")[0]
#print(etree.tostring(ul,encoding='utf-8').decode('utf-8'))
lis = ul.xpath("./li")
for li in lis:
    title = li.xpath("@data-title")[0]
    score = li.xpath("@data-score")[0]           # rating, from the data-score attribute
    address = li.xpath("@data-region")[0]        # region / country
    img_hai = li.xpath(".//img/@src")[0]         # poster image URL
    dic = {
        'title':title,
        'score':score,
        'address':address,
        'img':img_hai
    }
    dics.append(dic)
print(dics)
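
The example above only prints the list. A minimal sketch of also saving it to disk, in the spirit of the write_to_file helpers used in the later examples (the douban_nowplaying.json filename is just a placeholder I picked):

import json

with open('douban_nowplaying.json','w',encoding='utf-8') as f:
    # ensure_ascii=False keeps the Chinese titles readable in the output file
    json.dump(dics, f, ensure_ascii=False, indent=2)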

2. Dianying Tiantang (Movie Heaven)

Scraping data from multiple pages

import requests
import json
from lxml import etree
url = "http://www.dytt8.net"
HEADERS = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Referer':'http://www.dytt8.net/html/gndy/dyzz/list_23_1.html'
}

def get_url(urls):
    response = requests.get(urls,headers=HEADERS)
    text = response.text            # fetch the list page
    html = etree.HTML(text)         # parse into an Element tree so XPath can be run on it
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")       # relative links to each movie's detail page
    detail_urls = map(lambda u:url+u,detail_urls)         # prepend the site root to every relative link
    return detail_urls

def parse_detail_url(de_ur):
    movie = {}
    response = requests.get(de_ur,headers=HEADERS)
    text = response.content.decode('gbk')
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]       # page title
    movie['title'] = title
    zoomE = html.xpath("//div[@id='Zoom']")[0]
    img_hb = zoomE.xpath(".//img/@src")
    cover = img_hb[0]           # poster
    #sst = img_hb[1]            # movie screenshot
    movie['cover'] = cover
    #movie['sst'] = sst

    def parse_info(info,rule):
        return info.replace(rule,"").strip()     # strip the field label and surrounding whitespace
    infos = zoomE.xpath(".//text()")
    for index,info in enumerate(infos):         # enumerate yields (index, text-node) pairs
        if info.startswith("◎片  名"):       # every field line begins with a "◎" label
            info = parse_info(info,"◎片  名")         # strip the label so only the value remains
            movie['pian'] = info                      # original film title
        elif info.startswith("◎年  代"):
            info = parse_info(info, "◎年  代")
            movie['year'] = info
        elif info.startswith("◎产  地"):
            info = parse_info(info, "◎产  地")
            movie['adress'] = info
        elif info.startswith("◎导  演"):
            info = parse_info(info, "◎导  演")
            movie['actor'] = info
        elif info.startswith("◎类  别"):
            info = parse_info(info, "◎类  别")
            movie['lb'] = info
        elif info.startswith("◎豆瓣评分"):
            info = parse_info(info, "◎豆瓣评分")
            movie['db'] = info
        elif info.startswith("◎主  演"):
            info = parse_info(info, "◎主  演")
            actors = []
            for x in range(index+1,len(infos)):
                actor = infos[x]
                if actor.startswith(""):               #过滤简介部分
                    break
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith("◎简  介"):
            info = parse_info(info,"◎简  介")
            for x in range(index+1,len(infos)):
                profile = infos[x].strip()
                if profile.startswith(""):             #过滤下载地址部分
                    break
                movie['profile'] = profile
    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")[0]                # download link
    movie['download_url'] = download_url
    return movie

def write_to_file(content):
    with open('result.txt','a',encoding='utf-8') as f:
        f.write(json.dumps(content,ensure_ascii=False)+'\n')        # ensure_ascii=False keeps the Chinese text readable

def dianying():
    urld = "http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"               # the page number is filled in with .format()
    movies = []
    for x in range(1,8):
        # the outer loop walks the 7 list pages
        print(x)
        urls = urld.format(x)
        if x==5:                   # page 5 raised an error, probably an encoding problem I could not resolve, so it is skipped
            continue
        detail_ur = get_url(urls)          # collect the detail-page links on this list page
        write_to_file("第%s页" % x)
        for detail_url in detail_ur:
            # the inner loop parses every detail page on the current list page
            movie = parse_detail_url(detail_url)
            movies.append(movie)
            write_to_file(movie)

if __name__ == '__main__':
    dianying()
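
Page 5 is skipped above because it triggered an error that looked like an encoding problem. If the cause is a UnicodeDecodeError from the decode('gbk') call in parse_detail_url (a stray byte that is not valid GBK), a tolerant decode would let the page be processed instead of dropped. A minimal sketch of the idea, under that assumption:

import requests

def fetch_gbk(urls, headers):
    # errors='ignore' silently drops bytes that are not valid GBK instead of raising UnicodeDecodeError
    response = requests.get(urls, headers=headers)
    return response.content.decode('gbk', errors='ignore')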

3. Tencent Recruitment

The code is much the same as the Dianying Tiantang example above.

import requests
import json
from lxml import etree
url = "https://hr.tencent.com/"
HEADERS = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
def get_url(urld):
    response = requests.get(urld,headers=HEADERS)
    text = response.text
    html = etree.HTML(text)
    detail_url = html.xpath("//tr[@class='even' or @class='odd']//a/@href")
    detail_url = map(lambda x:url+x,detail_url)

    return detail_url

def parse_url(detail_url):
    dic = {}
    #print(detail_url)
    response = requests.get(detail_url,headers=HEADERS)
    text =response.text
    html = etree.HTML(text)
    title = html.xpath("//tr[@class='h']//td[@class='l2 bold size16']//text()")[0]
    dic['title'] = title

    # Approach 1: index straight into the flattened text nodes (rigid)
    address = html.xpath("//tr[@class='c bottomline']//td//text()")[1]
    dic['address'] = address
    # Approach 2: grab the <td> elements first, then read the text of each cell (cleaner)
    tds = html.xpath("//tr[@class='c bottomline']//td")
    leibie = tds[1].xpath(".//text()")[1]       # job category
    dic['leibie'] = leibie
    nums = tds[2].xpath(".//text()")[1]         # number of openings
    dic['nums'] = nums
    gz = html.xpath("//ul[@class='squareli']")
    gzzz = gz[0].xpath(".//text()")             # job duties
    gzyq = gz[1].xpath(".//text()")             # job requirements
    dic['工作职责'] = gzzz
    dic['工作要求'] = gzyq
    #print(dic)
    return dic

def write_to_file(content):
    with open('tengxun.txt','a',encoding='utf-8') as f:
        f.write(json.dumps(content,ensure_ascii=False)+'\n')        # ensure_ascii=False keeps the Chinese text readable
def tengxun():
    movies = []
    urls = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=87&start={}#a"
    for x in range(0,501,10):           # the start offset advances by 10 per list page
        print(x)
        urld = urls.format(x)
        detail_urls = get_url(urld)
        write_to_file(x)                # page marker
        for detail_url in detail_urls:
            movie = parse_url(detail_url)
            movies.append(movie)
            write_to_file(movie)        # write each posting as soon as it is parsed

if __name__ == '__main__':
    tengxun()
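
Both write_to_file helpers append one JSON value per line, so the result files can be read back line by line. A minimal sketch of doing that (read_results is a name I made up; tengxun.txt is the file written above):

import json

def read_results(path):
    # every non-empty line in the file is a standalone JSON value
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

for item in read_results('tengxun.txt'):
    print(item)
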
Original article: https://www.cnblogs.com/c-pyday/p/9760862.html