Python 爬虫示例,方便日后参考

参考网址:https://zhuanlan.zhihu.com/p/32037625

def getOneMoviesInfo(Mid,url):
    """Scrape one movie page and append its details as one line to the output file.

    Downloads ``url``, extracts the fields with XPath queries and appends a
    single ``|``-separated record in this order: Mid, name, brief, year, date,
    director, three writers, five stars, picture.  Any value that cannot be
    found on the page is written as the string ``"NULL"``.

    Args:
        Mid: Movie identifier; written as the first field and echoed to stdout.
        url: Address of the movie page to download.

    Returns:
        None.  When the page has no title node a failure message is printed
        and nothing is written to the output file.
    """
    import requests
    from lxml import etree

    data = requests.get(url).text   # download the page
    s = etree.HTML(data)            # parse it into an element tree

    def text_at(path, index=0):
        # Return the index-th XPath result, or None when the page lacks it.
        found = s.xpath(path)
        return found[index] if len(found) > index else None

    # Common prefix of every query into the page header section.
    header = '//*[@id="main"]/section/div[1]/div/div/section'

    # Without a title there is nothing worth recording for this movie.
    name = text_at(header + '/div[2]/section/div[1]/span/a/h2/text()')
    if name is None:
        print("Mid = %s , failed for a lack of TMDB id "%Mid)
        return

    # Always store the picture as a plain string so the field has the same
    # shape whether or not an image was found.
    picture = text_at(header + '/div[1]/div[1]/img/@src')
    if picture is None:
        picture = 'NULL'

    # The year text is wrapped in parentheses on the page; strip them.
    raw_year = text_at(header + '/div[2]/section/div[1]/span/span/text()')
    year = "NULL" if raw_year is None else raw_year.strip("(").strip().strip(")")

    # The date is taken from the second text node of the first list item.
    raw_date = text_at('//*[@id="media_v4"]/div[2]/div[2]/div/section/div[1]/div/section[1]/ul/li[1]/text()', 1)
    date = "NULL" if raw_date is None else raw_date.strip()

    # Escape line breaks so the whole record stays on one physical line.
    raw_brief = text_at(header + '/div[2]/section/div[2]/div/p/text()')
    brief = "NULL" if raw_brief is None else raw_brief.replace("\n", "\\n")

    # Crew entries: pick out the director and the writers.
    writers = []
    director = "NULL"
    for entry in s.xpath(header + '/div[2]/section/div[2]/ol/li'):
        names = entry.xpath('./p[1]/a/text()')
        professions = entry.xpath('./p[2]/text()')
        if not names or not professions:
            # Skip incomplete entries instead of discarding what was already
            # collected; the padding below supplies the "NULL" placeholders.
            continue
        creatorName = names[0]
        creatorProfession = professions[0]
        if 'Director' in creatorProfession:
            director = creatorName
        elif 'Screenplay' in creatorProfession or 'Writer' in creatorProfession:
            writers.append(creatorName)

    # Cast entries: keep every entry that carries a name link.
    stars = []
    for entry in s.xpath('//*[@id="media_v4"]/div[2]/div[1]/div/div/section[1]/ol/li'):
        star = entry.xpath('./p[1]/a/text()')
        if star:
            stars.append(star[0])

    # Pad to the fixed column counts (3 writers, 5 stars); a non-positive
    # multiplier yields an empty list, so longer lists are left untouched.
    writers += ["NULL"] * (3 - len(writers))
    stars += ["NULL"] * (5 - len(stars))

    fields = [Mid, name, brief, year, date, director,
              writers[0], writers[1], writers[2],
              stars[0], stars[1], stars[2], stars[3], stars[4],
              picture]
    # NOTE(review): path reconstructed on the assumption that the backslashes
    # were lost when the source was extracted — confirm against the real location.
    with open(r'C:\Users\yuqiao\Desktop\testSpider.txt','a',encoding='utf-8') as f:
        f.write("|".join(str(field) for field in fields) + "\n")
    print(Mid)
    print(name)
    
#______________________________________________________main script__________________________________________________________
import time
# Truncate the output file so every run starts with no records in it.
# NOTE(review): path reconstructed on the assumption that the backslashes were
# lost when the source was extracted — confirm against the real location.
with open(r'C:\Users\yuqiao\Desktop\testSpider.txt','w',encoding='utf-8') as f:
    f.write("")
language = '?language=zh-CN'  # query-string suffix; not referenced by the code shown here
# NOTE(review): this input path also looks like it lost its backslashes, but the
# directory boundaries cannot be recovered from the text — fix before running.
with open(r'D:gitiyeMovieMidURL.txt', "rt",encoding='utf-8') as in_file:
    lines = in_file.read().split("\n")

    # Only the line at index 9124 is processed; widen the range for a batch
    # (an earlier run used range(51,61)).
    for i in range(9124,9125):
        line = lines[i]
        print(line)

print('finished')


原文地址:https://www.cnblogs.com/YuQiao0303/p/9277666.html