MusiCode: batch-download all albums of a given artist (CAPTCHA restriction now handled)

I had long wanted to organize and download all the albums of my favourite artists, but there are simply too many of them. Since I had recently started learning Python, I figured: why not write a script to automate the downloading? So I spent some time putting this together, and I'm sharing it with anyone who needs it. :)
When I started, I did not expect that crawling too frequently or for too long would trigger a CAPTCHA. I tried several ways to deal with the CAPTCHA without finding a good solution, so I added a separate "generate download list" step. At first that step stored the final download URLs, but it turned out those URLs expire, so in the end the script stores each track's page URL instead; when the download command runs, it visits the download page again to fetch a fresh download URL.
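For reference, each line that the list-generation step writes into an album's .txt file is tab-separated: the track's page URL on Baidu Music (the /download suffix is added later), its title, and the folder the file should be saved to. A hypothetical entry (the track id, title and paths are made up) looks like this:

    # hypothetical entry; the three fields are separated by real tab characters
    http://music.baidu.com/song/123456<TAB>Space Oddity<TAB>G:\crawl\david bowie\Space Oddity\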
The script uses two open-source modules: gevent and BeautifulSoup.

updated-------------------------------------------------------------------------------------------
The CAPTCHA restriction has been dealt with: if a CAPTCHA page comes back, the script extracts the required cookie from that page and re-issues the request.
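Concretely, the script pulls the vcode, id and di hidden fields out of the CAPTCHA page, joins them with ':' and sends the result back as a cookie on the retried request (this is what the request and getBaiduVerify functions below do):

    Cookie: BAIDUVERIFY=<vcode>:<id>:<di>;

The full script: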

    #coding=utf-8
    
    import urllib,urllib2,re,os,json,gevent,traceback
    from BeautifulSoup import BeautifulSoup
    from gevent import monkey
    
    monkey.patch_all()
    
    rootUrl='http://music.baidu.com'
    artistId=2825 # want to batch-download and organize all albums of your favourite artist? Replace this with the artist's id on Baidu Music, e.g. http://music.baidu.com/artist/2825
    pagesize=10
    savePath='G:\\crawl\\david bowie\\' # change this to the folder you want to save into
    listDir='_____downlist\\'
    handleCount=0
    BAIDUVERIFY=''
    
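    # Step 1 ('list' command): fetch the artist page, work out how many album pages there are,
    # and crawl each page concurrently with gevent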
    def crawlList():
        artistUrl=rootUrl+'/artist/'+str(artistId)
        homeHtml=request(artistUrl)
        soup=BeautifulSoup(homeHtml)
        try:
            pagecount=len(soup.findAll("div",{"class":"page-inner"})[1].findAll(text=re.compile(r'\d+')))
        except:
            traceback.print_exc()
            print homeHtml
            return
        jobs=[]
        listPath=savePath+listDir
        if not os.path.exists(listPath):
            os.mkdir(listPath)
        for i in range(pagecount):
            jobs.append(gevent.spawn(crawlPage,i))
        gevent.joinall(jobs)
            
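    # HTTP GET helper: if the response turns out to be a CAPTCHA page, extract the BAIDUVERIFY
    # cookie from it and retry the request with that cookie attached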
    def request(url):
        global BAIDUVERIFY
        req=urllib2.Request(url)
        if BAIDUVERIFY!='':
            req.add_header('Cookie','BAIDUVERIFY='+BAIDUVERIFY+';')
        resp=urllib2.urlopen(req)
        html= resp.read()
        verify=getBaiduVerify(html)
        if verify!='':
            print 'CAPTCHA cookie extracted, retrying the request'
            BAIDUVERIFY=verify
            return request(url)
        return html
        
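    # Pull the vcode/id/di hidden fields out of a CAPTCHA page; joined with ':' they form the
    # BAIDUVERIFY cookie value (an empty string means no CAPTCHA was shown)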
    def getBaiduVerify(html):
        vcode=re.search(r'name="vcode" value="(.*?)"' , html, re.I)
        id=re.search(r'name="id" value="(.*?)"' , html, re.I)
        di=re.search(r'name="di" value="(.*?)"' , html, re.I)
        if vcode and id and di:
            return vcode.group(1)+':'+id.group(1)+':'+di.group(1)
        return ''
    
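    # Fetch one page of the artist's album list (a JSON response whose "html" field holds the
    # markup) and crawl every album cover link on it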
    def crawlPage(page):
        start=page*pagesize
        albumListUrl='http://music.baidu.com/data/user/getalbums?start=%d&ting_uid=%d&order=time' % (start,artistId)
        print albumListUrl
        albumListHtml=json.loads(request(albumListUrl))["data"]["html"]
        albumListSoup=BeautifulSoup(albumListHtml)
        covers=albumListSoup.findAll('a',{'class':'cover'})
        pagePath=savePath+listDir+str(page)+'\\'
        if not os.path.exists(pagePath):
            os.mkdir(pagePath)
        for cover in covers:
            try:
                crawlAlbum(pagePath,rootUrl+cover['href'],cover['title'])
            except:
                traceback.print_exc()
    
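    # Parse an album page and write one line per track (track page URL, title, target folder,
    # tab-separated) into the album's list file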
    def crawlAlbum(pagePath,albumUrl,title):
        print albumUrl,title
        albumHtml=request(albumUrl)
        albumSoup=BeautifulSoup(albumHtml)
        musicWraps=albumSoup.findAll('span',{'class':'song-title '})
        title=re.subn(r'\\|/|:|\*|\?|"|<|>|\|','',title)[0] # strip characters that are not allowed in Windows file names
        path=savePath+title+'\\'
        albumListPath=pagePath+title+'.txt'
        albumFile=open(albumListPath,'w')
        for wrap in musicWraps:
            link=wrap.find('a')
            try:
                musicPage=rootUrl+link['href']
                albumFile.write('%s\t%s\t%s\n' % (musicPage,link['title'],path)) # the real download URL expires, so store the track page URL instead
            except:
                traceback.print_exc()
        albumFile.close()
    
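    # Resolve the real (short-lived) download URL from a track's download page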
    def crawlDownloadUrl(musicPage):
        downPage=musicPage+'/download'
        downHtml=request(downPage)
        downUrl=re.search('http://[^ ]*xcode.[a-z0-9]*' , downHtml, re.M).group()
        return downUrl
    
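    # Step 2 ('down' command): read the saved download lists and process each page directory concurrently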
    def downList():
        listPath=savePath+listDir
        jobs=[]
        for pageDir in os.listdir(listPath):
            jobs.append(gevent.spawn(downPage,listPath+pageDir))
        gevent.joinall(jobs)
    
    def downPage(pagePath):
        for filename in os.listdir(pagePath):
            filePath=pagePath+'\\'+filename
            albumFile=open(filePath,'r')
            try:
                for args in albumFile.readlines():
                    arrArgs=args.split('\t')
                    downMusic(arrArgs[0],arrArgs[1],arrArgs[2].replace('\n',''))
            except:
                traceback.print_exc()
            finally:
                albumFile.close()
    
    
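    # Download a single track: skip files that already exist, and delete the partial file if the download fails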
    def downMusic(musicPage,title,path):
        global handleCount
        if not os.path.exists(path):
            os.mkdir(path)
        handleCount+=1
        print handleCount,musicPage,title,path
        filename=path+re.subn(r'\\|/|:|\*|\?|"|<|>|\|','',title)[0]+'.mp3'
        if os.path.isfile(filename):
            return
        downUrl=crawlDownloadUrl(musicPage)
        try:
            urllib.urlretrieve(downUrl,filename)
        except:
            traceback.print_exc()
            os.remove(filename)
    
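    # Simple interactive entry point: 'list' builds the download lists, 'down' downloads everything in them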
    if __name__=='__main__':
        print 'Commands:\n\tlist\tgenerate the download lists\n\tdown\tstart downloading\n\texit\tquit'
        cmd=raw_input('>>>')
        while cmd!='exit':
            if cmd=='list':
                crawlList()
                print 'Download lists generated'
            elif cmd=='down':
                downList()
                print 'Download finished'
            else:
                print 'unknown cmd'
            cmd=raw_input('>>>')
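
To use it, save the script to a file (the name below is just an example) and run it with Python 2 (it relies on urllib2, raw_input and print statements). Generate the lists first, then start the download; a session looks roughly like this:

    $ python musicode.py
    Commands:
        list    generate the download lists
        down    start downloading
        exit    quit
    >>>list
    ... (album list URLs are printed while the pages are crawled)
    Download lists generated
    >>>down
    ... (a running counter, each track's page URL and its target path are printed)
    Download finished
    >>>exit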
Original post: https://www.cnblogs.com/xuxiaoshuan/p/3628928.html