python随笔(一)

python爬虫获取QQ音乐和豆瓣的最新电影音乐名字

先上代码开源大家一起学习,代码如下:

#!python2
#coding:utf-8
__author__ = 'OldHarry'

import urllib2
import os
import re
import json
import xlsxwriter
import sys
# Force the interpreter-wide default string encoding to UTF-8 (Python 2 only),
# so implicit str <-> unicode conversions elsewhere in this script use UTF-8.
defaultencoding = 'utf-8'
if sys.getdefaultencoding() != defaultencoding:
    # sys.setdefaultencoding is removed from the sys namespace by the site
    # module at startup; reload(sys) makes it available again.
    reload(sys)
    sys.setdefaultencoding(defaultencoding)

def getHtml(url):
    send_headers = {
     'User-Agent':'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
     'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     'Connection':'keep-alive'
    }
    urls = urllib2.Request(url,headers=send_headers)
    html = urllib2.urlopen(urls)
    if html.getcode() == 200:
        pass
        #print ("已捕获"),url,"目标站数据..."
    else:
        print ("访问出现错误...错误代码:"),html.getcode()
    return html.read()


def kugoumusic(url):
    xx=getHtml(url)
    rr=re.compile(r'<span class="songName">(.*?) - (.*?)</span>')
    x=rr.findall(xx)
    nk=[]
    for xxx in  x:
        if xxx not in nk:
            nk.append(xxx[1].decode('utf8'))
    print json.dumps(nk, encoding="UTF-8", ensure_ascii=False)
    return nk


def qqmusic(url):
    xx=getHtml(url)
    rr=re.compile(r'{"action":{"alert":[0-9]+,"icons":[0-9]+,"msgdown":[0-9]+,"msgfav":[0-9]+,"msgid":[0-9]+,"msgpay":[0-9]+,"msgshare":[0-9]+,"switch":[0-9]+},"album":{"id":[0-9]+,"mid":"[a-zA-Z0-9]+","name":"(.*?)"')
    x=rr.findall(xx)
    nq=[]
    for xxx in x:
        xxx.strip()
        if xxx not in nq:
            nq.append(xxx)
    print json.dumps(nq, encoding="UTF-8", ensure_ascii=False)
    return nq

def dbmovie(url):
    ssd = getHtml(url)
    tt=re.compile(r'alt="(.*?)" rel="[a-z]+" class="" />')
    shu=tt.findall(ssd)
    print json.dumps(shu, encoding="UTF-8", ensure_ascii=False)
    return shu

def rmmovie(url):
    ssd = getHtml(url)
    tt=re.compile(r'"title":"(.*?)"')
    shu=tt.findall(ssd)
    print json.dumps(shu, encoding="UTF-8", ensure_ascii=False)
    return shu
def rmdsj(pages=3, page_limit=20):
    """Fetch consecutive pages of the Douban TV search endpoint via ``rmmovie``.

    Page ``i`` is requested with ``page_start = i * page_limit``. With the
    defaults this requests offsets 0, 20 and 40 at 20 items per page.

    Args:
        pages: Number of consecutive pages to request.
        page_limit: Value sent as ``page_limit``; also the step between
            successive ``page_start`` offsets.

    Returns:
        One list holding the titles from every page, in request order.
    """
    url_template = ('https://movie.douban.com/j/search_subjects?type=tv'
                    '&tag=%E7%83%AD%E9%97%A8&sort=recommend'
                    '&page_limit={limit}&page_start={start}')
    ssd = []
    for page in range(pages):
        page_url = url_template.format(limit=page_limit,
                                       start=page * page_limit)
        ssd.extend(rmmovie(page_url))
    return ssd

def runtest():
    IP_PATH = os.path.abspath('.') + 'TXT.xls'
    print "酷狗音乐--新歌榜"
    a=kugoumusic("http://www.kugou.com/")
    print "腾讯音乐--内地新歌榜"
    b=qqmusic("https://u.y.qq.com/cgi-bin/musicu.fcg?-=recom2388477980207393&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%7D%2C%22new_song%22%3A%7B%22module%22%3A%22QQMusic.MusichallServer%22%2C%22method%22%3A%22GetNewSong%22%2C%22param%22%3A%7B%22type%22%3A1%7D%7D%7D")
    print "腾讯音乐--港台新歌榜"
    c=qqmusic("https://u.y.qq.com/cgi-bin/musicu.fcg?-=recom6698628102261504&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%7D%2C%22new_song%22%3A%7B%22module%22%3A%22QQMusic.MusichallServer%22%2C%22method%22%3A%22GetNewSong%22%2C%22param%22%3A%7B%22type%22%3A2%7D%7D%7D")
    print "腾讯音乐--欧美新歌榜"
    d=qqmusic("https://u.y.qq.com/cgi-bin/musicu.fcg?-=recom08419989487702839&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%7D%2C%22new_song%22%3A%7B%22module%22%3A%22QQMusic.MusichallServer%22%2C%22method%22%3A%22GetNewSong%22%2C%22param%22%3A%7B%22type%22%3A3%7D%7D%7D")
    print "腾讯音乐--日本新歌榜"
    e=qqmusic("https://u.y.qq.com/cgi-bin/musicu.fcg?-=recom24411354608866187&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%7D%2C%22new_song%22%3A%7B%22module%22%3A%22QQMusic.MusichallServer%22%2C%22method%22%3A%22GetNewSong%22%2C%22param%22%3A%7B%22type%22%3A4%7D%7D%7D")
    print "腾讯音乐--韩国新歌榜"
    f=qqmusic("https://u.y.qq.com/cgi-bin/musicu.fcg?-=recom909302436024819&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%7D%2C%22new_song%22%3A%7B%22module%22%3A%22QQMusic.MusichallServer%22%2C%22method%22%3A%22GetNewSong%22%2C%22param%22%3A%7B%22type%22%3A5%7D%7D%7D")
    print "豆瓣电影--正在热映"
    g=dbmovie("https://movie.douban.com/")
    print "豆瓣电影--热门电影"
    h=rmmovie("https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0")
    print "豆瓣电影--最新电影"
    i=rmmovie("https://movie.douban.com/j/search_subjects?type=movie&tag=%E6%9C%80%E6%96%B0&page_limit=20&page_start=0")
    print "豆瓣电影--经典电影"
    j=rmmovie("https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%BB%8F%E5%85%B8&sort=time&page_limit=20&page_start=0")
    print "豆瓣电影--可播放电影"
    k=rmmovie("https://movie.douban.com/j/search_subjects?type=movie&tag=%E5%8F%AF%E6%92%AD%E6%94%BE&sort=time&page_limit=20&page_start=0")
    print "豆瓣电影--高分电影"
    l=rmmovie("https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=time&page_limit=20&page_start=0")
    print "豆瓣电影--热门电视剧"
    m=rmdsj()
    workbook = xlsxwriter.Workbook(IP_PATH)
    worksheet = workbook.add_worksheet()
    bold = workbook.add_format({'bold': 1, 'align': 'center', 'border': 1})
    bold2 = workbook.add_format({'align': 'center', 'border': 1})
    headings = ['酷狗音乐--新歌榜', '腾讯音乐--内地新歌榜','腾讯音乐--港台新歌榜','腾讯音乐--欧美新歌榜','腾讯音乐--日本新歌榜','腾讯音乐--韩国新歌榜','豆瓣电影--正在热映','豆瓣电影--热门电影','豆瓣电影--最新电影','豆瓣电影--经典电影','豆瓣电影--可播放电影','豆瓣电影--高分电影','豆瓣电影--热门电视剧']
    worksheet.write_row('A1', headings, bold)
    SS=30
    worksheet.set_column('A:A', SS)
    worksheet.set_column('B:B', SS)
    worksheet.set_column('C:C', SS)
    worksheet.set_column('D:D', SS)
    worksheet.set_column('E:E', SS)
    worksheet.set_column('F:F', SS)
    worksheet.set_column('G:G', SS)
    worksheet.set_column('H:H', SS)
    worksheet.set_column('I:I', SS)
    worksheet.set_column('J:J', SS)
    worksheet.set_column('K:K', SS)
    worksheet.set_column('L:L', SS)
    worksheet.set_column('M:M', SS)
    worksheet.write_column('A2', a, bold2)
    worksheet.write_column('B2', b, bold2)
    worksheet.write_column('C2', c, bold2)
    worksheet.write_column('D2', d, bold2)
    worksheet.write_column('E2', e, bold2)
    worksheet.write_column('F2', f, bold2)
    worksheet.write_column('G2', g, bold2)
    worksheet.write_column('H2', h, bold2)
    worksheet.write_column('I2', i, bold2)
    worksheet.write_column('J2', j, bold2)
    worksheet.write_column('K2', k, bold2)
    worksheet.write_column('L2', l, bold2)
    worksheet.write_column('M2', m, bold2)
    workbook.close()
# Run the scrape-and-export routine only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    runtest()

主要思路是:第一步解析网站,第二步选择自己想要的数据,第三步在当前文件夹生成一个 Excel 文件并写入数据。

 第一次写博客,各路大神不喜勿喷,python萌新一枚。

开发环境:Pycharm  python2.7

2019-04-04 11:33:23

Study hard and make progress every day!

萌新签到
原文地址:https://www.cnblogs.com/Harrydz/p/10653926.html