python每日一题:爬虫电影的动态票房信息

题目:从http://movie.mtime.com中读取一个电影的票房信息和相关的同类电影。

方案一:采用Ajax技术,逐步提取动态网站的json,再进行爬虫

知识点:

 1.由于该数据是动态信息,需要找到相关的json,并拼接出所需要的网址,进行爬虫。

 2.利用正则表达式提取网址信息、利用json将字符串字典化。

from bs4 import BeautifulSoup
import re, csv, urllib.request, urllib.parse, time, json, pickle


class url_manager(object):
    """Tracks crawl state: URLs waiting in ``new_urls``, finished in ``old_urls``.

    Lists are used instead of sets (the book uses ``set()``) so crawl order is
    preserved and the queue is easy to inspect; duplicates are prevented by the
    explicit membership check in :meth:`add_new_url`.
    """

    def __init__(self):
        self.new_urls = []  # pending URLs, FIFO order
        self.old_urls = []  # URLs already handed out for crawling

    def add_new_url(self, url):
        """Queue *url* unless it is already pending or already crawled."""
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.append(url)

    def add_new_urls(self, urls):
        """Queue every URL in *urls*; a ``None`` argument is silently ignored."""
        if urls is None:  # idiom fix: compare with `is None`, not `== None`
            return
        for url in urls:
            self.add_new_url(url)

    def have_new_url(self):
        """Return True while there are still URLs waiting to be crawled."""
        return len(self.new_urls) != 0

    def get_new_url(self):
        """Pop the oldest pending URL, record it as crawled, and return it."""
        url = self.new_urls.pop(0)  # FIFO: crawl in discovery order
        self.old_urls.append(url)
        return url


class url_download(object):
    """Fetches a URL and returns the response body as decoded text."""

    def download(self, url):
        """Return the page at *url* as a str, or False when the body is empty.

        The response is closed via a context manager (the original leaked the
        connection).  Note: ``read().decode()`` can never return ``None``, so
        the original ``data == None`` guard was dead code; we check for an
        empty body instead to keep the "no web" fallback meaningful.
        """
        with urllib.request.urlopen(url) as response:
            data = response.read().decode()
        if not data:
            print("no web")
            return False
        return data


class url_scrapy(object):
    """Extracts movie data from the JSONP-style payloads of the Mtime API.

    The book splits box-office parsing by movie category (the JSON keys
    differ); for simplicity this version only reads a few common keys.
    """

    @staticmethod
    def _extract_json(source_data):
        """Strip the JSONP wrapper (``... = {json};``) and parse the JSON body."""
        pattern = re.compile(r'=(.*?);')
        return json.loads(pattern.findall(str(source_data))[0])

    def get_data(self, source_url, source_data):
        """Return ``(source_url, info_dict)`` scraped from *source_data*."""
        payload = self._extract_json(source_data)
        movieresult = {
            # bug fix: the key was misspelled 'MovienIDame' in the original
            'MovieId': payload['value']['movieRating']['MovieId'],
            'RatingFinal': payload['value']['movieRating']['RatingFinal'],
            'movieTitle': payload['value']['movieTitle'],
        }
        print(movieresult)
        return (source_url, movieresult)

    def get_sameurldata(self, source_url, source_data):
        """Return the list of similar-movie URLs found in *source_data*."""
        payload = self._extract_json(source_data)
        return [movie['url'] for movie in payload['value']['movieList']]



class output_url(object):
    """Builds the dynamic Mtime ``Movie.api`` request URLs for a movie page."""

    def _build_api_url(self, root_url, method):
        """Assemble the Movie.api URL for *root_url* calling API *method*.

        Bug fixes vs. the original:
        - the pattern ``r'/(d+)/'`` matched literal ``d`` characters; ``\\d+``
          extracts the numeric movie id from the page URL;
        - the multi-line URL literal lacked parentheses, which was a syntax
          error; implicit concatenation now happens inside ``(...)``.
        """
        # trailing "11111" mimics the millisecond-ish suffix the site expects
        timestamp = time.strftime("%Y%m%d%H%M%S11111", time.localtime())
        movie_id = re.findall(r'/(\d+)/', root_url)[0]
        # percent-encoded form of http://movie.mtime.com/<id>/
        encoded_url = "http%3A%2F%2Fmovie.mtime.com%2F" + movie_id + '%2F'
        return ('http://service.library.mtime.com/Movie.api?'
                'Ajax_CallBack=true'
                '&Ajax_CallBackType=Mtime.Library.Services'
                '&Ajax_CallBackMethod=%s'
                '&Ajax_CrossDomain=1'
                '&Ajax_RequestUrl=%s'
                '&t=%s'
                '&Ajax_CallBackArgument0=%s'
                % (method, encoded_url, timestamp, movie_id))

    def output_scroe(self, root_url):
        """Return the API URL for the movie's rating / box-office data."""
        return self._build_api_url(root_url, 'GetMovieOverviewRating')

    def output_sameurl(self, root_url):
        """Return the API URL for similar-movie recommendations."""
        return self._build_api_url(
            root_url, 'GetSimilarRecommenMovieInfoByMovieId')

class output_data(object):
    """Appends scraped rows to ``pachong.csv`` on disk."""

    def data_save(self, data):
        """Append the rows in *data* (an iterable of rows) to pachong.csv.

        Bug fix: the original source had a literal newline inside the
        ``lineterminator`` string, which is a syntax error; it is now ``'\\n'``.
        """
        with open('pachong.csv', "a+", encoding='utf-8') as f:
            writer = csv.writer(f, lineterminator='\n')
            writer.writerows(data)


class controllers(object):
    """Wires manager, downloader, scraper and CSV output together."""

    def __init__(self):
        self.manager = url_manager()
        self.download = url_download()
        self.scrapy = url_scrapy()
        self.output = output_data()
        self.scoreurl = output_url()

    def control(self, url):
        """Crawl up to 200 movies starting from *url*, following similar-movie links."""
        self.manager.add_new_url(url)
        num = 1
        while True:  # idiom: `while True` instead of `while (1)`
            if num > 200:  # hard stop after 200 movies
                break
            if not self.manager.have_new_url():
                print('has no url')
                break
            url_down = self.manager.get_new_url()
            score_url = self.scoreurl.output_scroe(url_down)
            same_url = self.scoreurl.output_sameurl(url_down)
            info = self.download.download(score_url)
            url_info = self.download.download(same_url)
            # get_data returns (source_url, result_dict); the original unpacked
            # these as `data1, url1` and then saved data1 — the URL string —
            # which made csv.writerows emit one character per row.
            page_url, movie_info = self.scrapy.get_data(url_down, info)
            similar_urls = self.scrapy.get_sameurldata(url_down, url_info)
            if movie_info is not None:
                # save the scraped record as a single CSV row
                self.output.data_save([list(movie_info.values())])
                print(num, "is finished:", url_down)
                num += 1
            self.manager.add_new_urls(similar_urls)


if __name__ == "__main__":
    url = r'http://movie.mtime.com/225824/'
    a = controllers()
    a.control(url)
方案二:采用selenium技术加载动态网站页面,直接获取渲染后的数据(本文未给出该方案的实现)

原文地址:https://www.cnblogs.com/xuehaiwuya0000/p/10622924.html