Python动态网站的抓取

网页下载器

# coding:utf-8
import requests
import urllib2
import sys
# NOTE(review): this rebinds the builtin name `type` at module level to the
# value returned by sys.getfilesystemencoding(). Nothing in the visible code
# reads it afterwards -- confirm there are no other users before renaming or
# removing it.
type = sys.getfilesystemencoding()
class HtmlDownloader(object):
    """Fetch raw page bodies over HTTP using a browser-like User-Agent."""

    def download(self, url):
        """Download ``url`` and return the response body.

        Args:
            url: Address to fetch. ``None`` is accepted and short-circuits.

        Returns:
            Whatever ``response.read()`` yields when the status code is 200,
            otherwise ``None`` (also returned when ``url`` is ``None``).

        Raises:
            Any error raised by ``urllib2.urlopen`` propagates to the caller.
        """
        if url is None:
            return None

        user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        try:
            if response.getcode() == 200:
                return response.read()
            return None
        finally:
            # Release the underlying connection even when read() raises or
            # the status code is not 200.
            response.close()

网页解析器

# coding:utf-8
import re
import json
class HtmlParser(object):
    """Extract movie links and rating/box-office records from Mtime responses."""

    def parser_url(self, page_url, response):
        """Collect the distinct movie page links found in ``response``.

        Args:
            page_url: URL the content came from (not used by the extraction).
            response: Page content to scan. ``None`` or empty yields no links.

        Returns:
            A de-duplicated list of ``(movie_url, movie_id)`` tuples in
            arbitrary order; an empty list when nothing matches.
        """
        if not response:
            return []
        # \d+ captures the numeric movie id; the dots are escaped so that only
        # the literal host name matches.
        pattern = re.compile(r'(http://movie\.mtime\.com/(\d+)/)')
        # De-duplicate the matches.
        return list(set(pattern.findall(response)))

    def parser_json(self, page_url, response):
        """Parse the asynchronous (AJAX) rating response.

        The JSON payload is taken from the text between the first ``=`` and
        the following ``;`` in ``response``.

        Args:
            page_url: URL the response was fetched from (used in diagnostics).
            response: Raw response text.

        Returns:
            A 14-item tuple describing the movie (see ``_parser_release`` and
            ``_parser_no_release``), or ``None`` when the response is empty,
            contains no payload, or the payload cannot be decoded.
        """
        if not response:
            return None

        # Extract the content between "=" and ";".
        matches = re.findall(r'=(.*?);', response)
        if not matches:
            return None

        try:
            value = json.loads(matches[0])
            payload = value.get('value')
            is_release = payload.get('isRelease')
        except (ValueError, AttributeError) as e:
            # ValueError: payload is not valid JSON.
            # AttributeError: JSON decoded, but not to the expected objects.
            print(e)
            return None

        if not is_release:
            return self._parser_no_release(page_url, value)
        if payload.get('releaseType') is None:
            return self._parser_release(page_url, value)
        return self._parser_no_release(page_url, value, isRelease=2)

    def _parser_release(self, page_url, value):
        """Build the record for a movie that carries box-office data.

        Args:
            page_url: URL of the response, printed when parsing fails.
            value: Decoded JSON object whose ``'value'`` entry holds the
                ``movieTitle``, ``movieRating`` and ``boxOffice`` data.

        Returns:
            ``(MovieId, movieTitle, RatingFinal, ROtherFinal, RPictureFinal,
            RDirectorFinal, RStoryFinal, Usercount, AttitudeCount,
            total box office + unit, today's box office + unit, Rank,
            ShowDays, 1)``, or ``None`` if an expected section is missing.
        """
        try:
            payload = value.get('value')
            rating = payload.get('movieRating')
            box_office = payload.get('boxOffice')
            return (
                rating.get('MovieId'),
                payload.get('movieTitle'),
                rating.get('RatingFinal'),
                rating.get('ROtherFinal'),
                rating.get('RPictureFinal'),
                rating.get('RDirectorFinal'),
                rating.get('RStoryFinal'),
                rating.get('Usercount'),
                rating.get('AttitudeCount'),
                box_office.get('TotalBoxOffice') + box_office.get('TotalBoxOfficeUnit'),
                box_office.get('TodayBoxOffice') + box_office.get('TodayBoxOfficeUnit'),
                # A missing or null rank falls back to 0.
                box_office.get('Rank') or 0,
                box_office.get('ShowDays'),
                1,
            )
        except (AttributeError, TypeError) as e:
            # AttributeError: a section is missing (None has no .get).
            # TypeError: an amount or its unit is missing or not concatenable.
            print('%s %s %s' % (e, page_url, value))
            return None

    def _parser_no_release(self, page_url, value, isRelease=0):
        """Build the record for a movie without box-office data.

        Args:
            page_url: URL of the response, printed when parsing fails.
            value: Decoded JSON object whose ``'value'`` entry holds the
                ``movieTitle`` and ``movieRating`` data.
            isRelease: Flag stored as the last item of the record.

        Returns:
            A tuple with the same layout as ``_parser_release``, using
            ``u'无'`` for both box-office fields and 0 for rank and show days,
            or ``None`` if the rating section is missing.
        """
        try:
            payload = value.get('value')
            rating = payload.get('movieRating')
            return (
                rating.get('MovieId'),
                payload.get('movieTitle'),
                rating.get('RatingFinal'),
                rating.get('ROtherFinal'),
                rating.get('RPictureFinal'),
                rating.get('RDirectorFinal'),
                rating.get('RStoryFinal'),
                rating.get('Usercount'),
                rating.get('AttitudeCount'),
                u'无',
                u'无',
                0,
                0,
                isRelease,
            )
        except (AttributeError, TypeError) as e:
            print('%s %s %s' % (e, page_url, value))
            return None

数据存储器

# coding:utf-8
import MySQLdb

class DataOutput(object):
    """Buffer parsed movie records and write them to a MySQL table in batches."""

    def __init__(self, host='127.0.0.1', user='root', passwd='', db='go',
                 port=3306, charset='utf8'):
        """Open the database connection and make sure the ``MTime`` table exists.

        Args:
            host: MySQL server host.
            user: Login user name.
            passwd: Login password.
            db: Database (schema) name.
            port: MySQL server port.
            charset: Connection character set.
        """
        self.con = MySQLdb.connect(host=host, user=user, passwd=passwd, db=db,
                                   port=port, charset=charset)
        self.cx = self.con.cursor()
        self.create_table('MTime')
        self.datas = []

    def create_table(self, table_name):
        """Create ``table_name`` with the movie record columns if it is absent.

        Args:
            table_name: Name of the table. It is interpolated into the SQL
                text, so it must come from trusted code, never from user input.
        """
        # The parentheses join the fragments into ONE string; without them
        # only the first fragment would be assigned to ``values``.
        values = (
            "id int(11) not null primary key auto_increment,"
            "MovieId int(11),"
            "MovieTitle varchar(40) NOT NULL,"
            "RatingFinal double NOT NULL DEFAULT 0.0,"
            "ROtherFinal double NOT NULL DEFAULT 0.0,"
            "RPictureFinal double NOT NULL DEFAULT 0.0,"
            "RDirectorFinal double NOT NULL DEFAULT 0.0,"
            "RStoryFinal double NOT NULL DEFAULT 0.0,"
            "Usercount int(11) NOT NULL DEFAULT 0,"
            "AttitudeCount int(11) NOT NULL DEFAULT 0,"
            "TotalBoxOffice varchar(20) NOT NULL,"
            "TodayBoxOffice varchar(20) NOT NULL,"
            # RANK is a reserved word in MySQL 8.0.2+, hence the backticks.
            "`Rank` int(11) NOT NULL DEFAULT 0,"
            "ShowDays int(11) NOT NULL DEFAULT 0,"
            "isRelease int(11) NOT NULL"
        )
        self.cx.execute('CREATE TABLE IF NOT EXISTS %s(%s) ENGINE=InnoDB DEFAULT CHARSET=utf8' % (table_name, values))

    def store_data(self, data):
        """Queue one record and flush the queue once it holds more than 10.

        Args:
            data: A 14-item record tuple, or ``None`` (ignored).
        """
        if data is None:
            return
        self.datas.append(data)
        if len(self.datas) > 10:
            self.output_db('MTime')

    def output_db(self, table_name):
        """Insert every queued record into ``table_name`` and commit.

        The connection stays open so later batches can still be written. On
        failure the transaction is rolled back, the queued records are kept,
        and the error is re-raised.

        Args:
            table_name: Target table. It is interpolated into the SQL text,
                so it must come from trusted code; row values are passed as
                query parameters.
        """
        sql = (
            "INSERT INTO " + table_name + " "
            "(MovieId,MovieTitle,RatingFinal,ROtherFinal,RPictureFinal,RDirectorFinal,"
            "RStoryFinal,Usercount,AttitudeCount,TotalBoxOffice,TodayBoxOffice,`Rank`,ShowDays,isRelease) "
            "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        )
        try:
            for row in self.datas:
                self.cx.execute(sql, row)
            self.con.commit()
        except Exception:
            self.con.rollback()
            raise
        # Clear in place only after a successful commit; removing items while
        # iterating would skip every other record.
        del self.datas[:]

    def output_end(self):
        """Flush any remaining records, then close the cursor and connection."""
        try:
            if self.datas:
                self.output_db('MTime')
        finally:
            self.cx.close()
            self.con.close()

爬虫调度器

# coding:utf-8
from UrlManager import UrlManager
from DataOutput import DataOutput
from HtmlDownloader import HtmlDownloader
from HtmlParser import HtmlParser
import time
class SpiderMan(object):
    """Scheduler that wires the downloader, parser and data store together."""

    def __init__(self):
        """Create the downloader, parser and output components."""
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def _build_rank_url(self, movie_url, movie_id):
        """Return the rating-service URL for one movie.

        Args:
            movie_url: Movie page URL, sent as ``Ajax_RequestUrl``.
            movie_id: Movie id, sent as ``Ajax_CallBackArgument0``.
        """
        t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
        # The parentheses join the fragments into ONE template before the
        # %-formatting is applied to it.
        return (
            "http://service.library.mtime.com/Movie.api?"
            "Ajax_CallBack=true"
            "&Ajax_CallBackType=Mtime.Library.Services"
            "&Ajax_CallBackMethod=GetMovieOverviewRating"
            "&Ajax_CrossDomain=1"
            "&Ajax_RequestUrl=%s"
            "&t=%s"
            "&Ajax_CallBackArgument0=%s" % (movie_url, t, movie_id)
        )

    def crawl(self, root_url):
        """Crawl ``root_url``, fetch the rating data of every linked movie and store it.

        A failure on a single movie is printed and skipped. The output is
        always finalized, even when fetching or parsing the root page raises.

        Args:
            root_url: Listing page whose movie links are followed.
        """
        try:
            content = self.downloader.download(root_url)
            # A failed root download yields None; treat it as "no links".
            urls = self.parser.parser_url(root_url, content) if content else None
            for url in urls or []:
                try:
                    rank_url = self._build_rank_url(url[0], url[1])
                    rank_content = self.downloader.download(rank_url)
                    data = self.parser.parser_json(rank_url, rank_content)
                    self.output.store_data(data)
                except Exception as e:
                    # Best effort: one bad movie must not stop the crawl.
                    print(e)
        finally:
            self.output.output_end()
        print("Crawl finish")

if __name__ == '__main__':
    # Script entry point: crawl the Beijing theater listing page.
    root_url = 'http://theater.mtime.com/China_Beijing/'
    spider = SpiderMan()
    spider.crawl(root_url)