# Crawl Tencent Video comments by analysing the page's dynamic requests

# -*- coding: utf-8 -*-
# 分析动态网页请求爬取腾讯视频评论
import scrapy
import re
import json
import time
from tencent.items import TencentItem


class TenspiderSpider(scrapy.Spider):
    """Spider that collects the comments attached to Tencent Video cover pages.

    The crawl runs in three steps:

    1. ``start_requests`` pulls the cover id (cid) out of every start URL and
       asks the ``video_comment_id`` endpoint for the matching comment id.
    2. ``parse_video`` reads the comment id from that response and requests
       the comment listing built from ``base_url``.
    3. ``parse_comment`` turns every entry of the listing into a
       ``TencentItem``.
    """

    name = "tenspider"
    # allowed_domains is left unset: the requests issued below go to
    # ncgi.video.qq.com and coral.qq.com, not to the cover page host.
    start_urls = ['http://v.qq.com/x/cover/ga7nei8pd5i9mek.html/']
    # Endpoint used to look up the comment_id of a cover; the cid is appended.
    comment_url = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&cid='
    # Template of the comment listing URL for a given comment_id.
    base_url = 'https://coral.qq.com/article/{comment_id}/comment?commentid=0&reqnum=1000'

    # Captures the cover id between "cover/" and ".html" in a cover page URL.
    _cid_pattern = re.compile(r'cover/(.*?)\.html')

    def start_requests(self):
        """Yield one comment-id lookup request per entry of ``start_urls``.

        Start URLs that do not contain a ``cover/<cid>.html`` segment are
        logged and skipped instead of aborting the crawl.
        """
        for cover_url in self.start_urls:
            match = self._cid_pattern.search(cover_url)
            if match is None:
                self.logger.warning(
                    "No cover id found in start URL %s", cover_url)
                continue
            yield scrapy.Request(
                url=self.comment_url + match.group(1),
                callback=self.parse_video,
            )

    def parse_video(self, response):
        """Read the comment id from the lookup response and request the comments.

        The body is expected to be a JavaScript assignment wrapping a JSON
        object (``<name>={...};``). The object is located by its outermost
        braces, so a ``;`` or ``=`` inside a JSON string cannot cut the
        payload short. Nothing is yielded when the body holds no parsable
        object or no ``comment_id``.
        """
        text = response.text
        start = text.find('{')
        end = text.rfind('}')
        if start == -1 or end < start:
            self.logger.error(
                "No JSON object found in response from %s", response.url)
            return

        try:
            data = json.loads(text[start:end + 1])
        except ValueError:
            self.logger.error(
                "Invalid JSON in response from %s", response.url)
            return

        comment_id = data.get('comment_id')
        if not comment_id:
            # Without an id the formatted URL would point at ".../article/None/...".
            self.logger.error(
                "No comment_id in response from %s", response.url)
            return

        yield scrapy.Request(
            url=self.base_url.format(comment_id=comment_id),
            callback=self.parse_comment,
        )

    def parse_comment(self, response):
        """Yield one ``TencentItem`` per comment in the listing response.

        Each item carries the comment text, the author's nick, region and
        user id, and the comment time formatted as ``YYYY-MM-DD HH:MM:SS``
        in the local timezone. A response without ``data.commentid`` yields
        nothing.
        """
        try:
            data = json.loads(response.text)
        except ValueError:
            self.logger.error(
                "Invalid JSON in response from %s", response.url)
            return

        comments = (data.get('data') or {}).get('commentid') or []
        for each in comments:
            userinfo = each['userinfo']
            # Convert the epoch timestamp to local time, then to a
            # readable string such as "2016-05-05 20:28:54".
            time_local = time.localtime(each['time'])
            date = time.strftime("%Y-%m-%d %H:%M:%S", time_local)

            # A fresh item per comment: re-yielding one shared item would let
            # later comments overwrite earlier ones wherever a reference to
            # the yielded object is still held.
            item = TencentItem()
            item['comment'] = each['content']
            item['user'] = userinfo['nick']
            item['date'] = date
            item['region'] = userinfo['region']
            item['userid'] = userinfo['userid']

            yield item