爬虫----scrapy账号登录豆瓣,并且重定向到电影界面,获取界面信息

Request:用于向指定 url 发起普通请求。本文用它打开登录页面,并在登录成功后跳转到电影页面

FormRequest:用于表单提交,即在登录界面输入账号、密码后点击"登录"的过程



# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request, FormRequest

class LoginSpider(scrapy.Spider):
    """Log in to Douban through its login form, then open the movie page.

    Request chain:

    1. ``start_requests`` fetches the login page.
    2. ``parse`` submits the login form found on that page.
    3. ``next`` scans the post-login page for a link whose text contains
       "电影" (movies) and requests it.
    4. ``next2`` prints the body of the movie page.

    Every request carries the same ``cookiejar`` meta value so that the
    cookies set during login are reused by the follow-up requests.
    """

    name = 'login'
    allowed_domains = ['www.douban.com', 'movie.douban.com']

    # Single definition of the login endpoint; used both for the initial GET
    # and as the target of the form POST.
    login_url = 'https://www.douban.com/accounts/login?source=main'

    # Placeholder credentials. Replace them, or override per run with
    #   scrapy crawl login -a form_email=... -a form_password=...
    form_email = '*********@qq.com'
    form_password = '**********'

    def start_requests(self):
        """Return the initial request for the login page.

        ``start_urls`` is not used: the first request is built by hand so
        that it can carry the ``cookiejar`` meta key.
        """
        return [Request(
            url=self.login_url,
            meta={'cookiejar': 1},
            callback=self.parse,
        )]

    def parse(self, response):
        """Submit the login form contained in ``response``.

        Returns a one-element list holding the ``FormRequest`` whose
        response is handled by ``next``.
        """
        formdata = {
            'submit': '',
            'form_email': self.form_email,
            'form_password': self.form_password,
        }
        return [FormRequest.from_response(
            response,
            url=self.login_url,
            meta={'cookiejar': response.meta['cookiejar']},
            formdata=formdata,
            callback=self.next,
        )]

    def next(self, response):
        """Find the "电影" link on the post-login page and request it.

        While scanning, any link whose text contains "退出" (log out) is
        printed; presumably its presence indicates the login succeeded.

        Returns a one-element list with the request for the movie page, or
        an empty list when no such link exists on the page.
        """
        movie_url = None
        for link in response.xpath('//a'):
            texts = link.xpath('./text()').extract()
            hrefs = link.xpath('./@href').extract()
            # Anchors without text or without an href cannot be matched or
            # followed, so skip them instead of indexing into an empty list.
            if not texts or not hrefs:
                continue
            label, href = texts[0], hrefs[0]
            if u'退出' in label:
                # Report only; the logout URL is kept out of ``movie_url``
                # so it can never be requested by mistake.
                print(u'%s --- %s' % (label, href))
            if u'电影' in label:
                # Resolve against the page URL in case the href is relative.
                movie_url = response.urljoin(href)
                break

        if movie_url is None:
            self.logger.warning(
                'No link containing the movie label found on %s; '
                'the login may have failed.', response.url)
            return []

        return [Request(
            url=movie_url,
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.next2,
        )]

    def next2(self, response):
        """Print the raw body of the movie page."""
        print(response.body)
原文地址:https://www.cnblogs.com/wozuilang-mdzz/p/9755101.html