Simulating a Douban login with a crawler

Method 1: fetch the login page, extract the captcha, and type the captcha solution in yourself to complete the login (essentially a manual simulated login). The script below is Python 2 (it uses HTMLParser and raw_input).

# -*- coding: utf-8 -*-
import requests
from HTMLParser import HTMLParser


class DoubanClient(object):
    def __init__(self):
        object.__init__(self)
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36',
                   'origin': 'http://www.douban.com'}
        self.session = requests.session()  # one shared session so login cookies persist across requests
        self.session.headers.update(headers)


    def login(self, username, password,
              source='index_nav',
              redir='http://www.douban.com/',
              login='登录'):  # '登录' ("log in") is the literal submit-button value the form expects

        url = 'https://www.douban.com/accounts/login'
        r = self.session.get(url)
        (captcha_id, captcha_url) = _get_captcha(r.content)
        if captcha_id:
            captcha_solution = raw_input('Please enter the captcha solution shown at [%s]: ' % captcha_url)
        url = 'https://www.douban.com/accounts/login'
        data = {'form_email': username,
                'form_password': password,
                'source': source,
                'redir': redir,
                'login': login}
        headers = {'referer': 'http://www.douban.com/accounts/login?source=main',
                   'host': 'accounts.douban.com'}
        if captcha_id:
            data['captcha-id'] = captcha_id
            data['captcha-solution'] = captcha_solution
        self.session.post(url, data=data, headers=headers)
        print(self.session.cookies.items())

    def edit_signature(self, username, signature):
        url = 'https://www.douban.com/people/%s/' % username
        r = self.session.get(url)
        data = {'ck': _get_ck(r.content), 'signature': signature}  # 'ck' is a hidden form token scraped from the profile page
        url = 'https://www.douban.com/j/people/%s/edit_signature' % username
        headers = {'referer': url,
                   'host': 'www.douban.com',
                   'x-requested-with': 'XMLHttpRequest'}
        r = self.session.post(url, data=data, headers=headers)
        print(r.content)


def _attr(attrs, attrname):
    for attr in attrs:
        if attr[0] == attrname:
            return attr[1]
    return None


def _get_captcha(content):

    class CaptchaParser(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.captcha_id = None
            self.captcha_url = None

        def handle_starttag(self, tag, attrs):
            if tag == 'img' and _attr(attrs, 'id') == 'captcha_image' and _attr(attrs, 'class') == 'captcha_image':
                self.captcha_url = _attr(attrs, 'src')

            if tag == 'input' and _attr(attrs, 'type') == 'hidden' and _attr(attrs, 'name') == 'captcha-id':
                self.captcha_id = _attr(attrs, 'value')

    p = CaptchaParser()
    p.feed(content)
    return p.captcha_id, p.captcha_url


def _get_ck(content):

    class CKParser(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.ck = None

        def handle_starttag(self, tag, attrs):
            if tag == 'input' and _attr(attrs, 'type') == 'hidden' and _attr(attrs, 'name') == 'ck':
                self.ck = _attr(attrs, 'value')

    p = CKParser()
    p.feed(content)
    return p.ck


if __name__ == '__main__':
    c = DoubanClient()
    c.login('username@douban.com', 'password@douban.com')  # replace with your own Douban email and password
    c.edit_signature('username', 'python crawler basics')
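
A quick way to confirm that the session really holds the login cookies is to reuse it for a request that only looks right when you are logged in. A minimal sketch (the profile URL and username here are placeholders, not part of the original script):

# after c.login(...), reuse the same authenticated session
r = c.session.get('https://www.douban.com/people/username/')
print(r.status_code)  # 200 plus your own profile content suggests the login worked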

Method 2: log in once in a browser first to obtain your login cookie, then paste it into the script (the cookie expires, so it only stays valid for a short while).

# -*- coding: utf-8 -*-

import urllib2

HEADERS = {"cookie": '写你的cookie'}#里面写你在www.douban.com的cookie
url = 'http://www.douban.com/'
req = urllib2.Request(url, headers=HEADERS)
text = urllib2.urlopen(req).read()

print text  # printing the logged-in homepage proves the login succeeded
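
For reference, the same cookie trick can be written with the requests library instead of urllib2 (a sketch; the cookie string is whatever you copy from your browser's developer tools for www.douban.com):

import requests

HEADERS = {'Cookie': 'your cookie string'}  # cookie copied from a logged-in browser session
r = requests.get('https://www.douban.com/', headers=HEADERS)
print(r.text[:200])  # if your username appears in the page, the cookie is still valid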
Original post: https://www.cnblogs.com/tangbinghaochi/p/6140582.html