Python爬虫学习笔记——豆瓣登录(一)

#-*- coding:utf-8 -*-
import re
import urllib
import urllib.request

import html5lib
import requests
from bs4 import BeautifulSoup
# Log in to Douban (solving the captcha by hand when one is shown), then fetch
# the contacts page and save it to spider2.txt.
url1 = 'http://accounts.douban.com/login'
url2 = 'http://www.douban.com/people/*****/contacts'
formdata = {
    "redir": "http://www.douban.com/",
    "form_email": "************",
    "form_password": "*******",
    # 'captcha-solution' and 'captcha-id' are added below once the login page
    # has told us which captcha to solve.
    "login": "登录",
}

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Referer": "http://accounts.douban.com/login",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36",
}

# Use one Session for every request: it keeps all cookies the server sets,
# including those set on intermediate redirect responses. Passing
# `r1.cookies` by hand only forwards the cookies of the final response, which
# is why the login did not carry over to the next request.
session = requests.Session()

r1 = session.post(url1, data=formdata, headers=headers)
rcontent = r1.text
# html5lib is installed, so it is used instead of Python's built-in HTML parser.
soup = BeautifulSoup(rcontent, "html5lib")

# The captcha image is only present when the site asks for one; find()
# returns None otherwise, so check before subscripting.
captchaImg = soup.find('img', id='captcha_image')
if captchaImg is not None:
    captchaAddr = captchaImg['src']
    reCaptchaID = r'<input type="hidden" name="captcha-id" value="(.*?)"/'
    captchaID = re.findall(reCaptchaID, rcontent)
    print(captchaID)
    # Save the captcha image locally so the user can open it and read it.
    urllib.request.urlretrieve(captchaAddr, "captcha.jpg")
    captcha = input('please input the captcha:')
    formdata['captcha-solution'] = captcha
    if captchaID:
        # findall() returns a list; the form field needs the matched string.
        formdata['captcha-id'] = captchaID[0]
    r1 = session.post(url1, data=formdata, headers=headers)

r2 = session.get(url2, headers=headers)
# Force UTF-8: the default encoding on Windows is GBK, which cannot represent
# every character in the page.
with open('spider2.txt', 'w', encoding='utf-8') as f:
    f.write(r2.text)

目前能够登录成功，但是cookies不能用，容我再研究研究requests……

Windows、Python、Sublime的编码真是处处是坑，今天一直在折腾UTF-8和GBK编码的问题，网上很多方法都试了，最后都不管用。

Sublime这边真的没辙了，Ctrl+B一直提示decode error，最后是直接在cmd里import模块运行的。结果还是碰到了GBK编码问题，最后是在open文件的时候强制指定encoding='utf-8'才解决的。

PS：手动输入验证码的做法参考了下面的帖子。其实bs（BeautifulSoup）这个库我还没怎么研究过，不大懂，学Python也没多久，还要再学习学习。之后想看看能否自动识别验证码——我的毕设正好在做人工神经网络，不知道能不能用上。或者也可以绕过验证码，貌似豆瓣有提供API，不过自己就当是边学边练习吧。

参考链接：http://blog.csdn.net/andrewseu/article/details/47253791

http://blog.csdn.net/greatpresident/article/details/8209712