存档

# -*- coding: utf-8 -*-
import urllib2,cookielib
import urllib
import cStringIO
import datetime
from PIL import Image
from lxml import etree
import sys
reload(sys)
sys.setdefaultencoding('utf8')

def setOpener():
    cookie = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    opener.addheaders.append(('User-Agent','Mozilla/5.0 (Windows NT 5.1; rv:25.0) Gecko/20100101 Firefox/25.0'))
    return opener

def md5(str):
    """Return the hexadecimal MD5 digest of *str*.

    Args:
        str: A byte string, or unicode text (encoded as UTF-8 before
            hashing). The parameter name shadows the builtin but is kept so
            existing keyword callers continue to work.

    Returns:
        The 32-character lowercase hex digest, or ``''`` when *str* is
        neither a byte string nor unicode text.
    """
    import hashlib

    value = str
    # Unicode text used to fall through to the '' branch, silently producing
    # an empty hash; hash its UTF-8 encoding instead.
    if isinstance(value, type(u'')):
        value = value.encode('utf-8')
    if not isinstance(value, bytes):
        return ''
    return hashlib.md5(value).hexdigest()

class spider(object):
    """Client that logs in to the site at 210.42.121.241 and saves the score page.

    Intended call order: ``getCaptcha()`` to obtain the captcha image, set
    ``studentID`` / ``password`` / ``captcha``, then ``loginMainPage()`` and
    finally ``getAndSaveScore()``. All requests go through one shared opener
    so cookies set by earlier responses are sent with later requests.
    """

    def __init__(self):
        self.opener = setOpener()  # shared opener; keeps cookie information
        self.imgUrl = 'http://210.42.121.241/servlet/GenImg'
        self.loginUrl = 'http://210.42.121.241/servlet/Login'
        self.queryScoreUrl = 'http://210.42.121.241/servlet/Svlt_QueryStuScore'
        self.studentID = ''
        self.password = ''
        self.captcha = ''  # captcha text
        self.mainPageContent = ''

    def _fetch(self, request):
        """Open *request* with the shared opener and return the raw body.

        The response is always closed, even when reading fails.
        """
        response = self.opener.open(request)
        try:
            return response.read()
        finally:
            response.close()

    def getCaptcha(self):
        """Download the captcha image and return it as a PIL ``Image``."""
        raw = self._fetch(urllib2.Request(self.imgUrl))
        # Image.open needs a file-like object, so wrap the bytes in memory.
        return Image.open(cStringIO.StringIO(raw))

    def loginMainPage(self):
        """POST the login form and store the decoded reply in ``mainPageContent``.

        The password is sent as its MD5 hex digest.

        Raises:
            UnicodeDecodeError: if the reply is not valid gb2312.
        """
        password = self.password
        # md5() is only guaranteed to hash byte strings; encode unicode text
        # here so a unicode password is not turned into an empty hash.
        if isinstance(password, type(u'')):
            password = password.encode('utf-8')
        # Data to POST.
        postdata = urllib.urlencode({
            'id': self.studentID,
            'pwd': md5(password),
            'xdvfb': self.captcha,
        })
        req = urllib2.Request(url=self.loginUrl, data=postdata)
        self.mainPageContent = self._fetch(req).decode('gb2312')

    def getAndSaveScore(self):
        """Query the score page and write it to ``inputScore.html``.

        Returns:
            1 on success; 0 (after printing an error) when no token could be
            extracted from ``mainPageContent``.

        Raises:
            UnicodeDecodeError: if the score page is not valid gb2312.
        """
        content = self.mainPageContent
        # Nothing fetched yet means there is no page to parse.
        page = etree.HTML(content) if content else None
        if page is None:
            onclick = []
        else:
            onclick = page.xpath('//div[@id="school"]/@onclick')
        # The token is taken from fixed offsets 65..100 of the onclick
        # attribute. NOTE(review): presumably a 36-character CSRF token
        # embedded in a URL -- confirm against the live page.
        token = onclick[0][65:101] if onclick else ''
        if not token:
            print("Error:未能正确打开主页面")
            return 0

        GMT_FORMAT = '%a, %d %b %Y %H:%M:%S GMT'
        GMT_time = datetime.datetime.utcnow().strftime(GMT_FORMAT)
        getParams = urllib.urlencode({
            'csrftoken': token,
            'learnType': '',
            'scoreFlag': '0',
            't': GMT_time,
            'term': '',
            'year': '0',
        })
        fullUrl = self.queryScoreUrl + '?' + getParams

        # The page is gb2312-encoded, so it has to be decoded first.
        result = self._fetch(urllib2.Request(fullUrl)).decode('gb2312')

        # Encode explicitly rather than relying on the process-wide default
        # encoding when writing unicode to a binary file.
        # NOTE(review): the file is written as UTF-8; if the HTML declares a
        # gb2312 charset in a <meta> tag, browsers may mis-render it.
        with open('inputScore.html', 'wb') as out:
            out.write(result.encode('utf-8'))
        return 1


#mySpider=spider()
#mySpider.getCaptcha()
#mySpider.loginMainPage()
#mySpider.getAndSaveScore()




原文地址:https://www.cnblogs.com/muyangshaonian/p/9650509.html