python 爬虫爬取历年双色球开奖信息

 目前写的这些爬虫针对的都是静态网页;对于一些高级网页(像经过JS渲染的页面),以我目前的技术还不能解决。自己也还在慢慢学习的过程中,如有错误,欢迎指正;

对于前端知识本人并不懂,过程中如果涉及到前端知识,也是百度而来;毕竟爬虫和前端页面打交道多,前端知识还是要多学习; 

此篇还是继续静态页面,更换了不同的内容,以及涉及到多个python 模块和自己二次封装的模块,个人感觉这些模块不使用在爬虫方面也是很有用的;

第一部分,封装了自带模块logging,其中使用了getpass 模块,用来获取记录日志时使用的用户名,都是些简单的用法;关于注释,本来已写好,但并未上传到git,导致此次上传的代码没有注释,下次注意,哈哈;

个人建议:在学习python 过程中多练习写代码,在写的过程中去理解其中的用法;

#!/usr/bin/env python
#coding:utf-8
#author chenjisheng
#date 20171129
import logging
import getpass


class MyLog(object):
    """Thin wrapper around a ``logging.Logger`` named after the current user.

    Records go to two destinations that share one format:

    * the console (``StreamHandler``), which receives every record the
      logger accepts, i.e. DEBUG and above;
    * ``./progress.log`` (``FileHandler``), which only keeps ERROR and above.
    """

    def __init__(self):
        """Look up the per-user logger and attach its handlers once."""
        user = getpass.getuser()
        self.logger = logging.getLogger(user)
        self.logger.setLevel(logging.DEBUG)

        # logging.getLogger() hands back the same logger object for the same
        # name, so a second MyLog() must not add the handlers again;
        # otherwise every record would be written once per instance.
        if self.logger.handlers:
            return

        logFile = './progress.log'
        formatter = logging.Formatter(
            '%(asctime) -12s %(levelname)-8s %(name) -10s %(message)-12s'
        )

        # File output: errors and worse only.
        logHand = logging.FileHandler(logFile)
        logHand.setFormatter(formatter)
        logHand.setLevel(logging.ERROR)

        # Console output: no handler-level filter, so the logger's DEBUG
        # threshold applies.
        logHandt = logging.StreamHandler()
        logHandt.setFormatter(formatter)

        self.logger.addHandler(logHand)
        self.logger.addHandler(logHandt)

    # One pass-through method per severity level.
    def debug(self, msg):
        """Log msg at DEBUG level."""
        self.logger.debug(msg)

    def info(self, msg):
        """Log msg at INFO level."""
        self.logger.info(msg)

    def warn(self, msg):
        """Log msg at WARNING level."""
        self.logger.warning(msg)

    def error(self, msg):
        """Log msg at ERROR level."""
        self.logger.error(msg)

    def critical(self, msg):
        """Log msg at CRITICAL level."""
        self.logger.critical(msg)

if __name__ == "__main__":
    # Smoke test: push one record through the wrapper at each severity.
    mylog = MyLog()
    samples = (
        (mylog.debug, 'i am debug'),
        (mylog.info, 'i am info'),
        (mylog.warn, 'i am warning'),
        (mylog.error, 'i am error'),
        (mylog.critical, 'i am critical'),
    )
    for emit, text in samples:
        emit(text)

 第二部分:使用了re,urllib2,xlwt,bs4,sys模块;xlwt模块在之前的博客里已写过;bs4 模块大名鼎鼎,不过多解释;至于为什么用它,是因为它简单,其它的爬虫模块我也还不会,也在学习当中;

#!/usr/bin/env python
#coding:utf-8
"""Created on 2017-11-29"""

import re
import urllib2
import xlwt
from bs4 import BeautifulSoup
from myLog import MyLog as mylog
import sys
# Python 2-only workaround: reload(sys) makes sys.setdefaultencoding available
# again so the interpreter-wide implicit str/unicode codec can be set to UTF-8.
# NOTE(review): neither reload() as a builtin nor setdefaultencoding exists on
# Python 3, so these two lines must go if the script is ever ported.
reload(sys)
sys.setdefaultencoding('utf8')

class DoubleColorBallItem(object):
    """Plain record holding the fields scraped from one result-table row.

    Every field defaults to None at class level; the crawler assigns the
    real values on each instance it creates.
    """

    # Text of the first and second table cells (exported under the headers
    # "时间" and "期号").
    date = order = None

    # The six red balls, in the order they appear in the row.
    red1 = red2 = red3 = red4 = red5 = red6 = None

    # The blue ball. The attribute name is spelled "bule" and other code
    # depends on that spelling, so it must stay as is.
    bule = None

    # <strong> text of the 4th, 5th and 6th cells.
    # NOTE(review): money is presumably the sales amount - confirm against
    # the scraped page.
    money = firstPrize = secondPrize = None

class GetDoubleColorBallNumber(object):
    """Crawl double-color-ball result list pages and export them to .xls.

    Creating an instance runs the whole job: build the list-page URLs,
    download and parse each page into DoubleColorBallItem objects, then
    write those items to an Excel workbook with xlwt.
    """

    # Address of the n-th result list page.
    LIST_URL = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_%d.html'

    # Default number of list pages to crawl.
    pages = 2

    # (header title, DoubleColorBallItem attribute) for each exported column,
    # in sheet order. item.money is scraped but has never been part of the
    # export, so it is intentionally absent here.
    _COLUMNS = (
        (u"时间", 'date'),
        (u"期号", 'order'),
        (u"红色1", 'red1'),
        (u"红色2", 'red2'),
        (u"红色3", 'red3'),
        (u"红色4", 'red4'),
        (u"红色5", 'red5'),
        (u"红色6", 'red6'),
        (u"蓝色", 'bule'),
        (u"一等奖", 'firstPrize'),
        (u"二等奖", 'secondPrize'),
    )

    def __init__(self, pages=2):
        """Run the crawl immediately.

        pages -- how many list pages (list_1.html .. list_<pages>.html) to
                 fetch; defaults to the previously hard-coded value 2.
        """
        self.pages = pages
        self.urls = []
        self.log = mylog()
        self.getUrls()
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    def getUrls(self):
        """Append the URL of list pages 1..self.pages to self.urls."""
        # The page count is a fixed setting rather than being read from the
        # site, so no request is needed here.
        for i in range(1, int(self.pages) + 1):
            url = self.LIST_URL % i
            self.urls.append(url)
            self.log.info(u'append URL:%s to URLS \n' % url)

    def getResponseContent(self, url):
        """Download url and return the raw body, or None if anything fails.

        Failures are logged (with their cause) instead of being raised so
        that one bad page does not abort the whole crawl.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
        try:
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            try:
                content = response.read()
            finally:
                # Release the connection even if read() fails.
                response.close()
        except Exception as e:
            self.log.error(u'return datas failed URL:%s reason:%r\n' % (url, e))
            return None
        self.log.info(u'return datas successfuly URL:%s\n' % url)
        return content

    def spider(self, urls):
        """Parse every page in urls and return a list of DoubleColorBallItem."""
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if htmlContent is None:
                # Download failed and was already logged; skip this page
                # rather than handing None to the parser.
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            for row in soup.find_all('tr', attrs={}):
                # Only rows that contain an <em> tag are treated as results.
                if not row.find('em'):
                    continue
                item = self._parseRow(row)
                items.append(item)
                self.log.info(u'get date:%s datas OK\n' % item.date)
        return items

    def _parseRow(self, row):
        """Build one DoubleColorBallItem from a result <tr> element."""
        cells = row.find_all('td')
        balls = [em.get_text() for em in cells[2].find_all('em')]

        item = DoubleColorBallItem()
        item.date = cells[0].get_text()
        item.order = cells[1].get_text()
        # The third cell holds seven <em> tags: six red balls, then blue.
        (item.red1, item.red2, item.red3,
         item.red4, item.red5, item.red6, item.bule) = balls[:7]
        item.money = cells[3].find('strong').get_text()
        item.firstPrize = cells[4].find('strong').get_text()
        item.secondPrize = cells[5].find('strong').get_text()
        return item

    def pipelines(self, items, nu=None):
        """Write items to the workbook file, one item per row.

        items -- iterable of DoubleColorBallItem.
        nu    -- ignored; kept only so existing two-argument calls still
                 work (the value was always overwritten before use).

        Row 0 holds the headers and data starts at row 1. Columns start at
        index 1, leaving column 0 empty.
        """
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet(u"双色球记录")

        for col, (title, _field) in enumerate(self._COLUMNS, 1):
            sheet.write(0, col, label=title)

        for rowIndex, item in enumerate(items, 1):
            for col, (_title, field) in enumerate(self._COLUMNS, 1):
                sheet.write(rowIndex, col, label=getattr(item, field))

        book.save(u"双色球记录表.xls")
if __name__ == "__main__":
    # Instantiation alone drives the full run: the constructor builds the URL
    # list, scrapes every page and writes the spreadsheet.
    GDCBN = GetDoubleColorBallNumber()

 以上部分,也是学习了别人经验,也从代码中学到了不少知识,愿本文也能给你带来灵感;

原文地址:https://www.cnblogs.com/Mail-maomao/p/7955389.html