使用 Python 抓取伯乐在线的全部文章,对标题分词后存入 MongoDB 中

依赖包:

1. pymongo

2. jieba


# -*- coding: utf-8 -*-

"""
@author: jiangfuqiang
"""
from HTMLParser import HTMLParser
import urllib2
import sys
import pymongo
import time
import jieba
import traceback

# Force the interpreter-wide default string encoding to UTF-8 when it is
# currently something else.
# NOTE(review): this is a Python 2-only idiom. `sys.setdefaultencoding` is
# documented as being removed from `sys` during start-up, which is presumably
# why `sys` is reloaded before the call; the order of the two statements
# below must therefore be kept.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)
class FetchJobble(HTMLParser):
    """Collect article summaries from one article-listing HTML page.

    Feed the page markup with ``feed()``.  The parser is a small state
    machine driven by boolean flags: a ``<div class="post-thumb">`` opens an
    article, later tags fill in its fields, and the ``href`` of the first
    link that follows a ``<span class="read-more">`` closes it.

    Each finished article is a dict appended to ``self.result``.  It always
    holds ``redmoreLink`` and ``keyword`` (the title segmented by ``jieba``
    and joined with commas) and, when the matching markup was seen,
    ``imgSrc``, ``title``, ``tag``, ``comment`` and ``desc``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Never set to True by this class; kept because the <img> branch
        # reads it and external code may rely on the attribute existing.
        self.isPostMeta = False
        self._reset_flags()
        self.data = {}      # fields of the article currently being parsed
        self.result = []    # finished articles, in document order

    def _reset_flags(self):
        """Put every per-article state flag back to its idle value."""
        self.isPostThumb = False
        self.isMetaTitle = False
        self.isCategoryTag = False
        self.isComment = False
        self.isexcerpt = False
        self.isReadMore = False
        self.isPicture = False

    def _finish_article(self, link):
        """Store ``link`` as the read-more URL, emit the article and reset."""
        self.data['redmoreLink'] = link
        # A page without a recognisable title must not abort the whole
        # parse, so fall back to an empty title (and thus empty keywords).
        title = self.data.get('title', '')
        self.data['keyword'] = ",".join(jieba.cut(title))
        self.result.append(self.data)
        self._reset_flags()
        self.data = {}

    def handle_starttag(self, tag, attrs):
        """Update the parser state for an opening tag.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``; ``value`` is ``None`` for attributes written without
        a value, which is why it is checked before calling ``find``.
        """
        if tag == 'div':
            for key, value in attrs:
                if key != 'class':
                    continue
                if value == 'post-thumb':
                    self.isPostThumb = True
                elif value == 'meta-title':
                    self.isMetaTitle = True
        elif tag == 'a' and self.isPostThumb:
            for key, value in attrs:
                if self.isReadMore:
                    if key == 'href':
                        self._finish_article(value)
                        # The article is closed: the remaining attributes of
                        # this link must not set flags for the next article.
                        break
                elif key == 'class':
                    if value == 'meta-title':
                        self.isMetaTitle = True
                elif key == 'rel':
                    if value == 'category tag':
                        self.isCategoryTag = True
                elif key == 'href':
                    # "> 0" (not ">= 0") is kept from the original: a value
                    # that starts with '#respond' does not count.
                    if value is not None and value.find('#respond') > 0:
                        self.isComment = True
        elif tag == 'span' and self.isComment:
            for key, value in attrs:
                if key != 'class':
                    continue
                if value == 'excerpt':
                    self.isexcerpt = True
                elif value == 'read-more':
                    self.isReadMore = True
        elif tag == 'img' and self.isPostThumb and not self.isPostMeta:
            for key, value in attrs:
                if key == 'src':
                    self.data['imgSrc'] = value

    def handle_endtag(self, tag):
        """Closing tags carry no information this parser needs."""
        pass

    def handle_data(self, data):
        """Store a text node in the field selected by the active flag.

        Flags are checked in a fixed priority order (title, category tag,
        comment count, excerpt) and only the first match consumes ``data``.
        """
        if self.isMetaTitle:
            self.data['title'] = data
            self.isMetaTitle = False
        elif self.isCategoryTag:
            # Several category links accumulate into one comma-joined string.
            if 'tag' in self.data:
                self.data['tag'] = self.data['tag'] + "," + data
            else:
                self.data['tag'] = data
            self.isCategoryTag = False
        elif self.isComment and 'comment' not in self.data:
            # Only the first space-separated token of the first text node
            # seen after the comment link is kept.
            self.data['comment'] = data.split(" ")[0]
        elif self.isexcerpt:
            self.data['desc'] = data
            self.isexcerpt = False

    def getResult(self):
        """Return the list of article dicts collected so far."""
        return self.result

def _fetch_page(page_url, headers):
    """Download ``page_url`` with the given headers and return the raw body.

    The response object is closed even when reading it fails.
    """
    response = urllib2.urlopen(urllib2.Request(page_url, headers=headers))
    try:
        return response.read()
    finally:
        response.close()


if __name__ == "__main__":
    # Crawl the listing pages one by one, starting at page 1, and store every
    # parsed article in the `fetch_blog` collection of the local `blog`
    # database.  The crawl stops at the first page that yields no article.

    # pymongo.Connection was removed in pymongo 3.0; prefer MongoClient and
    # fall back to Connection only on installs that predate it.
    client_class = getattr(pymongo, 'MongoClient', None) or pymongo.Connection
    con = client_class('localhost', 27017)
    db = con.blog

    fetchblog = db.fetch_blog

    url = "http://blog.jobbole.com/all-posts/page/%d"
    count = 1
    flag = False
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

    # Without a bound, a persistent failure (HTTP error, database down) would
    # retry the same page forever with no pause between requests.
    max_failures = 3
    retry_delay = 5
    failures = 0

    while not flag:
        try:
            data = _fetch_page(url % count, headers)
            fj = FetchJobble()
            fj.feed(data)
            result = fj.getResult()
            if len(result) < 1:
                flag = True
            else:
                for doc in result:
                    # NOTE(review): `insert` is deprecated in pymongo 3.x in
                    # favour of `insert_one`; kept for older installs.
                    fetchblog.insert(doc)
                print("page is %d" % count)
                count += 1
                failures = 0

                # Be polite to the remote server between pages.
                time.sleep(5)
        except Exception as e:
            traceback.print_exc()
            print("parse error %s" % e)
            failures += 1
            if failures >= max_failures:
                # Give up instead of hammering the same page indefinitely.
                flag = True
            else:
                time.sleep(retry_delay)

原文地址:https://www.cnblogs.com/lcchuguo/p/4008352.html