Python静态网页爬虫之XPath(简单的博客更新提醒功能)

直接上代码:

#!/usr/bin/env python3
#author:Alan
#-*- coding: utf-8 -*-

import requests
from lxml import etree
import datetime,time
import os









class xxoohelper(object):
    """Fetch a blog index page and track which posts have already been seen.

    The newest post is reduced to a single string (title + summary + post
    time); seen strings are stored one per line in ``RECORD_FILE``.
    """

    # Record file, relative to the current working directory.
    RECORD_FILE = 'myblog.txt'
    # Seconds to wait for the server; requests has no timeout by default.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Page that is polled for new posts.
        self.url = 'http://www.cnblogs.com/alan-babyblog/'

    def getSource(self):
        """Download ``self.url`` and return the response body as bytes."""
        # .content yields bytes (.text would yield str); the bytes are handed
        # to the HTML parser as-is.
        html = requests.get(self.url, timeout=self.REQUEST_TIMEOUT).content
        return html

    @staticmethod
    def _first_text(selector, path):
        """Return the stripped first text node matched by XPath ``path``."""
        matches = selector.xpath(path)
        if not matches:
            # Same exception type as indexing an empty list, but it names the
            # expression that failed to match.
            raise IndexError('no text matched XPath {0!r}'.format(path))
        return matches[0].strip()

    @staticmethod
    def _record_line(text):
        """Return ``text`` as exactly one newline-terminated record line."""
        # Records are compared line by line, so a line break inside the text
        # would make the record impossible to find again.
        flat = text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
        return flat + '\n'

    def getContent(self, html):
        """Extract title, summary and post time from ``html`` as one string.

        Raises:
            IndexError: if one of the XPath expressions matches nothing.
        """
        selector = etree.HTML(html)
        # Paths go from the outer containers down to the wanted text node.
        title = self._first_text(selector, '//div[1]/div[2]/a/text()')
        content = self._first_text(
            selector, '//div[1]/div[2]/div[1]/div/div[1]/div[3]/div/text()')
        post_time = self._first_text(
            selector, '//div[1]/div[2]/div[1]/div/div[1]/div[5]/text()')
        send_text = title + content + post_time
        return send_text

    def tosave(self, text):
        """Append ``text`` to the record file as a single line."""
        # Explicit UTF-8: the locale default encoding may not be able to
        # represent the scraped text.
        with open(self.RECORD_FILE, 'a', encoding='utf-8') as f:
            f.write(self._record_line(text))

    def tocheck(self, data):
        """Return True if ``data`` has not been recorded yet, else False."""
        if not os.path.exists(self.RECORD_FILE):
            # No record file yet, so everything counts as new.
            return True
        # errors='replace' keeps a record file written in another encoding
        # from raising; its lines simply will not match.
        with open(self.RECORD_FILE, 'r', encoding='utf-8',
                  errors='replace') as f:
            existblog = f.readlines()
        return self._record_line(data) not in existblog
if __name__ == '__main__':
    # Script entry point: poll the page forever, announcing and recording
    # each post that has not been seen before.
    helper = xxoohelper()
    while True:
        try:
            source = helper.getSource()
            content = helper.getContent(source)
        except (requests.RequestException, IndexError) as err:
            # A network failure or an unexpected page layout should not stop
            # the monitor; report it and try again on the next round.
            print('抓取失败:', err)
        else:
            if helper.tocheck(content):
                post_time = str(datetime.datetime.now())
                print(post_time, '有新内容\n', content)
                helper.tosave(content)
            else:
                print('扫描中......')
        # Wait 30 seconds between polls.
        time.sleep(30)

  

原文地址:https://www.cnblogs.com/alan-babyblog/p/5522457.html