Python通过HTTP协议定期抓取文件

Python通过HTTP协议定期抓取文件 - 天地一沙鸥 - 博客频道 - CSDN.NET

Python通过HTTP协议定期抓取文件

分类：
Python

2007-07-26 18:56
698人阅读
评论(1)
收藏
举报
可以扩充成为简单的抓取工具，定时抓取

#!usr/bin/python

import urllib2,time;
class ErrorHandler(urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, headers):
        result = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
        result.status = code
        return result

URL='http://www.ibm.com/developerworks/js/ajax1.js'
req=urllib2.Request(URL)
mgr=urllib2.build_opener(ErrorHandler())

while True:
    ns=mgr.open(req)
    if(ns.headers.has_key('last-modified')):
        modified=ns.headers.get('last-modified')
    if(ns.code==304):
        print '''
          ==============================
              NOT MODIFIED
          ==============================
        '''
    elif(ns.code==200):
        print ns.read()
    else:
        print 'there is an error';

    if(not locals().has_key('modified')):
        modified=time.time();
    req.add_header('If-Modified-Since',modified)
    time.sleep(10)