Crawler download script

The download script: it defines a Downloader class that throttles requests per domain, optionally routes them through a random proxy, retries on 5xx server errors, and serves repeat requests for the same URL from a disk cache.

#!/usr/bin/python
# -*- coding: utf-8 -*-
import urlparse
import urllib2
import random
import time
from datetime import datetime
import socket
import disk_cache

DEFAULT_AGENT = 'WSWP'    # default User-Agent string
DEFAULT_DELAY = 5         # delay (seconds) between requests to the same domain, to limit download rate
DEFAULT_RETRIES = 1       # number of retries when a server error occurs
DEFAULT_TIMEOUT = 60      # socket timeout (seconds)
CACHE = disk_cache.DiskCache()


class Downloader:
    def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=None,
                 num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT, opener=None, cache=CACHE):
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.opener = opener
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                # this URL is not yet cached
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    # a server error was cached, so ignore it and retry the download
                    result = None
        if result is None:
            # result was not loaded from the cache, so download it now
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy=proxy, num_retries=self.num_retries)
            if self.cache:
                # save the result to the cache for next time
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        print 'Downloading:', url
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except Exception as e:
            print 'Download error:', str(e)
            html = ''
            code = getattr(e, 'code', None)
            if num_retries > 0 and code is not None and 500 <= code < 600:
                # retry on 5xx server errors
                return self.download(url, headers, proxy, num_retries - 1, data)
        return {'html': html, 'code': code}


class Throttle:
    """Add a delay between downloads to the same domain."""

    def __init__(self, delay):
        self.delay = delay
        # timestamp of when each domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # this domain was accessed recently, so wait before downloading again
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


p = Downloader()
x = p('http://www.meituan.com')
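A minimal usage sketch, assuming the Downloader class above is already defined in the current session; the URL, delay, and retry count below are arbitrary placeholder values rather than anything from the original script. The first call goes over the network (after the per-domain throttle) and is written to the disk cache; the repeat call for the same URL is answered from the cache, so no second request is sent.

# a downloader with a 2-second per-domain delay and two retries on 5xx errors (placeholder values)
d = Downloader(delay=2, num_retries=2, proxies=None)

url = 'http://example.webscraping.com'   # placeholder URL
first = d(url)    # downloaded over the network, then stored via DiskCache
second = d(url)   # served from the disk cache, no new request is made
print len(first) == len(second)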

The disk_cache caching script: a dictionary-like DiskCache class that pickles each result together with a timestamp, optionally compresses it with zlib, stores it on disk under a path derived from the URL, and raises KeyError for missing or expired entries.

  

import os
import re
import urlparse
import shutil
import zlib
from datetime import datetime, timedelta
try:
    import cPickle as pickle
except ImportError:
    import pickle


class DiskCache:
    """
    Dictionary interface that stores cached
    values in the file system rather than in memory.
    The file path is formed from the URL of the key.

    >>> cache = DiskCache()
    >>> url = 'http://example.webscraping.com'
    >>> result = {'html': '...'}
    >>> cache[url] = result
    >>> cache[url]['html'] == result['html']
    True
    >>> cache = DiskCache(expires=timedelta())
    >>> cache[url] = result
    >>> cache[url]
    Traceback (most recent call last):
     ...
    KeyError: 'http://example.webscraping.com has expired'
    >>> cache.clear()
    """

    def __init__(self, cache_dir='cache', expires=timedelta(days=30), compress=True):
        """
        cache_dir: the root level folder for the cache
        expires: timedelta of amount of time before a cache entry is considered expired
        compress: whether to compress data in the cache
        """
        self.cache_dir = cache_dir
        self.expires = expires
        self.compress = compress

    def __getitem__(self, url):
        """Load data from disk for this URL
        """
        path = self.url_to_path(url)
        if os.path.exists(path):
            with open(path, 'rb') as fp:
                data = fp.read()
                if self.compress:
                    data = zlib.decompress(data)
                result, timestamp = pickle.loads(data)
                if self.has_expired(timestamp):
                    raise KeyError(url + ' has expired')
                return result
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save data to disk for this URL
        """
        path = self.url_to_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)

        # store the result together with the time it was cached
        data = pickle.dumps((result, datetime.utcnow()))
        if self.compress:
            data = zlib.compress(data)
        with open(path, 'wb') as fp:
            fp.write(data)

    def __delitem__(self, url):
        """Remove the value at this key and any empty parent sub-directories
        """
        path = self.url_to_path(url)
        try:
            os.remove(path)
            os.removedirs(os.path.dirname(path))
        except OSError:
            pass

    def url_to_path(self, url):
        """Create file system path for this URL
        """
        components = urlparse.urlsplit(url)
        # when the path is empty, default to /index.html
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub('[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict the maximum number of characters per path segment
        filename = '/'.join(segment[:255] for segment in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def has_expired(self, timestamp):
        """Return whether this timestamp has expired
        """
        return datetime.utcnow() > timestamp + self.expires

    def clear(self):
        """Remove all the cached values
        """
        if os.path.exists(self.cache_dir):
            shutil.rmtree(self.cache_dir)


if __name__ == '__main__':
    cache = DiskCache()
    print cache
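DiskCache can also be used on its own, as in the short sketch below; the cache directory name and one-day expiry are arbitrary choices for illustration, not values from the original script. An entry older than expires raises KeyError on the next read, which is exactly how the Downloader above detects a stale page and re-downloads it.

from datetime import timedelta

# a cache rooted at 'page_cache' whose entries expire after one day (arbitrary values)
cache = DiskCache(cache_dir='page_cache', expires=timedelta(days=1), compress=True)

url = 'http://example.webscraping.com'
cache[url] = {'html': '<html>...</html>', 'code': 200}  # pickled, compressed, written to disk
print cache[url]['code']                                 # 200, read back from the file system

del cache[url]   # remove this entry and any now-empty parent directories
cache.clear()    # or wipe the entire cache directory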
Original source: https://www.cnblogs.com/yubenliu/p/6055021.html