Python 爬取煎蛋网（jandan.net）妹子图的爬虫脚本

 1 # python3
 2 # jiandan meizi tu
 3 import urllib
 4 import urllib.request as req
 5 import os
 6 import time
 7 import random
 8 
 9 
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    A User-Agent header is picked at random from a small pool so that
    consecutive requests do not all present the same client string.

    Args:
        url: the HTTP URL to fetch.
    Returns:
        The response body as ``bytes`` (caller decodes if needed).
    """
    # Original code built four Request objects only to use one; pick the
    # User-Agent string instead and build a single request.
    user_agents = ['Mozilla/4.0', 'Mozilla/4.1', 'Mozilla/4.5', 'Mozilla/5.1']
    request = urllib.request.Request(
        url, headers={'User-Agent': random.choice(user_agents)})
    # Context manager closes the connection (original leaked it).
    with urllib.request.urlopen(request) as response:
        return response.read()
21 
def url_open2(url):
    """Fetch *url* through a randomly chosen HTTP proxy and return bytes.

    Like :func:`url_open`, but routes the request via a proxy picked from
    ``ip_list``.  The opener is used locally instead of being installed
    globally, so other urllib users in this process are unaffected.

    Args:
        url: the HTTP URL to fetch.
    Returns:
        The response body as ``bytes``.
    """
    # One request with a random User-Agent (original built four Requests).
    user_agents = ['Mozilla/4.0', 'Mozilla/4.1', 'Mozilla/4.5', 'Mozilla/5.1']
    request = req.Request(url, headers={'User-Agent': random.choice(user_agents)})

    # NOTE(review): single hard-coded proxy from 2016 — almost certainly
    # dead by now; replace with a live proxy list before running.
    ip_list = ['117.135.251.136:82']
    ip = random.choice(ip_list)
    print(ip)

    proxy = req.ProxyHandler({'http': ip})
    opener = req.build_opener(proxy, req.HTTPHandler)
    # Use the opener directly instead of install_opener(): the original
    # mutated urllib's process-global opener on every call.
    with opener.open(request) as conn:
        return conn.read()
40 
def get_current_page(url):
    """Scrape the current comment-page number from *url*.

    Looks for the 'current-comment-page' marker in the page source and
    returns the digits between it and the following ']' as a string.
    """
    page_html = url_open2(url).decode('utf-8')
    # +23 skips past the marker text up to the first digit.
    start = page_html.find('current-comment-page') + 23
    end = page_html.find(']', start)
    return page_html[start:end]
46 
def find_imgs(url):
    """Collect every .jpg image address found in the page at *url*.

    Scans for 'img src="http' anchors; when a '.jpg' suffix appears
    within 255 characters of the anchor, the URL between the quote and
    the suffix (inclusive) is kept.
    """
    page = url_open2(url).decode('utf-8')
    addrs = []
    pos = page.find('img src="http')
    while pos != -1:
        end = page.find('.jpg', pos, pos + 255)
        if end == -1:
            # No .jpg nearby — resume scanning just past this anchor.
            next_start = pos + 13
        else:
            # Slice from after 'img src="' through the '.jpg' suffix.
            addrs.append(page[pos + 9:end + 4])
            next_start = end
        pos = page.find('img src="http', next_start)
    return addrs
59 
def save_imgs(folder, img_addrs):
    """Download each image URL in *img_addrs* into the current directory.

    *folder* is accepted for interface compatibility but is not used
    here: the caller (download_mm) has already chdir'd into it, so the
    basename of each URL is written relative to the working directory.
    """
    for addr in img_addrs:
        name = addr.split('/')[-1]
        with open(name, 'wb') as out:
            out.write(url_open2(addr))
66 
67 
def download_mm(folder='xx', pages=300):
    """Download up to *pages* pages of images, newest first.

    Creates *folder* if necessary, chdirs into it, reads the current
    page number from the site index, then walks backwards one page at
    a time, saving every image found on each page.

    Args:
        folder: directory to create/enter and save images into.
        pages: how many pages to walk back from the newest one.
    """
    # Original had os.mkdir commented out, so chdir crashed on a fresh
    # run; makedirs with exist_ok also tolerates an existing folder.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    url = 'http://jandan.net/ooxx/'
    current_page_num = int(get_current_page(url))
    for i in range(pages):
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
              'current_page_num', current_page_num)
        if i % 3 == 0:
            # Pause every third page to stay polite to the server.
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                  "sleep 2 seconds...")
            time.sleep(2)
        current_page_num -= 1
        page_url = url + 'page-' + str(current_page_num) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)
83 
if __name__ == '__main__':
    # Script entry point: start the crawl with the default settings.
    download_mm()
原文地址：https://www.cnblogs.com/duanguyuan/p/5208586.html