爬取 www.mmjpg.com 网站图片，你懂的哦！

 1 #!/usr/bin/env python
 2 # -*- coding: utf-8 -*-
 3 import requests
 4 import random
 5 import subprocess
 6 import urllib.request
 7 from bs4 import BeautifulSoup
 8 import sys
 9 import threading
10 from concurrent.futures.process import ProcessPoolExecutor
11 sys.setrecursionlimit(1000000)
12 
class obj(object):
    """Crawler that follows mmjpg.com picture pages and saves each image.

    Every page is expected to contain a ``div`` with id ``content`` whose
    first link points at the next page and wraps the picture itself.

    Class attributes:
        base_dir: root folder; one sub-folder is created per album.
        timeout: seconds to wait for each HTTP request before giving up.
    """

    # Raw string: in a normal literal the "\t" of "\temp" is a TAB character.
    base_dir = r"H:\temp"
    timeout = 30

    def __init__(self):
        # Pages handled in the current album; set back to 1 when an album's
        # first image is seen.  Nothing in this class reads it.
        # NOTE(review): a single instance shared between threads shares this
        # counter as well - confirm nobody relies on its value.
        self.number = 1

    @staticmethod
    def _random_code(length=5):
        """Return *length* random characters, each a-z or 1-9."""
        return ''.join(
            str(random.choice([chr(random.randint(97, 122)),
                               random.randint(1, 9)]))
            for _ in range(length)
        )

    @staticmethod
    def _safe_name(text):
        """Make scraped text usable as a single Windows path component.

        Characters that are illegal in file names (or would introduce extra
        path levels) become ``_``; leading/trailing dots and spaces are
        dropped so ``..`` cannot escape the target folder.
        """
        cleaned = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in str(text))
        return cleaned.strip('. ') or '_'

    def _download_page(self, url):
        """Fetch one picture page, store its image and return the next URL.

        Raises whatever the HTTP or parsing layer raises when the page does
        not have the expected structure; the caller treats that as
        end-of-crawl.
        """
        response = requests.get(url, timeout=self.timeout)
        response.encoding = 'utf8'
        soup = BeautifulSoup(response.text, 'html.parser')
        tag = soup.find(name='div', id='content')
        link = tag.find(name='a')
        nexturl = link.attrs.get('href')
        image = link.find(name='img')
        imageurl = image.attrs.get('src')
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'Content-Type': 'image/jpeg',
            'Host': 'img.mmjpg.com',
            'Referer': nexturl,
            'If-None-Match': '59a96b74-%s' % self._random_code(),
        }
        userinfo = image.attrs.get('alt')
        # First word of the alt text names the album folder; the alt text
        # with spaces removed names the file.
        album = self._safe_name(userinfo.split(' ')[0])
        filename = self._safe_name(userinfo.replace(' ', ''))
        # The 6th '/'-separated piece of the image URL is "<n>.jpg".
        getnum = int(imageurl.split('/')[5].split('.')[0])
        rs = requests.get(imageurl, headers=headers, timeout=self.timeout)
        if getnum == 1:
            self.mkdir(album)
            self.number = 1
        if rs.status_code == 200:
            self.getimage(rs.content, album, filename)
        else:
            # Do not store an error page under a .jpg name.
            print('skip %s: HTTP %s' % (imageurl, rs.status_code))
        print(imageurl, album)
        self.number += 1
        return nexturl

    def startpage(self, url, end):
        """Crawl from *url*, following "next" links until album *end*.

        Args:
            url: address of the first picture page to download.
            end: album id at which to stop.  The crawl ends once the
                second-to-last path segment of the next link equals
                ``str(end)``.

        Any error (network failure, unexpected markup) is printed and ends
        the crawl.  Pages are walked in a loop rather than by recursion, so
        the length of a crawl is not bounded by the interpreter's recursion
        limit.
        """
        while True:
            try:
                nexturl = self._download_page(url)
                finished = nexturl.split('/')[-2] == str(end)
            except Exception as e:
                # Best-effort crawl: report the problem and stop this run.
                print(e)
                return
            if finished:
                return
            url = nexturl

    def getimage(self, url, name, num):
        """Write image bytes to ``<base_dir>/<name>/<num>.jpg``.

        Args:
            url: despite the name, the raw image content (bytes).
            name: album folder name below ``base_dir``.
            num: file name without the ``.jpg`` extension.

        The album folder is created when missing.  Failures are printed,
        not raised.
        """
        import os

        try:
            folder = os.path.join(self.base_dir, str(name))
            os.makedirs(folder, exist_ok=True)
            # "with" guarantees the handle is closed even if the write fails.
            with open(os.path.join(folder, '%s.jpg' % num), 'wb') as f:
                f.write(url)
        except Exception as e:
            print(e)

    def mkdir(self, dir):
        """Create the album folder ``<base_dir>/<dir>`` if it does not exist.

        Uses the file-system API instead of shelling out, so the scraped
        name is never interpreted by a command shell and the folder exists
        by the time this method returns.  Failures are printed, not raised.
        """
        import os

        try:
            os.makedirs(os.path.join(self.base_dir, str(dir)), exist_ok=True)
        except OSError as e:
            print(e)
74 
# One crawler instance; every worker thread calls its startpage method.
site = obj()


def main():
    """Start 41 crawler threads, each covering a window of 27 album ids.

    Windows run downwards from album 1097.  Each thread receives the start
    of the following window as its stop marker, except the window starting
    at album 17, which gets 0.
    """
    first_album = 1097
    stride = 27
    for batch in range(41):
        begin = first_album - batch * stride
        finish = 0 if begin == 17 else begin - stride
        page = 'http://www.mmjpg.com/mm/%s' % begin
        worker = threading.Thread(target=site.startpage, args=(page, finish))
        worker.start()


if __name__ == '__main__':
    main()
原文地址:https://www.cnblogs.com/zl-py/p/7491865.html