Python 爬图 Hello World

最近发现 吾志 上用户的头像都很个性,另外,对于没有把日记设为私密的用户,最后一天的日记是公开的,谁都可以查看。

所以,如果每天把所有可查看的日记爬一遍,那么~~ 哈哈

以前对爬虫只是了解一点点,没有真的玩过。既然今晚兴致来了,那就随便学一下咯~

参考 http://blog.csdn.net/pleasecallmewhy/article/details/8925978

参考 http://cuiqingcai.com/1052.html

 1 #coding=utf-8
 2 import os
 3 import urllib
 4 import urllib2
 5 import re
 6 import cookielib
 7 
 8 
 9 
def mkdir(path):
    """Create directory ``path`` (including parents) if it does not exist.

    Surrounding whitespace and trailing backslashes are removed from
    ``path`` before it is checked and created.

    Args:
        path: Directory path as a string.

    Returns:
        The cleaned-up path that was checked/created.

    Raises:
        OSError: If the directory is missing and cannot be created.
    """
    # Drop whitespace on both sides.
    path = path.strip()
    # Drop trailing backslashes. The backslash has to be escaped; an
    # unescaped one swallows the closing quote and is a syntax error.
    path = path.rstrip("\\")

    if not os.path.exists(path):
        try:
            os.makedirs(path)
        except OSError:
            # Something else may have created the path between the check
            # and the call; only re-raise if it is still missing.
            if not os.path.exists(path):
                raise

    return path
20 
21 
def save_file(path, file_name, data):
    """Write ``data`` to the file ``file_name`` inside directory ``path``.

    The directory is created first via ``mkdir``. Nothing is written when
    ``data`` is None (for example after a failed download).

    Args:
        path: Target directory.
        file_name: Name of the file to create or overwrite in ``path``.
        data: Raw content to write in binary mode, or None to skip.
    """
    if data is None:
        return

    # Use the cleaned path that mkdir() returns, so the file is opened in
    # the directory that was actually created.
    path = mkdir(path)
    # The context manager flushes and closes the file even if write() raises.
    with open(os.path.join(path, file_name), "wb") as f:
        f.write(data)
33 
34 
35 
36 user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'
37 headers = {'User-Agent' : user_agent}
38 values = {}
39 data = urllib.urlencode(values)
40 
def getHtml(url):
    """Fetch ``url`` and return the raw response body.

    The request is sent with the module-level ``headers`` and ``data`` and
    a 10 second timeout. Errors raised by ``urllib2.urlopen`` or by reading
    the response propagate to the caller.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as read from the connection.
    """
    # urllib2 sends a POST whenever data is not None, even for ''. With no
    # form fields, pass None so the page is fetched with a plain GET.
    req = urllib2.Request(url, data or None, headers)
    page = urllib2.urlopen(req, timeout=10)
    try:
        return page.read()
    finally:
        # Release the connection even when read() fails.
        page.close()
48 
def get_file(url):
    """Download ``url`` and return its raw content, or None on failure.

    This is a best-effort helper: any ordinary error is reported on stdout
    and turned into a None result so the caller can skip the file.

    Args:
        url: Address of the resource to download.

    Returns:
        The response body, or None if the download failed.
    """
    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
        # Kept from the original: this also makes the opener the default
        # one used by later urllib2.urlopen() calls.
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        # Same 10 second timeout as getHtml, so a stalled server cannot
        # hang the crawl forever.
        operate = opener.open(req, timeout=10)
        try:
            return operate.read()
        finally:
            operate.close()
    except Exception as e:
        # Exception rather than BaseException, so Ctrl-C and SystemExit
        # still stop the program.
        print('failed to download %s: %s' % (url, e))
        return None
62 
63 
def getImg(html):
    """Download every ``.jpg`` image referenced in ``html``.

    An image is recognised by a ``src="....jpg"`` attribute immediately
    followed by an ``alt=`` attribute. Each one is fetched with
    ``get_file`` and stored in the current directory as ``0.jpg``,
    ``1.jpg``, ... in order of appearance.

    Args:
        html: Page source to scan.

    Returns:
        The number of image URLs found. Failed downloads are included in
        the count, although no file is written for them.
    """
    # [^"] keeps the match inside a single attribute value, and \. requires
    # a literal dot before the extension. The previous pattern could run
    # across quotes into a later tag, and accepted any character in place
    # of the dot.
    img_re = re.compile(r'src="([^"]+?\.jpg)" alt=')
    imglist = img_re.findall(html)

    for index, imgurl in enumerate(imglist):
        save_file('.', '%s.jpg' % index, get_file(imgurl))

    return len(imglist)
77 
78 
79 
if __name__ == "__main__":
    # Crawl only when run as a script, so importing this module does not
    # touch the network.
    html = getHtml("https://wuzhi.me/last")

    # Report how many image URLs were found. A parenthesised single
    # argument prints the same on Python 2 and Python 3.
    print(getImg(html))

十分简陋,哈哈~

+V d2h5X251bGw= 请备注:from博客园
原文地址:https://www.cnblogs.com/hangj/p/4679087.html