# 爬虫2 (Crawler 2)

#coding=utf-8
import urllib
import re
import os

def getHtml(url):
    """Download *url* and return the response body exactly as read.

    Args:
        url: Address of the page to fetch.

    Returns:
        Whatever ``read()`` yields for the opened URL: a byte string
        (``str`` on Python 2, ``bytes`` on Python 3). No decoding is done here.

    Raises:
        IOError / OSError (including urllib's URLError on Python 3) when the
        URL cannot be opened.
    """
    # Function-scope imports keep this block self-contained: urlopen lives in
    # ``urllib`` on Python 2 and in ``urllib.request`` on Python 3.
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # closing() releases the connection even if read() raises; the original
    # never closed the response object.
    with closing(urlopen(url)) as page:
        return page.read()

def getImg(html, save_dir='D:img', base_url='http://www.cust.edu.cn'):
    """Download the ``.JPG`` images referenced in *html* and print its text spans.

    Two passes are made over the page source:

    1. Every ``src="..<path>.JPG"`` attribute is collected. The leading ``..``
       is dropped, the remainder is appended to *base_url*, and the image is
       saved as ``<save_dir>/<n>.jpg`` with ``n`` counting from 0. Both the
       captured path and the full URL are printed.
    2. The content of every ``<span style="font-family:宋体">`` element is
       printed, each followed by a dashed separator line.

    Args:
        html: Page source. A bytes object (what a Python 3 download yields) is
            decoded as UTF-8 first; on Python 2 the byte string is used as is.
        save_dir: Directory the images are written to. The default keeps the
            original location ``D:img`` -- on Windows that is a path relative
            to the current directory of drive D:, not ``D:\\img``.
        base_url: Site root prepended to each captured image path.
            NOTE(review): assumes the part after ``..`` is root-relative
            (starts with ``/``) -- confirm against the target pages.

    Returns:
        None. Results are reported via ``print`` and files on disk.
    """
    # Function-scope import keeps this block self-contained: urlretrieve lives
    # in ``urllib`` on Python 2 and in ``urllib.request`` on Python 3.
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # On Python 3 a str pattern cannot search bytes. The span pattern below is
    # a UTF-8 literal, so the page is already implicitly expected to be UTF-8.
    # (On Python 2 ``bytes is str`` and this branch never runs.)
    if isinstance(html, bytes) and not isinstance(html, str):
        html = html.decode('utf-8', 'replace')

    # Dots are escaped so they match literal '.' only, and [^"] keeps a match
    # from running past the closing quote into a later attribute.
    img_paths = re.findall(r'src="\.\.([^"]+?\.JPG)"', html)

    # Create the target directory only when there is something to save;
    # urlretrieve fails if it does not exist.
    if img_paths and save_dir and not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    for index, img_path in enumerate(img_paths):
        print(img_path)
        img_url = base_url + img_path
        print(img_url)
        urlretrieve(img_url, os.path.join(save_dir, '%s.jpg' % index))

    # Non-greedy group: with several spans on one line, a greedy ``.*`` would
    # capture everything up to the last ``</span>`` as a single item.
    paragraphs = re.findall(r'<span style="font-family:宋体">(.*?)</span>', html)
    for paragraph in paragraphs:
        print(paragraph)
        print('-----------------------------------')



# Script entry point: fetch one news page, save its images and print its text.
# The guard keeps the network request from firing when this file is imported.
if __name__ == "__main__":
    html = getHtml("http://www.cust.edu.cn/lgxw/32913.htm")

    # getImg() prints its findings itself and has no return value, so printing
    # its result only added a stray "None" line to the output.
    getImg(html)

# 原文地址 (original source): https://www.cnblogs.com/helloaworld/p/7090129.html