pyspider—爬取下载图片

以第一ppt网站为例:http://www.1ppt.com/

from pyspider.libs.base_handler import *
import urllib2,HTMLParser,re

import urllib2,HTMLParser,re

#root URL of the site being crawled
host = "http://www.1ppt.com/"
#local directory where downloaded images are written
localSavePath = '/data/girls/'
#URL of the picture page most recently discovered on the index
startHtmlUrl = ''
#collected URLs of picture-set HTML pages
htmlUrlList = []
#collected URLs of individual images
imageUrlList = []
#NOTE(review): the dot is unescaped, so this matches any character before
#'jpg' (e.g. '12Xjpg'); presumably intended as r'[0-9]*\.jpg' — confirm
#against real image URLs before tightening.
patter = '[0-9]*.jpg';
#根据得到的图片路径URL将图片下载下来保存本地
def downloadImage(url):
    print url
    cont = urllib2.urlopen(url).read()
    match = re.search(patter,url);
    if match:
        print '正在下载文件:',match.group()
        filename = localSavePath+match.group()
        f = open(filename,'w+')
        f.write(cont)
        f.close()
    else:
        print 'no match'

#根据首页得到的图片集遍历每个图片集
def getImageUrlByHtmlUrl(htmlUrl):
    parser = MyHtmlParse(False)
    request = urllib2.Request(htmlUrl)
    try:
        response = urllib2.urlopen(request)
        content = response.read()
        parser.feed(content)
    except urllib2.URLError,e:
        print e.reason
        return

class MyHtmlParse(HTMLParser.HTMLParser):
    def __init__(self,isIndex):
        self.isIndex = isIndex;
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self,tag,attrs):
        #print tag
        #print attrs

        if(self.isIndex):
            if(tag == 'a'):
                if(len(attrs) == 3):
                    #print attrs[0]
                    if(attrs[1][0] =='title'):
                        newUrl = host + attrs[0][1]
                        #    print '找到一处图片的网页链接:',newUrl
                        global startHtml
                        startHtmlUrl = newUrl
                        getImageUrlByHtmlUrl(newUrl)
        else:
            #print tag
            if(tag == 'img'):
                #    print attrs
                #print attrs[0][0]
                #print attrs[1][0]
                if(attrs[0][0] == 'src' and attrs[1][0] == 'alt' and attrs[0][1] ):
                    imageUrl = attrs[0][1]
                    match = re.search(patter,imageUrl)
                    if match:
                        print '找到一张图片:',imageUrl
                        downloadImage(imageUrl)
                        imageUrlList.append(imageUrl)    
                        #if (tag == 'a'):       
                        #if (len(attrs) == 4):
                        ##if (attrs[1] == ('class','next')):
                        #nextUrl = host + attrs[2][1]
                        #print '找到一处图片的网页链接:',nextUrl
                        #global startHtmlUrl
                        #if (startHtmlUrl != nextUrl):
                        #getImageUrlByHtmlUrl(nextUrl)


#分析首页得到每个图片集的链接
def parse_url_picture(indexUrl):
    """Parse the site index page and visit every picture-set link on it."""
    html = urllib2.urlopen(indexUrl).read()
    # Index mode (True): the parser follows each picture-set link itself.
    indexParser = MyHtmlParse(True)
    indexParser.feed(html)

picture_website = r'http://www.1ppt.com/'
class Handler(BaseHandler):
    crawl_config = {
    }
    
    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl(picture_website, callback=self.index_page)
        return
    @config(age= 10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            print each.attr.href
            parse_url_picture(each.attr.href)
            self.crawl(each.attr.href, callback=self.detail_page)
        return
    
    @config(priority=2)
    def detail_page(self, response):
        return{
        }

下面脚本是直接运行(不用放到爬虫平台上):

#!/usr/bin/python
#coding: utf-8
#############################################################
# File Name: girls.py
# Author: mylonly
# mail: mylonly@gmail.com
# Created Time: Mon 09 Jun 2014 09:23:18 PM CST
#########################################################################

import urllib2,HTMLParser,re

#根url
host = "http://1ppt.com"
#本地保存地址
localSavePath = '/data/girls/'
#起始图片html地址
startHtmlUrl = ''
#图片页Html的地址
htmlUrlList = []
#图片Url地址
imageUrlList = []
patter = '[0-9]*.jpg';
#根据得到的图片路径URL将图片下载下来保存本地
def downloadImage(url):
    print url
    cont = urllib2.urlopen(url).read()
    match = re.search(patter,url);
    if match:
        print '正在下载文件:',match.group()
        filename = localSavePath+match.group()
        f = open(filename,'w+')
        f.write(cont)
        f.close()
    else:
        print 'no match'

#根据首页得到的图片集遍历每个图片集
def getImageUrlByHtmlUrl(htmlUrl):
    parser = MyHtmlParse(False)
    request = urllib2.Request(htmlUrl)
    try:
        response = urllib2.urlopen(request)
        content = response.read()
        parser.feed(content)
    except urllib2.URLError,e:
        print e.reason

class MyHtmlParse(HTMLParser.HTMLParser):
    def __init__(self,isIndex):
        self.isIndex = isIndex;
        HTMLParser.HTMLParser.__init__(self)
        
    def handle_starttag(self,tag,attrs):
        #print tag
        #print attrs
        
        if(self.isIndex):
            if(tag == 'a'):
                if(len(attrs) == 3):
                    #print attrs[0]
                    if(attrs[1][0] =='title'):
                        newUrl = host + attrs[0][1]
                    #    print '找到一处图片的网页链接:',newUrl
                        global startHtml
                        startHtmlUrl = newUrl
                        getImageUrlByHtmlUrl(newUrl)
        else:
            #print tag
            if(tag == 'img'):
            #    print attrs
                print attrs[0][0]
                print attrs[1][0]
                if(attrs[0][0] == 'src' and attrs[1][0] == 'alt' and attrs[0][1] ):
                    imageUrl = attrs[0][1]
                    match = re.search(patter,imageUrl)
                    if match:
                        print '找到一张图片:',imageUrl
                        downloadImage(imageUrl)
                        imageUrlList.append(imageUrl)    
            #if (tag == 'a'):
                #if (len(attrs) == 4):
                    ##if (attrs[1] == ('class','next')):
                    #nextUrl = host + attrs[2][1]
                    #print '找到一处图片的网页链接:',nextUrl
                    #global startHtmlUrl
                    #if (startHtmlUrl != nextUrl):
                        #getImageUrlByHtmlUrl(nextUrl)
#Entry point of the standalone script: parse the index page to discover
#every picture-set link (the parser recurses into each one and downloads
#the images it finds).  Runs at import time — no __main__ guard.
indexUrl = 'http://www.1ppt.com'
m = urllib2.urlopen(indexUrl).read()
#print m
#True = index mode: follow picture-set links rather than download images.
parserIndex = MyHtmlParse(True)
parserIndex.feed(m)
原文地址:https://www.cnblogs.com/panliu/p/4849212.html