batch_get_real_image_urls.py 博源

#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.09
v0.1
Fetch the real URL of each image; the site applies a Referer check and a 302 redirect first.
"""
import socket as original_socket
# Give every socket created from here on a 10-second default timeout.
original_socket.setdefaulttimeout(10)
import sys
# Python 2 only: reloading sys makes sys.setdefaultencoding reachable again.
reload(sys)
# sys.stdout.encoding is None when output is piped or redirected, and
# setdefaultencoding(None) raises; fall back to utf-8 in that case.
sys.setdefaultencoding(sys.stdout.encoding or "utf-8")
from functools import wraps
from pyquery import PyQuery as pq
import os
import time
import glob
import eventlet
from eventlet import pools
from eventlet.timeout import with_timeout
from eventlet.green import urllib2
# All working paths are relative to the current working directory.
_cwd_prefix = "./"

# Input: folder holding the source XML documents, and the glob matching them.
g_xmls_folder = os.path.join(_cwd_prefix, "xmls/")
g_filter = os.path.join(g_xmls_folder, "*.xml")

# Output: folder receiving the rewritten XML documents.
g_data_folder = os.path.join(_cwd_prefix, "images")

# Bookkeeping: folder and log files recording failed / finished inputs.
g_infos_folder = os.path.join(_cwd_prefix, "images_infos")
g_error_file = os.path.join(_cwd_prefix, "images_infos/error.txt")
g_success_file = os.path.join(_cwd_prefix, "images_infos/success.txt")
# Two raw request-header blocks in "Name: value" form, one header per line
# (presumably captured from browser sessions -- TODO confirm).  Only the
# second block is used to build ``headers``; the first one was overwritten
# before use in the original code and is kept here purely for reference.
_RAW_HEADERS_WINDOWS_FIREFOX = """
Host: www.215588.com
User-Agent: Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.10) Gecko/20100914 Firefox/3.6.10
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: en-us,en;q=0.5
Accept-Encoding: gzip,deflate
Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
Keep-Alive: 115
Connection: keep-alive
Referer: http://www.215588.com/gouwu/showproduct.asp?id=592
Cookie: RecentlyGoods=508%2C184%2C592%2C; ASPSESSIONIDACBTTQQD=KHGEPHICJFEOEEPIGAJJNKHI; AJSTAT_ok_times=2; ASPSESSIONIDACCQSQQC=CBKBIEFDAJDHKEMIDIKFMPNM; AJSTAT_ok_pages=1
"""
_RAW_HEADERS_LINUX_NAMOROKA = """
Host: www.215588.com
User-Agent: Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.2.12pre) Gecko/20101005 Ubuntu/10.04 (lucid) Namoroka/3.6.12pre
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: zh-cn,zh;q=0.5
Accept-Encoding: gzip,deflate
Accept-Charset: GB2312,utf-8;q=0.7,*;q=0.7
Keep-Alive: 115
Connection: keep-alive
Referer: http://www.215588.com/gouwu/showroom.asp
Cookie: ftwww215588com=0; AJSTAT_ok_times=8; RecentlyGoods=592%2C; ASPSESSIONIDACCQSQQC=BBJGIEFDJEBFDGJLGEOPFIDF; AJSTAT_ok_pages=3
"""


def _parse_raw_headers(raw):
    """Turn a block of ``Name: value`` lines into a dict.

    Each line is split on its first colon only, so values that contain
    colons (URLs, ``rv:...`` tokens) stay intact.  Lines without a colon,
    including blank lines, are skipped instead of raising.
    """
    parsed = {}
    for line in raw.strip().splitlines():
        name, colon, value = line.partition(":")
        if not colon:
            continue
        parsed[name.strip()] = value.strip()
    return parsed


# Header dict passed to every urllib2.Request built in this module.
headers = _parse_raw_headers(_RAW_HEADERS_LINUX_NAMOROKA)
def init():
    """Create the bookkeeping folder and the output folder when absent."""
    for folder in (g_infos_folder, g_data_folder):
        if os.path.exists(folder):
            continue
        os.makedirs(folder)
def clear():
    "清理生成的数据"
    # NOTE: the docstring above is printed verbatim as help text by the
    # __main__ command listing, so it is left untouched.  It reads
    # "clean up the generated data".
    # Remove the output folder and the bookkeeping folder with their contents.
    for generated in (g_data_folder, g_infos_folder):
        delete(generated)
    # Runtime message, roughly "restored to the initial state".
    print("还原为初始")
def error(url, ex=None):
    """Append one failure record to the error log (``g_error_file``).

    url -- the URL or file path that failed.
    ex  -- optional exception (or any detail) describing the failure; when
           given it is written after the url, separated by a tab.  This
           parameter exists because get_real_image_url calls
           ``error(url, ex)``.
    """
    if ex is None:
        record = "%s\n" % (url,)
    else:
        record = "%s\t%s\n" % (url, ex)
    # ``with`` closes the log even if the write itself fails.
    with open(g_error_file, "a") as log:
        log.write(record)
def success(url):
    """Append ``url`` as one line to the success log (``g_success_file``).

    The log is read back by the counting and resume logic in this module,
    one entry per line.
    """
    # The one-element tuple keeps the formatting correct even if ``url`` is
    # itself a tuple; ``with`` closes the log even if the write fails.
    with open(g_success_file, "a") as log:
        log.write("%s\n" % (url,))
def delete(src):
    """Best-effort recursive removal of a file, symlink or directory tree.

    Failures to remove an individual entry (permissions, entry vanished,
    directory not empty) are ignored so the rest of the tree is still
    processed.  A path that does not exist is a no-op.
    """
    # Handle symlinks first: os.path.isdir follows links, so recursing into a
    # link to a directory would empty the link's target.  Remove only the link.
    if os.path.islink(src) or os.path.isfile(src):
        try:
            os.remove(src)
        except OSError:
            pass
    elif os.path.isdir(src):
        for item in os.listdir(src):
            delete(os.path.join(src, item))
        try:
            # Fails (and is ignored) when some child could not be removed.
            os.rmdir(src)
        except OSError:
            pass
def statistics(f):
    """Decorator that prints progress counts before and after calling ``f``.

    The counts (total / successed / left) are printed by the module-level
    ``count()`` function; the original nested helper was a line-for-line copy
    of it.  The wrapped function's return value is passed through unchanged.
    """
    @wraps(f)
    def wrapper(*args, **kwargs):
        count()
        # Pause 3 seconds before starting the work (presumably so the
        # operator can read the counts -- TODO confirm).
        time.sleep(3)
        back = f(*args, **kwargs)
        count()
        return back
    return wrapper
def cost_time(f):
    """Decorator that prints start time, end time and elapsed seconds of ``f``.

    The wrapped function's return value is passed through unchanged.
    """
    @wraps(f)
    def wrapper(*args, **kwargs):
        label = f.__name__
        started = time.time()
        print("@%s, {%s} start" % (time.strftime("%X", time.localtime()), label))
        result = f(*args, **kwargs)
        print("@%s, {%s} end" % (time.strftime("%X", time.localtime()), label))
        print("@%.3fs taken for {%s}" % (time.time() - started, label))
        return result
    return wrapper
def get_real_image_url(file):
    """Replace the ``bigImage`` URL in one XML file with its final URL.

    Reads ``file``, requests the URL stored in the ``bigImage`` field using
    the module-level ``headers``, and takes the URL of the response that
    urllib2 ends up with.  On success the updated document is written under
    ``g_data_folder`` with the same base name and ``file`` is recorded via
    success(); otherwise ``file`` is recorded via error().

    file -- path of the source XML document.
    Returns None.
    """
    with open(file, "r") as source:
        s = pq(source.read())
    selector = "field[@name='%s']" % "bigImage"
    url = s(selector).text()

    def do(url):
        # Return the URL of the final response, or None on any request error.
        try:
            req = urllib2.Request(url=url, headers=headers)
            img = urllib2.urlopen(req)
            try:
                return img.url
            finally:
                # Only the URL is needed; release the connection right away.
                img.close()
        except Exception as ex:
            # error() is declared with a single parameter, so the url and the
            # reason are folded into one record.
            error("%s\t%s" % (url, ex))
            return None

    # with_timeout gives back timeout_value (None) if do() takes over 10 s.
    rr = with_timeout(10, do, url, timeout_value=None)
    if rr is None:
        error(file)
        print("error %s" % url)
        return
    s(selector).text(rr)
    r = '<?xml version="1.0" encoding="utf-8"?>' + s.wrap("<add></add>").html()
    new_file = os.path.join(g_data_folder, os.path.basename(file))
    with open(new_file, "w") as target:
        target.write(r)
    success(file)
    print("success %s" % url)
@cost_time
@statistics
def batch_get_real_image_urls(files=None):
    """获取图片真实链接"""
    # NOTE: the docstring above is printed verbatim as help text by the
    # __main__ command listing, so it is left untouched.  It reads
    # "fetch the real image links".
    #
    # files -- iterable of XML file paths to process; None (the default)
    #          means every file currently matching g_filter.  The glob is
    #          done at call time, not once when the function is defined.
    #          An explicitly passed empty collection is processed as empty.
    if files is None:
        files = glob.glob(g_filter)
    # Process the files concurrently, at most 20 green threads at a time.
    pool = eventlet.GreenPool(20)
    for file in files:
        pool.spawn_n(get_real_image_url, file)
    pool.waitall()
@cost_time
@statistics
def process_continue():
    "接着success抓取剩下的部分"
    # NOTE: the docstring above is printed verbatim as help text by the
    # __main__ command listing, so it is left untouched.  It reads
    # "continue from the success log and fetch the remaining part".
    #
    # Work list = every XML file matching g_filter minus the paths already
    # recorded in the success log.
    done, every = set(), set()
    if os.path.exists(g_success_file):
        with open(g_success_file, "r") as log:
            done = set(log.read().strip().split("\n"))
    if os.path.exists(g_xmls_folder):
        every = set(glob.glob(g_filter))
    batch_get_real_image_urls(every - done)
def count():
    """Print and return progress counts.

    total     -- number of files matching g_filter (0 if the xml folder is
                 missing).
    successed -- number of distinct non-empty lines in the success log (0 if
                 the log is missing or empty).

    Returns the tuple (total, successed).
    """
    total, successed = 0, 0
    if os.path.exists(g_xmls_folder):
        total = len(set(glob.glob(g_filter)))
        print("total lines:%s" % total)
    if os.path.exists(g_success_file):
        with open(g_success_file, "r") as log:
            finished = set(log.read().strip().split("\n"))
        # An empty log splits into [''], which must not count as a success.
        finished.discard("")
        successed = len(finished)
        print("successed lines:%s" % successed)
    print("left lines:%s" % (total - successed))
    return total, successed
    
def process_forever():
    "循环处理,直到全部完成"
    # NOTE: the docstring above is printed verbatim as help text by the
    # __main__ command listing, so it is left untouched.  It reads
    # "process in a loop until everything is finished".
    #
    # Keep resuming until count() reports nothing left.  A file that never
    # succeeds keeps the remaining count above zero, so this loop then
    # does not terminate on its own.
    while True:
        total, successed = count()
        if total - successed <= 0:
            break
        process_continue()
        
if __name__ == "__main__":
    # Command-line entry point.  Exactly one argument naming a known command
    # runs that command; anything else (no argument, several arguments, or an
    # unknown name) prints each command with its docstring as help.
    init()
    commands = {
            "batch"     :batch_get_real_image_urls,
            "continue"  :process_continue,
            "clear"     :clear,
            "loop"      :process_forever,
           }
    cli_args = sys.argv[1:]
    command = cli_args[0].strip() if len(cli_args) == 1 else None
    if command in commands:
        commands[command]()
    else:
        for name, func in commands.items():
            print("%s %s" % (name, func.__doc__))
    print("done!")
# Original article: https://www.cnblogs.com/lexus/p/1846562.html