Fetching web pages concurrently in Python with select (not complete yet)

The script below reads product ids from a CSV file, opens one HTTP socket per URL, and multiplexes the responses with select.select(), writing each page to ./results/<fd>.html.

#!/usr/bin/env python
#encoding=utf-8
import select, socket, codecs, doctest, time, datetime, os
def read_urls():
    """Build product-page URLs from the ids in ./book/1.csv."""
    urls=[]
    prefix="http://book.360buy.com/%s.html"
    for idx,line in enumerate(codecs.open("./book/1.csv","r","utf-8").readlines()):
        if idx==0:continue              # skip the header row
        if idx>=100:break               # cap the batch at 99 urls
        if line.find(",")==-1:continue  # skip malformed lines
        wid,name=line.rstrip().split(",",1)
        urls.append(prefix%wid)
    return urls
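
# Note: the layout of ./book/1.csv is only inferred from the parsing above --
# a header row followed by lines of the form "<wid>,<name>".
# The exact contents of the file are not shown in the original post.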

def _parser(url):
    """
    >>> _parser("http://book.360buy.com/123.html")
    ('book.360buy.com','/123.html')
    """
    a,b=url[7:].split("/",1)
    return (a,"/"+b)

def fetch(url):
    """Connect to the host and send a plain HTTP/1.0 GET; return the open socket."""
    hostname,path=_parser(url)
    s=socket.socket(socket.AF_INET,socket.SOCK_STREAM)
    s.connect((hostname,80))
    request="GET %s HTTP/1.0\r\n"%path
    request+="Host: %s\r\n"%hostname
    request+="User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:17.0) Gecko/20100101 Firefox/17.0\r\n"
    request+="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
    request+="Accept-Language: zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3\r\n"
    request+="Cookie: BAIDUID=4782C3288E4A1689E0F8CBC0DF82BB1D:FG=1; BDUT=sc2x4782C3288E4A1689E0F8CBC0DF82BB1D13bda69e4000; H_PS_PSSID=1428_1667_1662\r\n"
    request+="Cache-Control: max-age=0\r\n"
    request+="\r\n"
    s.sendall(request)
    return s

def async_down(urls):
    """Multiplex all downloads with select(); each response goes to ./results/<fd>.html."""
    sockets=[]
    url_by_fd={}                      # renamed from "dict" to avoid shadowing the builtin
    files={}
    for url in urls:
        sock=fetch(url)               # renamed from "socket" to avoid shadowing the module
        url_by_fd[sock.fileno()]=url
        sockets.append(sock)

    start=datetime.datetime.now()
    end=datetime.datetime.now()
    # Poll until every socket has been fully read or the 12-second budget is spent.
    while sockets and (end-start).seconds<12:
        rlist,wlist,elist=select.select(sockets,[],[],1)   # 1s timeout so the time check above is honoured
        for s in rlist:
            data=s.recv(40960)
            if data:
                if s.fileno() not in files:
                    files[s.fileno()]=codecs.open("./results/%s.html"%s.fileno(),"w","utf-8")
                f=files[s.fileno()]
                f.write(data.decode("gbk","ignore"))
                f.flush()
            else:
                # recv() returned "": the server closed the connection, download done
                sockets.remove(s)
        end=datetime.datetime.now()

    # Sockets still open after the loop were cut off; drop their partial files.
    filenos=[sock.fileno() for sock in sockets]
    for fileno,f in files.iteritems():
        f.close()
        if fileno in filenos:
            os.remove("./results/%s.html"%fileno)

    print "sockets left unfinished: %s"%len(sockets)

if __name__=="__main__":
    start=datetime.datetime.now()
    #doctest.testmod()
    urls=read_urls()
    #print urls
    async_down(urls)
    end=datetime.datetime.now()
    print (end-start).seconds
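
The "not complete yet" in the title is fair: fetch() calls connect() on a blocking socket, so the connections are still set up one at a time before select() ever runs. A minimal sketch of how the connect phase could also be made non-blocking, assuming Python 2 and the same _parser() helper as above (fetch_nonblocking is a hypothetical name, not part of the original script):

import errno

def fetch_nonblocking(url):
    """Start a connection without waiting for it to finish."""
    hostname,path=_parser(url)
    s=socket.socket(socket.AF_INET,socket.SOCK_STREAM)
    s.setblocking(0)                 # note: the name resolution inside connect_ex still blocks
    err=s.connect_ex((hostname,80))  # typically returns immediately with EINPROGRESS
    if err not in (0,errno.EINPROGRESS,errno.EWOULDBLOCK):
        raise socket.error(err,os.strerror(err))
    request="GET %s HTTP/1.0\r\nHost: %s\r\n\r\n"%(path,hostname)
    return s,request

The caller would then select() on these sockets for writability first, sendall() each request once its socket reports ready, and only after that fall into the read loop that async_down() already has.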

Original article: https://www.cnblogs.com/lexus/p/2848479.html