批量去重 URL 地址并剔除打不开的网址

#coding=utf-8

import os
import httplib
import socket

# Maps a resolved IP address to the first host recorded for that address.
dictlist = dict()

def ReadHost(path='d:/sss.txt'):
    """Read the hosts to check from a text file, one host per line.

    Args:
        path: File to read. Defaults to the original hard-coded location
            ``d:/sss.txt``.

    Returns:
        A list of host strings in file order. Leading/trailing whitespace
        (including line endings) is removed and blank lines are skipped.
    """
    hosts = []
    # Text mode plus a bare strip() copes with both LF and CRLF files; the
    # previous strip('\n') on a binary-mode line left a trailing carriage
    # return on CRLF input.
    with open(path, 'r') as obn:
        for line in obn:
            line = line.strip()
            if line:
                hosts.append(line)
    return hosts

def GetWebStatus(host, timeout=None):
    """Report whether ``host`` answers an HTTP ``GET /`` with status 200.

    Args:
        host: Host name, optionally followed by ``:port``, handed to
            ``httplib.HTTPConnection``.
        timeout: Optional socket timeout in seconds. When ``None`` (the
            default) the connection is created without an explicit timeout,
            exactly as before.

    Returns:
        1 if the response status is exactly 200, otherwise 0. Any
        ``httplib.HTTPException`` (for example an invalid port in ``host``)
        also yields 0.

    Raises:
        socket.error: Connection-level failures are not handled here and
            propagate to the caller.
    """
    conn = None
    try:
        if timeout is None:
            conn = httplib.HTTPConnection(host)
        else:
            conn = httplib.HTTPConnection(host, timeout=timeout)
        # The request target has to be a path; the former literal 'url'
        # produced a malformed request line.
        conn.request('GET', '/')
        status = conn.getresponse().status
    except httplib.HTTPException:
        return 0
    finally:
        # Release the socket on every path, including when an exception
        # propagates to the caller.
        if conn is not None:
            conn.close()
    return 1 if status == 200 else 0

def SysDNS():
    """Fill ``dictlist`` with one reachable host per resolved address.

    Every host returned by ``ReadHost()`` is first probed with
    ``GetWebStatus``; hosts reported as 0 are skipped. For the remaining
    hosts the first address returned by ``socket.getaddrinfo`` is used as
    the key: the host is stored in ``dictlist`` only if that address has not
    been recorded yet, so later hosts resolving to the same address are
    dropped.

    Resolver errors (``socket.herror`` / ``socket.gaierror``) skip the host
    silently; any other exception is printed and the host is skipped.
    """
    for host in ReadHost():
        try:
            if GetWebStatus(host) == 0:
                continue
            myaddrs = socket.getaddrinfo(host, None)
        except (socket.herror, socket.gaierror):
            continue
        except Exception as e2:
            print(e2)
            continue

        # Only the first address was ever inspected (the old loop broke out
        # on its first iteration in both branches), so index it directly.
        if myaddrs:
            dictlist.setdefault(myaddrs[0][4][0], host)



def showDict(path="d:/out.txt"):
    """Write every host stored in ``dictlist`` to a file, one per line.

    Args:
        path: Output file, overwritten if it exists. Defaults to the
            original hard-coded location ``d:/out.txt``.
    """
    # writelines() on a single host string emitted it without any line
    # separator, so all hosts ran together; write one newline-terminated
    # line per host instead.
    with open(path, "w") as fw:
        fw.writelines(host + "\n" for host in dictlist.values())

if __name__ == "__main__":
    # Build the address -> host table first, then write the hosts out.
    SysDNS()
    showDict()

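URL 可能会出现的错误(Windows socket 错误码):

[Errno 10060] 连接超时(WSAETIMEDOUT)
[Errno 10061] 目标主机拒绝连接(WSAECONNREFUSED)
[Errno 10054] 连接被远程主机重置(WSAECONNRESET)
[Errno 10053] 连接被本机软件中止(WSAECONNABORTED)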

原文地址:https://www.cnblogs.com/xiaobaichuangtianxia/p/3842562.html