# 根据请求网页状态返回码批量过滤无效网址 (Batch-filter invalid URLs by the HTTP status code of the response)

#coding=gbk
 
import io
import os
import socket
import sys

import httplib2
 

def GetWebStatus(host):
    """Request ``host`` and report whether the response status is 200.

    Args:
        host: URL to request, e.g. ``http://www.example.com``.

    Returns:
        1 when the response's ``status`` entry is ``'200'``; 0 for any other
        status. If the request raises any exception, the URL is printed to
        stdout and 0 is returned.
    """
    try:
        client = httplib2.Http(timeout=60)
        resp, _content = client.request(host)
    except Exception:
        # Deliberate best-effort: whatever went wrong while requesting the
        # URL, the caller only needs to know it is not usable. The URL is
        # echoed so failures remain visible on the console.
        # print(...) with a single argument behaves identically on
        # Python 2 and Python 3.
        print(host)
        return 0

    if resp.get('status') != '200':
        return 0
    return 1
 
 
# Text that introduces the quoted host inside a "UrlItem" element.
_URL_ATTR_PREFIX = 'UrlItem url="'

# latin-1 maps every byte to exactly one code point and back again, so
# non-ASCII bytes elsewhere on a line can never raise a decode error and are
# written back unchanged.
_BYTE_SAFE_ENCODING = 'latin-1'


def _extract_host(raw_line):
    """Return the host named on ``raw_line``, or None if it holds none."""
    text = raw_line.strip(' \t\r\n')
    start = text.find(_URL_ATTR_PREFIX)
    if start != -1:
        text = text[start + len(_URL_ATTR_PREFIX):]
        end = text.find('"')
        # Without a closing quote keep the remainder instead of silently
        # chopping off its last character.
        if end != -1:
            text = text[:end]
    # Lines without a dot are not treated as hosts.
    if '.' not in text:
        return None
    return text


def _build_url(host):
    """Return the URL to probe for ``host``, adding ``www.`` when absent."""
    if 'www.' in host:
        return 'http://' + host
    return 'http://www.' + host


def _append_line(path, text):
    """Append ``text`` followed by a newline to the file at ``path``."""
    with io.open(path, 'a', encoding=_BYTE_SAFE_ENCODING) as out:
        out.write(text + '\n')


def ReadHost(xmlpath, invalid_path="c:/wuxiao.txt", valid_path="c:/valid.txt",
             check_status=None):
    """Check every host listed in ``xmlpath`` and sort it into a result file.

    Each input line is either a bare host or contains ``UrlItem url="HOST"``.
    Lines whose extracted text contains no ``.`` are skipped. Each remaining
    host is turned into ``http://[www.]HOST`` and passed to the status
    checker; the host (not the URL) is then appended to ``invalid_path`` when
    the checker returns 0, otherwise to ``valid_path``.

    Args:
        xmlpath: Path of the file listing the hosts, one per line.
        invalid_path: File that collects hosts whose check returned 0.
        valid_path: File that collects all other hosts.
        check_status: Callable taking a URL and returning 0 for "invalid";
            defaults to ``GetWebStatus``.

    Returns:
        None. Results are appended to the two output files one host at a
        time, so hosts already checked stay recorded if a later step fails.
    """
    if check_status is None:
        check_status = GetWebStatus

    with io.open(xmlpath, 'r', encoding=_BYTE_SAFE_ENCODING) as source:
        for raw_line in source:
            host = _extract_host(raw_line)
            if host is None:
                continue
            if check_status(_build_url(host)) == 0:
                _append_line(invalid_path, host)
            else:
                _append_line(valid_path, host)

 
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the path of the
    # host list handed to ReadHost.
    if len(sys.argv) < 2:
        # print(...) with a single argument behaves identically on
        # Python 2 and Python 3.
        print('Error!pls enter the test file!If any question,pls contact coder! version:0.3')
        # Report the usage error through the exit status as well.
        sys.exit(1)
    ReadHost(sys.argv[1])

# 根据请求网页状态返回码 批量过滤无效网址

# 根据请求网页状态返回码批量过滤无效网址