wget多进程抓取的实现

把以前博客的东西也迁移过来吧,这个是以前公司做的,原来放在csdn里面,先切过来。

这是一个用多进程实现的 wget 抓取工具,有问题联系我(微博:偶是周亮)

#!/bin/bash
# Multi-process wget crawler.  The values below are defaults; each one
# can be overridden by the matching command-line flag.
url_path="-"            # -u  file listing the URLs to fetch, one per line
WGET_TIMECOUNT=2        # -t  wget retry count
WGET_TIME=10            # -T  wget timeout in seconds
FORK_SLEEP_TIME=1       # -s  pause between forking worker processes
ONEURL_SLEEP_TIME=1     # -S  pause between two fetches inside one worker
SPIDER_PID_NUM=6        # -p  number of parallel worker processes
# Print the invocation synopsis on stdout and abort with status 3.
usage() {
        printf '%s\n' "usage:spider.sh -u url_path -d page_store_dir"
        exit 3
}
# Print the tool's version string on stdout and abort with status 4.
version() {
        printf '%s\n' "same-source-tools-spider-1.0.0"
        exit 4
}
# Parse command-line flags into the crawler's global settings.
# Fixes two bugs in the original loop: 'h' and 'v' were absent from the
# optstring (making those case arms unreachable), and the invalid-option
# fallback was spelled '/?' instead of '\?', so it never matched.
# Arguments: the script's "$@"
# Globals (written): url_path spider_dir WGET_TIMECOUNT WGET_TIME
#                    FORK_SLEEP_TIME ONEURL_SLEEP_TIME SPIDER_PID_NUM LOG_PATH
parse_args() {
        local OPTION
        while getopts l:u:d:t:T:s:S:p:hv OPTION; do
                case "${OPTION}" in
                        u) url_path=${OPTARG} ;;
                        d) spider_dir=${OPTARG} ;;
                        t) WGET_TIMECOUNT=${OPTARG} ;;
                        T) WGET_TIME=${OPTARG} ;;
                        s) FORK_SLEEP_TIME=${OPTARG} ;;
                        S) ONEURL_SLEEP_TIME=${OPTARG} ;;
                        p) SPIDER_PID_NUM=${OPTARG} ;;
                        l) LOG_PATH=${OPTARG} ;;
                        h) usage ;;
                        v) version ;;
                        \?) usage ;;
                esac
        done
}
parse_args "$@"
# Create the log file only when -l was actually given; the original
# unconditional 'touch ${LOG_PATH}' printed a "missing file operand"
# error whenever LOG_PATH was unset.
if [ -n "${LOG_PATH}" ]; then
        touch "${LOG_PATH}"
fi
# Check that the URL list file exists.  Expansions are quoted: the
# original unquoted '[ -e ${url_path} ]' was vacuously true when the
# variable was empty.
if [ -e "${url_path}" ]; then
        echo "spider test: ${url_path} is exist"
else
        echo "url_path spider test: ${url_path} is not exist"
        exit 1
fi
# Check that the page-store directory exists.
if [ -e "${spider_dir}" ]; then
        echo "spider test: ${spider_dir} is exist"
else
        echo "spider_dir spider test: ${spider_dir} is not  exist"
        exit 2
fi
# Remove queue files left behind by a previous run.
url_first_path="${spider_dir}/url_0"
if [ -e "${url_first_path}" ]; then
        rm -- "${spider_dir}"/url_*
fi
# Create one work-queue file per worker and deal the input URLs out to
# them round-robin, one URL per line.  Replaces the original
# 'cat | while read' pipeline (useless cat, subshell) with a redirect,
# uses 'read -r' so backslashes survive, and quotes every expansion.
# Globals (read): SPIDER_PID_NUM spider_dir url_path
# Side effects: creates/appends ${spider_dir}/url_0 .. url_$((SPIDER_PID_NUM-1))
create_url_queues() {
        local i no line
        # One (initially empty) queue file per worker.
        for ((i = 0; i < SPIDER_PID_NUM; i++)); do
                touch "${spider_dir}/url_${i}"
        done
        # Round-robin distribution; '|| [ -n "$line" ]' also accepts an
        # input file without a trailing newline.
        no=0
        while IFS= read -r line || [ -n "${line}" ]; do
                printf '%s\n' "${line}" >> "${spider_dir}/url_${no}"
                no=$(( (no + 1) % SPIDER_PID_NUM ))
        done < "${url_path}"
}
create_url_queues
# Fan out SPIDER_PID_NUM background workers; worker i consumes
# ${spider_dir}/url_${i} and stores each page as ${spider_dir}/<md5-of-url>.
# Bug fixes versus the original:
#  - L81 used '/' as a line continuation (it is not one: the pipeline
#    piped into the directory '/' and the while loop then read the
#    script's own stdin); the loop now reads the queue via a redirect.
#  - '$?' was tested AFTER the 'date' command substitution, so wget's
#    exit status was always overwritten and failures were logged as
#    success; the status is now captured immediately.
#  - 'continue' inside a backgrounded '{ }' has no enclosing loop;
#    replaced by a plain existence guard.
for ((i = 0; i < SPIDER_PID_NUM; i++)); do
        sleep "${FORK_SLEEP_TIME}"
        {
                queue="${spider_dir}/url_${i}"
                if [ -e "${queue}" ]; then
                        while IFS= read -r url || [ -n "${url}" ]; do
                                sleep "${ONEURL_SLEEP_TIME}"
                                # Page filename = md5 of the URL (plus newline,
                                # matching the original 'echo | md5sum').
                                url_md5=$(printf '%s\n' "${url}" | md5sum | awk '{print $1}')
                                wget "${url}" -o "${LOG_PATH}_${url_md5}" -O "${spider_dir}/${url_md5}" -t "${WGET_TIMECOUNT}" -T "${WGET_TIME}"
                                rc=$?   # capture wget's status before it is clobbered
                                dateFlag=$(date +"%Y%m%d-%H:%M:%S")
                                if [ "${rc}" -eq 0 ]; then
                                        echo "${dateFlag} NOTICE:spiderwgetsuccess ${url}"
                                else
                                        echo "${dateFlag} ERROR:spiderwgeterror ${url}"
                                        # Drop the (empty/partial) output file.
                                        rm -f -- "${spider_dir}/${url_md5}"
                                fi
                        done < "${queue}"
                fi
        } &
done
wait
原文地址:https://www.cnblogs.com/wully/p/3341302.html