# shell爬虫 (shell web crawler)

#!/bin/bash
# curl_str: the curl command line (including proxy options) used for every
# request. Callers expand it UNQUOTED on purpose so that it word-splits into
# the command and its options; it must therefore not contain embedded quotes.
#
# NOTE(review): proxy credentials are hardcoded here and end up in `ps` output
# and in log messages that print curl_str; consider loading them from the
# environment or a protected file.
#
# Alternative HTTP proxy endpoint. It used to be assigned and then immediately
# overwritten by the SOCKS5 line below, so it never took effect; kept here as a
# disabled reference (embedded quotes removed, they would be passed literally):
# curl_str='curl -x http://http-pro.abuyun.com:9010 --proxy-basic --proxy-user H78H42TCN191075P:3D1EA6E4F458AB69'
curl_str='curl -L --socks5 socks-cla.abuyun.com:8030 --proxy-user S822RB9T27K96TPC:5E68523C79E62C41'
#######################################
# Build the xin.baidu.com search URL for a supplier name.
# Every byte of the name is percent-encoded as %hh (lowercase hex); newlines
# in the name are dropped before encoding.
# Arguments: $1 - supplier name (raw text, any encoding)
# Outputs:   the search URL on stdout, without a trailing newline
#######################################
encode_url(){
    local supplier_name_encode
    # od emits space-separated hex bytes over several lines; strip the
    # separators, then prefix every two hex digits with '%'.
    supplier_name_encode=$(printf '%s' "$1" \
        | tr -d '\n' \
        | od -An -v -tx1 \
        | tr -d ' \n' \
        | sed 's/\(..\)/%\1/g')
    printf '%s' "https://xin.baidu.com/s?q=${supplier_name_encode}&t=0"
}

#######################################
# Download the search-result page for a supplier name.
# Globals:   curl_str (read) - curl command plus options, word-split on use
# Arguments: $1 - supplier name (passed to encode_url)
#            $2 - path of the file that receives the response body
# Returns:   exit status of the curl command
#######################################
do_down_web_info_html(){
    local url
    url=$(encode_url "$1")
    local html_file="$2"
    # wget "${url}" -O "${html_file}"
    # Debug trace, kept disabled. If re-enabled it must go to stderr, because
    # get_supplier_pid's stdout is captured by its caller:
    # echo -e "\n===================================> ${curl_str} ${url} > ${html_file}\n" >&2
    # shellcheck disable=SC2086 # curl_str is deliberately split into command + options
    ${curl_str} "${url}" > "${html_file}"
}

#######################################
# Download the search page for a supplier and extract the first "pid" value.
# Arguments: $1 - record id; the page is saved as "<id>.html" in the cwd
#            $2 - supplier name to search for
# Outputs:   the pid on stdout (an empty line when nothing matches)
#######################################
get_supplier_pid(){
    local html_file="${1}.html"
    local supplier_name="$2"
    do_down_web_info_html "${supplier_name}" "${html_file}"
    ### Extract the pid: first line mentioning "compinfo", text between
    ### 'pid=' and the next double quote.
    local pid
    pid=$(grep compinfo -- "${html_file}" \
        | head -1 \
        | awk -F'pid=' '{print $2}' \
        | awk -F'"' '{print $1}')
    printf '%s\n' "${pid}"
}

#######################################
# Download the basic-info JSON (which carries the unified social credit code)
# for a company pid.
# Globals:   curl_str (read) - curl command plus options, word-split on use
# Arguments: $1 - record id; the response is saved as "<id>_unifiedCode"
#            $2 - company pid used in the API URL
# Outputs:   a trace of the request on stderr
# Returns:   exit status of the curl command
#######################################
do_down_supplier_unifiedcode(){
    local id="$1"
    local pid="$2"
    ### Fetch the unified social credit code
    local unifiedCode="${id}_unifiedCode"
    local api_basic_url="https://xin.baidu.com/detail/basicAjax?pid=${pid}"
    # The trace previously had its '>' outside the quotes, so the message was
    # redirected into a stray file instead of being shown.
    # NOTE(review): this prints curl_str, which may contain proxy credentials.
    printf '\n 获取统一社会信用代码 ===================================> %s %s > %s\n\n' \
        "${curl_str}" "${api_basic_url}" "${unifiedCode}" >&2
    # shellcheck disable=SC2086 # curl_str is deliberately split into command + options
    ${curl_str} "${api_basic_url}" > "${unifiedCode}"
}

# Hook invoked periodically by the main loop to rotate the proxy exit IP.
# The real request is disabled; the function only prints its own name.
do_switch_ip(){
    # ${curl_str} http://proxy.abuyun.com/switch-ip
    printf '%s\n' "do_switch_ip"
}

#######################################
# Process one supplier: resolve its pid from the search page, then download
# its basic-info JSON.
# Arguments: $1 - record id (used as file-name prefix by the helpers)
#            $2 - supplier name
#######################################
do_run(){
    local id="$1"
    local supplier_name="$2"
    # Must be local: bash scoping is dynamic, so a bare "pid=" here would
    # overwrite a caller's variable of the same name (main keeps the token
    # file descriptor in "pid").
    local pid
    pid=$(get_supplier_pid "${id}" "${supplier_name}")
    do_down_supplier_unifiedcode "${id}" "${pid}"
    # echo -n $(date "+%F %T")" | ${id} | ${supplier_name} | ${pid} |"
    ### Convert the JSON into a CSV-style line
    # jq -r '[(.data.entName|tostring),(.data.unifiedCode|tostring)]|join("|")' "${id}_unifiedCode"
}

# Base name for per-worker result files; in this file it is referenced only by
# a commented-out "tee" line inside main.
result_file="result_code"
# Example input line (supplier name, id): "上海东福网络科技有限公司广州分公司","120"

#######################################
# Create a token pool used to bound the number of parallel workers.
# A FIFO is opened read/write on the given file descriptor and pre-filled with
# one line ("token") per allowed worker. A worker takes a token with
# `read -u <fd>` and gives it back with `echo <token> >&<fd>`.
# Arguments: $1 - file descriptor number to attach the FIFO to (required;
#                 must be below the shell's open-file limit)
#            $2 - number of tokens / parallel workers (optional, default 20)
# Outputs:   "Concurrency: N" on stdout when $2 is given; errors on stderr
# Returns:   exits the shell with status 1 on invalid arguments or when the
#            FIFO cannot be created or opened
#######################################
token(){
    local pid=$1
    # A descriptor number is mandatory.
    if [ -z "${pid}" ]
    then
        echo "please input pid" >&2
        exit 1
    fi
    # The value is spliced into an eval below, so accept digits only.
    case "${pid}" in
        *[!0-9]*)
            echo "token: file descriptor must be a number: ${pid}" >&2
            exit 1
            ;;
    esac

    # Number of parallel workers; defaults to 20.
    local concurrency=20
    if [ -n "${2:-}" ]
    then
        case "$2" in
            *[!0-9]*)
                echo "token: concurrency must be a number: $2" >&2
                exit 1
                ;;
        esac
        concurrency=$2
        echo "Concurrency: $2"
    fi

    # Create the FIFO inside a private temporary directory instead of a fixed,
    # predictable /tmp path that another process (or a stale regular file)
    # could already occupy.
    local fifo_dir fifo_path
    fifo_dir=$(mktemp -d) || exit 1
    fifo_path="${fifo_dir}/token.fifo"
    if ! mkfifo "${fifo_path}"
    then
        rm -rf -- "${fifo_dir}"
        exit 1
    fi
    # Attach the FIFO to the descriptor for both reading (<) and writing (>).
    # The descriptor number lives in a variable, so eval is needed to make the
    # shell expand it before parsing the redirection.
    if ! eval "exec ${pid}<>\"\${fifo_path}\""
    then
        rm -rf -- "${fifo_dir}"
        exit 1
    fi
    # The open descriptor keeps the pipe alive; the file itself is no longer
    # needed once it has been opened.
    rm -rf -- "${fifo_dir}"

    # Fill the pool: each line written is one token.
    local i
    for ((i = 1; i <= concurrency; i++))
    do
        echo "${i}" >&"${pid}"
    done
}

#######################################
# Crawl a batch of suppliers in parallel, bounded by the token pool.
# Takes the last 400 of the first 10000 lines of the input file; each line is
# expected to look like "supplier name","id". Every 100th line triggers
# do_switch_ip.
# Arguments: $1 - input file (optional, default ../tmpa)
# Outputs:   progress lines and the elapsed time ("TIME:<seconds>") on stdout
# Returns:   1 when the input file is not readable
#######################################
main(){
    local input_file="${1:-../tmpa}"
    # Descriptor for the token FIFO. A small fixed number is used because the
    # process id ($$) is usually above the open-file limit, which makes the
    # redirection fail. The variable is deliberately not called "pid" so a
    # helper assigning to "pid" cannot clobber it through dynamic scoping.
    local token_fd=9
    local start_time
    start_time=$(date +%s)

    if [ ! -r "${input_file}" ]
    then
        echo "main: cannot read input file: ${input_file}" >&2
        return 1
    fi

    # Create and fill the token pool.
    token "${token_fd}"

    local num=0
    local line name id supplier_name
    # The loop is fed through process substitution rather than a pipeline so
    # that it runs in this shell: the background jobs are then children of
    # this shell and the `wait` below really waits for them.
    while read -r line
    do
        num=$((num + 1))
        if [ "${num}" -eq 100 ]
        then
            do_switch_ip
            num=0
            echo "==================================== reset num ===================================="
        fi

        # Parse "name","id": drop the quotes, then split on the LAST comma so
        # names containing spaces or commas stay intact.
        line=${line//\"/}
        if [[ "${line}" != *,* ]]
        then
            echo "main: skipping malformed line: ${line}" >&2
            continue
        fi
        id=${line##*,}
        id=${id//[[:space:]]/}
        supplier_name=${line%,*}

        # Take a token; blocks while all workers are busy.
        read -r -u "${token_fd}" name
        {
            echo "${id} | ${supplier_name}"
            do_run "${id}" "${supplier_name}"
            # | tee -a ${result_file}_${name}
            # Give the token back so the next job can start.
            echo "${name}" >&"${token_fd}"
        } </dev/null &
    done < <(head -10000 "${input_file}" | tail -400)

    wait
    # Record the end time and report the elapsed seconds.
    local stop_time
    stop_time=$(date +%s)
    echo "TIME:$((stop_time - start_time))"
    # Close the token descriptor (it was opened read/write, one close frees it).
    eval "exec ${token_fd}>&-"
}

# Script entry point: run the crawl.
main
# 原文地址 (original article): https://www.cnblogs.com/chenzechao/p/11512672.html