Perl6 必应抓取(2):最终版

use HTTP::UserAgent;
use URI::Encode;

# File-level state shared by MAIN and Bing_search.

# HTTP client. HTTP::UserAgent reads the header value from the `useragent`
# named argument; it has to be passed as one quoted string, because the
# <...> form splits its contents on whitespace into a list of words.
my $ua = HTTP::UserAgent.new(useragent => 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0');
my $bing_url = 'http://cn.bing.com/search?q=';
my $choose = rx/'<cite>'(.*?)'</cite>'/; # what to extract: the text inside each <cite> element
# Output file named after the current timestamp, with every ':' replaced by '-'.
my $filename = now.DateTime.Str.subst(':', '-', :g) ~ '.txt';
my $fp = open $filename, :w;
my $hv = 1;   # 1-based counter of the entry being printed; must start defined so the first progress line is not blank
my $all_data; # number of entries requested, shown in the progress display

# Script entry point.
#
# $page_number - how many result pages to request; the script counts
#                10 entries per page.
#
# Prompts for the search text, requests each page through Bing_search
# (which prints and saves the extracted entries), then prints timing
# information and the name of the output file.
sub MAIN (Int $page_number) {
    my $rule = '=======================================================================';

    say '+';
    say $rule;
    say '                  By: FireC@t';
    say $rule;

    my $strings = prompt 'Input String You Want: ';
    say 'Search : '~$strings;
    $all_data = 10*$page_number;
    say 'Data count(10*'~$page_number~') : '~$all_data; # number of entries requested
    my $start_time = now.DateTime;
    say 'Start Time : '~$start_time;
    say $rule;

    $strings = uri_encode($strings);
    # One request per page; the `first` query parameter advances by 10 each time.
    for ^$page_number -> $page_index {
        my $targeturl = $bing_url~$strings~'&first='~($page_index*10);
        Bing_search($targeturl);
    }

    # Close the output file explicitly so everything written to it is
    # flushed before the script reports where the data was saved.
    $fp.close;

    my $end_time = now.DateTime;
    say $rule;
    say 'Finish Time : '~$end_time;
    say 'Time Use : '~($end_time-$start_time);
    say $rule;
    say 'Data save to : '~$filename;
    say $rule;
}

# Query function: downloads one result page and, for every <cite> element
# found in it, prints a progress line and appends the entry to the output file.
sub Bing_search($url) {
    my $page = $ua.get($url).content; # body of the result page
    for $page.match($choose, :g) -> $hit {
        # Text between <cite> and </cite>, with the <strong> markup removed.
        my $entry = $hit[0].Str.subst('<strong>', '', :g).subst('</strong>', '', :g);
        say '('~$hv~':'~$all_data~')'~$entry;
        $hv++;
        $fp.say($entry);
    }
}

说明, 在dos下输入中文, 因为终端编码问题, 程序会报错。

在linux下运行正常, 或者在dos下将终端编码设置为utf8。

用法:

> perl6 bing_s.p6 10

这里的参数 10为页数, 可随意更改。

BUG:

  如果bing中的结果只有 100 条, 而我们向它请求 1000 条, 这时我们会取到重复的数据。

修复:

  在运行前, 先将bing返回的结果总数与用户请求的数目进行对比。 如果用户请求的数目超出bing现有的结果数, 则用bing的最大值代替用户输入的值。

update: 2017/08/25

修复后代码:

use HTTP::UserAgent;
use URI::Encode;

=begin pod
Search helper for the China edition of Bing (cn.bing.com).
# by FireC@t
# 2017/08/25
=end pod

# File-level state shared by MAIN, Bing_search and User_data_chang.

# HTTP client. HTTP::UserAgent reads the header value from the `useragent`
# named argument; it has to be passed as one quoted string, because the
# <...> form splits its contents on whitespace into a list of words.
my $ua = HTTP::UserAgent.new(useragent => 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0');
my $bing_url = 'http://cn.bing.com/search?q=';
my $choose = rx/'<cite>'(.*?)'</cite>'/; # what to extract: the text inside each <cite> element
# Output file named after the current timestamp, with every ':' replaced by '-'.
my $filename = now.DateTime.Str.subst(':', '-', :g) ~ '.txt';
my $fp = open $filename, :w;
my $hv=1;     # 1-based counter of the entry being printed
my $all_data; # number of entries to fetch, shown in the progress display

# Script entry point.
#
# $page_number - how many result pages the user asks for; the script
#                counts 10 entries per page.
#
# Prompts for the search text, lets User_data_chang decide how many pages
# to fetch, requests each of those pages through Bing_search, then prints
# timing information, the number of saved lines and the output file name.
sub MAIN (Int $page_number) {
    my $rule = '=======================================================================';

    say '+';
    say $rule;
    say '                  By: FireC@t';
    say $rule;

    my $strings = prompt 'Input String You Want: ';
    say 'Search : '~$strings;
    say 'User Data  Get Page : '~$page_number; # number of pages requested
    my $start_time = now.DateTime;

    $strings = uri_encode($strings);

    # Start from what the user asked for; User_data_chang reads $all_data.
    $all_data = 10*$page_number;
    # Coerce to Int so the value is numeric when used as a loop bound,
    # however the helper produced it.
    my $page_number_swap = User_data_chang($strings).Int;
    if $page_number_swap < $page_number {
        say 'Not enough Data for bing('~$page_number_swap~' pages), page_number change: '~$page_number~' to '~$page_number_swap;
        # Shrink the total shown in the progress display to match.
        $all_data = 10*$page_number_swap;
    }

    say 'Start Time : '~$start_time;
    say $rule;

    # One request per page; the `first` query parameter advances by 10 each time.
    for ^$page_number_swap -> $page_index {
        my $targeturl = $bing_url~$strings~'&first='~($page_index*10);
        say '>> '~$targeturl~'';
        Bing_search($targeturl);
    }

    # Close the output file explicitly so everything written to it is
    # flushed before the script reports where the data was saved.
    $fp.close;

    my $end_time = now.DateTime;
    say $rule;
    say 'Finish Time : '~$end_time;
    say 'Time Use : '~($end_time-$start_time);
    say $rule;
    # $hv starts at 1 and is incremented once per saved entry.
    say 'Data('~($hv-1)~' lines) save to : '~$filename;
    say $rule;
}

# Query function: downloads one result page and, for every <cite> element
# found in it, prints a progress line and appends the entry to the output file.
sub Bing_search($url) {
    my $remaining = $ua.get($url).content; # part of the page not scanned yet
    while $remaining ~~ $choose {
        my $entry = ~$0;
        # Continue the search after the element that was just matched.
        $remaining = $/.postmatch;
        $entry = $entry.subst('<strong>', '', :g);
        $entry = $entry.subst('</strong>', '', :g);
        say '('~$hv~':'~$all_data~') '~$entry;
        $hv++; # count the saved entries
        $fp.say($entry);
    }
}

# Decides how many result pages to fetch for a query.
#
# $strings - the search text, already URI-encoded by the caller.
#
# Downloads the first result page, reads the total number of results from
# the "sb_count" element and returns the page count (via User_page) for
# whichever is smaller: that total, or the file-level $all_data holding
# the number of entries the user asked for. Returns 0 when no result
# count can be found on the page.
sub User_data_chang($strings){
        my $start_url = $bing_url~$strings;
        # Match against the page body, the same text Bing_search scans.
        my $first_page = $ua.get($start_url).content;
        $first_page ~~ /'sb_count">'(.*?)s.*?'</span>'/;
        # No result counter on the page: report zero pages.
        return 0 if not $0;

        my $count_text = ~$0; # e.g. 123,45
        # The capture may carry other text around the number, which would make
        # the integer conversion throw; keep only the last run of digits and
        # thousands separators.
        my @digit_groups = $count_text.match(/<[0..9]> <[0..9,]>*/, :g);
        return 0 if not @digit_groups;
        my $bing_all_data = (~@digit_groups[*-1]).subst(',', '', :g).Int; # total number of results

        # Never fetch more entries than Bing reports, nor more than requested.
        return User_page(min($all_data, $bing_all_data));
}


# Converts a number of entries into a number of result pages.
#
# $data_number - number of entries.
#
# Returns, always as an Int, the count of 10-entry pages needed to hold
# $data_number entries; a partially filled last page counts as a page.
sub User_page($data_number) {
            return ($data_number / 10).ceiling;
}
原文地址:https://www.cnblogs.com/perl6/p/7426189.html