A Perl Keyword Search Bot

I found this code online. It looks quite good, so I'm going to work through it here.

Reading other people's code is another way to learn. Studying real code as you go deepens your own understanding.

This is especially true for modules you have never used yourself: a working example shows you exactly how they are meant to be called.

Of course, you can also study the official documentation, but for me the fastest route is reading code examples that other people have written.

Perhaps everyone is a little different in that respect. The script below crawls a site starting from a given URL, respects its robots.txt, and writes each page's URL, title, and meta keywords to indexed.txt, one tab-separated line per page.

#!/usr/bin/perl
 # siteindexingbot.pl
 use warnings;
 use strict;

 use LWP::Simple;
 use LWP::RobotUA;
 use WWW::RobotRules;
 use HTML::Parse;
 use HTML::HeadParser;
 use URI::URL;

 my %scanned;   # URLs that have already been visited

 # the arrays and hashes used to store page data
 my (@pages, %titles, %keywords);

 my $url = $ARGV[0] or die "Usage: siteindexingbot [url]\n";
 my $base_url = &globalize_url('/', $url);    # e.g. http://example.com/
 my $robots_txt = $base_url . 'robots.txt';   # $base_url already ends in '/'

 my $robot_rules = WWW::RobotRules->new(
    "indexifier/1.0 (libwww-perl-$LWP::VERSION)"
 );
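 # scan() consults these rules later via $robot_rules->allowed($url)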

 # look for and parse the robots.txt file
 if (head($robots_txt)) {
    print "robots.txt file found OK.\n";
    $robot_rules->parse($robots_txt, get($robots_txt));
 } else {
    print "robots.txt file not found.\n";
 }

 # build the robot user agent: LWP::RobotUA takes the agent name, a contact
 # address, and an optional WWW::RobotRules object
 my $ua = LWP::RobotUA->new(
    "indexifier/1.0 (libwww-perl-$LWP::VERSION)",
    'me@here.com',
    $robot_rules
 );

 #$ua->proxy('http' => 'http://proxy.mylan.com/' );
 $ua->timeout(30);
 $ua->max_size(1024 * 100);   # read at most 100 KB of any one page
 $ua->parse_head(1);          # expose <meta> tags as X-Meta-* response headers
 $ua->delay(0.1);             # RobotUA's polite pause between requests, in minutes

 &scan($base_url);

 # dump the index: one tab-separated line of URL, title, keywords per page
 open(my $out, '>', 'indexed.txt') or die "Opening indexed.txt: $!";
 foreach my $page (@pages) {
    print $out join("\t", $page, $titles{$page}, $keywords{$page}), "\n";
 }
 close($out);

 exit;

 sub scan {
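    # recursively index $url, then follow every allowed, local, HTML link on it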
    my $url = shift;
    print "Scanning '$url':\n";
    if ($scanned{$url}) {
       return;
    } else {
       &get_info($url);   # this is the extra subroutine
       $scanned{$url} = 'TRUE';
       my @links = &get_links($url);
       foreach my $link (@links) {
          if ($robot_rules->allowed($link)) {
             if ($link =~ /^\Q$base_url\E/i) {   # \Q quotes regex metacharacters in the URL
                my $request = HTTP::Request->new ('HEAD' => $link);
                my $response = $ua->request($request);
                my $content_type = $response->header('Content-Type') || '';
                if ($response->is_error) {
                   print "Dead link to $link found on $url\n";
                } else {
                   print "$url links to $link\n";
                   if ($content_type =~ m{^text/html}i) {   # header may carry a charset suffix
                      &scan($link);
                   } else {
                      print "$link is not HTML\n";
                   }
                }
             } else {
                print "$link is not local to $base_url\n";
             }
          } else {
             print "Access to $link is not allowed by robots.txt\n";
          }
       }
    }
 return;
 }

 sub globalize_url {
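    # resolve a possibly relative $link against the page it appeared on and
    # return it as an absolute URL string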
    my ($link, $referring_url) = @_;
    my $url_obj = URI::URL->new($link, $referring_url);
    my $absolute_url = $url_obj->abs->as_string;
    $absolute_url =~ s/#.*$//;   # strip any #fragment so the same page isn't scanned twice
    return $absolute_url;
 }

 sub get_links {
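    # fetch $url and return absolute URLs for every <a>, <frame> and <iframe> link in it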
    my $url = shift;
    my $request = HTTP::Request->new ('GET' => $url);
    $request->header('Accept' => 'text/html');
    my $response = $ua->request($request);
    # HTML::Parse is the old functional front end to HTML::TreeBuilder
    my $tree = HTML::Parse::parse_html($response->content);
    my $links_ref = $tree->extract_links('a', 'frame', 'iframe');
    my @links;
    foreach my $link (@$links_ref) {   # the first element of each entry is the link itself
       push(@links, &globalize_url($link->[0], $url));
    }
    $tree->delete;   # HTML::Element trees are self-referential; free them explicitly
 return sort @links;
 }

 sub get_info {
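    # record the page's URL, <title> and meta keywords for the index file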
    my $url = shift;
    my $request = HTTP::Request->new('GET' => $url);
    $request->header('Accept' => 'text/html');
    my $response = $ua->request($request);
    my $html = $response->content;
    my ($title, $keywords);
    my $parser = HTML::HeadParser->new;
    $parser->parse($html);
    $title = $parser->header('Title') || 'Untitled Document';
    # parse_head(1) makes LWP copy <meta name="keywords" ...> into the
    # X-Meta-Keywords response header
    $keywords = $response->header('X-Meta-Keywords') || 'none';
    push (@pages, $url);
    $titles{$url} = $title;
    $keywords{$url} = $keywords;
    return;
 }
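
The bot only builds the index; searching it is left to the reader. Here is a minimal companion sketch (my own addition, not part of the original post; the script name searchindex.pl and the matching rule are assumptions) that greps the tab-separated indexed.txt for a keyword:

 #!/usr/bin/perl
 # searchindex.pl -- minimal sketch (not from the original post): search the
 # tab-separated indexed.txt written by the bot for a keyword
 use warnings;
 use strict;

 my $term = shift or die "Usage: searchindex.pl [keyword]\n";

 open(my $in, '<', 'indexed.txt') or die "Opening indexed.txt: $!";
 while (my $line = <$in>) {
    chomp $line;
    my ($url, $title, $keywords) = split /\t/, $line;
    # match the search term against both the title and the meta keywords
    if ("$title $keywords" =~ /\Q$term\E/i) {
       print "$title\n   $url\n";
    }
 }
 close($in);

Run the bot first (perl siteindexingbot.pl http://example.com/); then perl searchindex.pl perl prints the title and URL of every indexed page whose title or meta keywords mention "perl".
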
Original post: https://www.cnblogs.com/xiaoCon/p/2934512.html