perl 爬虫两个技巧

<pre name="code" class="cpp">jrhmpt01:/root/lwp# cat data.html 
     <div class="m-page J-ajax-page">
        <a class="changePage" page="1" href="javascript:void(0);">首页</a> <a class="changePage" page="11" href="javascript:void(0);">上一页</a>  <a class="changePage" page="11" href="javascript:void(0);">11</a>  <a class="cur" href="javascript:void(0)">12</a> <span class="page_info">12/12</span>
     </div>

    <div class="m-page J-ajax-page">
        <a class="changePage" page="1" href="javascript:void(0);">首页</a> <a class="changePage" page="11" href="javascript:void(0);">上一页</a>  <a class="changePage" page="11" href="javascript:void(0);">11</a>  <a class="cur" href="javascript:void(0)">12</a> <span class="page_info">12/12</span>
     </div>
	 
	 

jrhmpt01:/root/lwp# cat c1.pl 
# c1.pl (XPath variant): print the text of every <a class="changePage">
# element found under <body> in the local file data.html.
use strict;
use warnings;

use LWP::UserAgent;
use DBI;
use POSIX;
use Data::Dumper;
use HTML::TreeBuilder;
use HTML::TreeBuilder::XPath;

# HTTP client configuration. It is not used by this snippet, which only
# parses a local file; it is kept so the script matches the crawler it was
# taken from.
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->agent("Mozilla/8.0");

# Build the DOM tree. parse_file returns undef when the file cannot be
# opened, so fail loudly instead of printing an empty result.
my $tree = HTML::TreeBuilder::XPath->new;
$tree->parse_file("data.html")
    or die "cannot parse data.html: $!";

# findvalues (plural) returns one string per matching node; the XPath selects
# every <a> below <body> whose class attribute is exactly "changePage".
my @title = $tree->findvalues('/html/body//a[@class="changePage"]');

# The first '@' is escaped so the label is printed literally ("@title is ...")
# rather than interpolating the array a second time.
print "\@title is @title\n";

# Release the tree explicitly, as the HTML::Element documentation recommends
# for parsed trees.
$tree->delete;

jrhmpt01:/root/lwp# perl c1.pl
@title is 首页 上一页 11 首页 上一页 11

my @title=  $tree->findvalues('/html/body//a[@class="changePage"]');
表示在 body 下查找所有 class 属性为 "changePage" 的 a 标签,并以列表形式返回每个标签的文本内容


jrhmpt01:/root/lwp# cat c1.pl 
# c1.pl (find_by_tag_name variant): print the value of the "page" attribute
# of every <a> element in the local file data.html.
use strict;
use warnings;

use LWP::UserAgent;
use DBI;
use POSIX;
use Data::Dumper;
use HTML::TreeBuilder;
use HTML::TreeBuilder::XPath;

# HTTP client configuration. It is not used by this snippet, which only
# parses a local file; it is kept so the script matches the crawler it was
# taken from.
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->agent("Mozilla/8.0");

# Build the DOM tree. parse_file returns undef when the file cannot be
# opened, so fail loudly instead of printing nothing.
my $tree = HTML::TreeBuilder::XPath->new;
$tree->parse_file("data.html")
    or die "cannot parse data.html: $!";

# Collect every <a> element in the document.
my @anchors = $tree->find_by_tag_name('a');

for my $anchor (@anchors) {
    # attr() returns undef when the element has no such attribute
    # (e.g. the <a class="cur"> link for the current page).
    my $page = $anchor->attr('page');

    # Skip anchors without a usable page value. Testing defined/length
    # instead of plain truthiness keeps a legitimate page="0".
    next unless defined $page && length $page;

    # The '$' is escaped so the label "$_" is printed literally.
    print "\$_ is $page\n";
}

# Release the tree explicitly, as the HTML::Element documentation recommends
# for parsed trees.
$tree->delete;
jrhmpt01:/root/lwp# perl c1.pl 
$_ is 1
$_ is 11
$_ is 11
$_ is 1
$_ is 11
$_ is 11


遍历所有 a 标签,输出每个标签的 page 属性值(没有 page 属性的标签会被跳过)



   

原文地址:https://www.cnblogs.com/zhaoyangjian724/p/6200215.html