2014-05-02

#!/usr/bin/perl

use utf8;
use Data::Dumper qw(Dumper);
use HTML::Element;
use HTML::TreeBuilder;
use Scalar::Util qw(reftype);

#binmode(STDIN,':encoding(utf8)');
#binmode(STDOUT,':encoding(utf8)');
# Apply the "utf8" layer to standard output so wide characters from the
# parsed page can be printed.
binmode(STDOUT, "utf8");
#binmode(STDERR,':encoding(utf8)');
# Select Data::Dumper's indent mode 1 for any debugging dumps.
$Data::Dumper::Indent = 1;

#foreach my $file_name (@ARGV){
my $file_name = "huxiu-webDetail";

    # Open the input as UTF-8 and stop immediately when it cannot be read.
    # Previously a missing file was only reported and the script went on to
    # parse an unopened handle. The three-argument form of open keeps the
    # file name from being interpreted as an open mode.
    # The bareword handle DATA is kept on purpose: the cleanup code at the
    # end of the script closes the handle by that name.
    open(DATA, '<:encoding(UTF-8)', $file_name)
        or die "$file_name does not exist or cannot be opened: $!\n";

    # Build the element tree from the already-decoded file handle.
    my $tree = HTML::TreeBuilder->new;
    $tree->parse_file(*DATA);
   
#    $title = $tree->find_by_tag_name('title');
#    @desc = $tree->find_by_tag_name('description');
#    @link = $tree->find_by_tag_name('link');
#    @image = $tree->find_by_tag_name('image');
   
#    foreach(@title){
#        print $title," ";
#    }
#    $title = $tree->find_by_tag_name('title');
    $head = $tree->find_by_tag_name("head");
    $body = $tree->find_by_tag_name("body");
#    @metacontent = $meta->content_list;
#    print $head->{'_parent'}{'_content'}[1]{'_content'}[0]{'_content'}[0]{'_content'}[0]{'_content'}[0]{'_content'}[0]{'_content'}[0]{'src'}," ";
   
# _parent is a key of Hash,and the value is ref hash Array.
#_content is a key of Arry,and the value is ref hash Arry.
    $var_par = $head->{'_parent'};
    $var_con = $head->{'_content'};
    $var_tag = $head->{'_tag'};

    foreach $key( keys %head)
    {
#        print $key," ";
    }
#    print $var_con;
    foreach $key(keys %$var_par)
    {
#        print $key," ";
    }
    while(($key,$value)=each%$var_par)
    {
#        print "$key=>$value ";
    }
#########################################################
#                                                        #
# print ALL Hash key and Hash value in  Head`s _content #
#                                                        #
#########################################################
   
    print "========================================= ";
    my $icon_count = @$var_con - 1;
    for my $i (0 .. $icon_count)
    {
        my $hash = $var_con->[$i];
        foreach my $key(keys %$hash)
        {
#            print $key," ";
        }
    }
    print "======================================== ";
#    foreach $key(keys %($var_con[0]))
#    {
#        print $key," ";
#    }
#    foreach $key(keys (%$var_tag))
#    {
#        print $key," ";
#    }
#    foreach $key(keys %$body)
#    {
#        print $keys," ";
#    }

    print $var_par->{'_content'}," ";
    print $var_con," ";
    print $var_tag," ";
#    print $i=@$var_con," ";
#    print $var_par->{'_content'}[0]," ";
#    print $var_par->{'_content'}[0]{'_content'}[0]," ";
#    print $var_con->[0]," ";
    sub printcontent{
        my $vax = @_->[0];
        my $tag = @_->[1];
        my $icount = @$vax-1;
#        print $icount+1," ";
#        print  $vax->[0]," ";
        for my $i(0 .. $icount){
#            print  $i,$vax->[$i]," ";
#            print $i,$vax->[$i]{'_tag'}," ";
#            if( @$vax->[$i]{'_content'}!=())
#            {
#                print $i,":";
#                printcontent ($vax->[$i]{'_content'});
#            }
#            elsif($vax->[$i]{'content'}!=undef)
#            {
#                print $i,":";
#                printcontent ($vax[$i]{'content'});
#            }
#            else
#            {
                my $hash = $vax->[$i];
                foreach my $key(keys %$hash)
                {
                    if($key ne "_parent"){
                        print $i,":",$key,"=";
                        print $vax->[$i]{$key}," ";
                    }
                    elsif($key ==  '_content')
                    {
#                        Dumper $key," ";
                        if(@$vax->[$i]{'_content'}[0]{'_content'}!=()){
#                            print $i,":_content=",$vax->[$i]{'_content'}[0]," ";
                            printcontent($vax->[$i]{'_content'});
                        }
                        else{
                            print $i,":_content============",$vax->[$i]{'_content'}[0]," ";
                        }
                    }
                }
#            }
        }
    }
#    printcontent($var_par->{'_content'});

    printcontent ($var_par->{'_content'});
    print " ";
#    print  $head->{'_parent'}{'_content'}[1]{'_content'}[0]{'_content'}[1]{'_content'}[0]{'_content'}[0]{'_content'}[1]{'_content'}[0]{'_content'}[0]," ";
#    print $var_con->[1]{'_content'}[0]," ";
#    print $var_con->[2]{'content'}," ";
#    print $var_con->[2]{'_tag'}," ";
#    print $t=@$var;
#    print Dumper($head);
#    foreach( @metacontent)
#    {
#        print $_," ";
#    }
   
   
#    print Dumper($tree), " ";
#    print $title->as_text()," ";
#    print $body->as_text()," ";
#    :q@p = $tree->find_by_tag_name("body")->content_list;
#    @headcontent = $head->content_list;
#    @bodycontent = $body->content_list;   
   
#    print Dumper(@headcontent);
#    print Dumper(@bodycontent)," ";
#    foreach(@headcontent)
#    {
#        print $_->as_text()," " ;
#    }

    $tree = $tree->delete;
    close(DATA);
#}

功能

把HTML标签转化为perl的数据结构

找出tag和对应的值。

能够抓取网页的内容与格式。

不足:

_content 会被多打印一次,需要在第一个 if 语句中把它过滤掉。这是小问题,但类似的问题还比较多。

无法找到内容所对应的原始格式,即没有把内容与其原始格式关联起来。这是大问题,说明功能还不完善,也是下一步的工作重点。

原文地址:https://www.cnblogs.com/ppazhang/p/3703573.html