perl 大文本词频统计.

思路是:先设定每个子文本的最大长度(字数),把大文本按该长度分割成多个子文本分别统计,最后把各子文本的统计结果合并。

这里的"词"指当前位置的字与前一位置的字组成的二元组合(bigram),每出现一次就在 hash 中对该组合累加计数。

代码如下

use Encode;   ##编码解码
## Main script: count bigrams in train.txt chunk by chunk, merge the
## per-chunk counts into bi.txt, then count single-character frequencies.
print scalar(localtime),"\n";  ## start time (portable; "time /t" only exists on Windows)
$g_MaxBiNum=1000000;  ## maximum number of characters counted per temporary chunk
BiCount("train.txt");
MergeBi(\@BiTmp,"bi.txt");  ## MergeBi takes a REFERENCE to the list of chunk files
unlink(@BiTmp);  ## remove the temporary chunk files
print scalar(localtime),"\n";   ## end time

TrainWordToNum();  ## count character frequencies; parentheses are required because the sub is defined later in the file


## BiCount($File)
##
## Counts adjacent-character pairs (bigrams) in a text file and writes the
## counts to temporary files "tmp_0", "tmp_1", ... in the current directory.
## A byte with the high bit set is treated as the first byte of a two-byte
## character (assumes a GBK-style double-byte encoding); any other byte is a
## one-byte character.  Whitespace is removed before counting, and the
## "previous character" is carried over from one line to the next.
##
## A temp file is written each time more than $g_MaxBiNum characters have been
## counted (1000000 if that global is undefined), and once more at the end for
## the remaining counts.  Each temp file holds "PREV_CUR<TAB>COUNT" lines
## sorted by key, which is the order MergeBi relies on.
##
## Side effects: resets and fills the package global @BiTmp with the temp file
## names.  Returns that list of names.  Dies if a file cannot be opened or
## written.
sub BiCount
{
	my($File)=@_;
	our(@BiTmp,$g_MaxBiNum);	## package globals shared with the main script
	my $MaxZi=defined($g_MaxBiNum) ? $g_MaxBiNum : 1000000;
	my $BiFile="tmp";
	my %hashBi;		## bigram => count for the current chunk
	my $ZiNum=0;		## characters counted in the current chunk
	my $H1="";		## previous character ("" until the first one is seen)
	@BiTmp=();

	## Write the current chunk's counts to the next temp file and start over.
	my $Flush=sub {
		my $BiFileTmp=$BiFile."_".scalar(@BiTmp);
		open(my $Out,'>',$BiFileTmp) or die "Cannot write $BiFileTmp: $!";
		foreach my $Bi (sort keys %hashBi){
			print $Out "$Bi\t$hashBi{$Bi}\n";
		}
		close($Out) or die "Cannot close $BiFileTmp: $!";
		push(@BiTmp,$BiFileTmp);
		print "$BiFileTmp done!\n";
		%hashBi=();
		$ZiNum=0;
	};

	open(my $In,'<',$File) or die "Cannot open $File: $!";
	while(my $Line=<$In>){
		chomp($Line);
		$Line=~s/\s+//g;	## strip whitespace (on byte strings \s only matches ASCII whitespace)
		my $Total=length($Line);
		my $Pos=0;
		## Walk the line with an offset instead of re-slicing it per character.
		while( $Pos < $Total ){
			my $Len=( ord(substr($Line,$Pos,1)) & 0x80 ) ? 2 : 1;
			my $H2=substr($Line,$Pos,$Len);
			$hashBi{$H1."_".$H2}++ if $H1 ne "";
			$H1=$H2;
			$ZiNum++;
			$Flush->() if $ZiNum > $MaxZi;
			$Pos+=$Len;
		}
	}
	close($In);

	## The last chunk is usually below the limit; without this its counts are lost.
	$Flush->() if %hashBi;
	return @BiTmp;
}

## MergeBi($RefBiFileList,$Merged)
##
## Merges several count files into one.  Every input file must contain
## "KEY<TAB>COUNT" lines sorted by KEY in Perl's default string order (the
## format BiCount writes).  Counts of identical keys are summed and the result
## is written to $Merged in the same format and order.
##
## $RefBiFileList is a reference to the array of input file names.  For
## backward compatibility a flattened call, MergeBi(@files,$Merged), is also
## accepted: the last argument is always the output file.
##
## Only one pending line per input file is held in memory.  Lines that do not
## match the expected format are skipped.  Dies if a file cannot be opened or
## the output cannot be written.
sub MergeBi
{
	my @Args=@_;
	my $Merged=pop(@Args);
	my @Files=( @Args && ref($Args[0]) eq 'ARRAY' ) ? @{$Args[0]} : @Args;

	my @In;		## one read handle per input file, indexed like @Files
	my %hash;	## key => { file index => count } for each file's pending line

	## Load the next well-formed line of input file $Idx into %hash (no-op at EOF).
	my $ReadNext=sub {
		my($Idx)=@_;
		my $Fh=$In[$Idx];
		while( defined(my $Row=<$Fh>) ){
			if ( $Row=~/(\S+)\t(\d+)/ ){
				$hash{$1}{$Idx}=$2;
				return;
			}
		}
		return;
	};

	open(my $Out,'>',$Merged) or die "Cannot write $Merged: $!";
	foreach my $Idx (0..$#Files){
		open($In[$Idx],'<',$Files[$Idx]) or die "Cannot open $Files[$Idx]: $!";
		$ReadNext->($Idx);
	}

	## Repeatedly emit the smallest pending key, then advance every file that
	## contributed to it.  %hash never holds more keys than there are files.
	while( %hash ){
		my($Bi)=sort keys %hash;
		my $Src=delete $hash{$Bi};
		my $Num=0;
		$Num+=$_ foreach values %{$Src};
		print $Out "$Bi\t$Num\n";
		$ReadNext->($_) foreach keys %{$Src};
	}

	close($_) foreach @In;
	close($Out) or die "Cannot close $Merged: $!";
	return;
}

## TrainWordToNum([$File])
##
## Counts how often each character occurs in a GBK-encoded text file.
## $File defaults to "train.txt".  Each line is decoded from GBK, split into
## characters (newlines excluded), and every character is re-encoded to GBK
## bytes and used as a key of the package global %Word2Num, whose value is
## incremented.  Counts accumulate across calls; %Word2Num is never reset here.
##
## Returns a reference to %Word2Num.  Dies if the file cannot be opened.
sub TrainWordToNum{
	my($File)=@_;
	$File="train.txt" unless defined $File;
	our %Word2Num;	## package global so existing code can still read the counts
	open(my $In,'<',$File) or die "Cannot open $File: $!";
	while(my $Row=<$In>)
	{
		chomp($Row);
		## Decode first so that /./ matches whole characters, not single bytes.
		foreach my $Zi (decode("GBK",$Row)=~/./g)
		{
			$Word2Num{encode("GBK",$Zi)}++;
		}
	}
	close($In);
	return \%Word2Num;
}

  

原文地址:https://www.cnblogs.com/cagercoding/p/6910829.html