Twenty Newsgroups Classification任务之二seq2sparse(3)

接上篇,如果想对上篇的问题进行测试其实可以简单的编写下面的代码:

package mahout.fansy.test.bayes.write;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.common.Pair;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;

/**
 * Stand-alone driver that writes a small dictionary-style SequenceFile
 * (Text key -> IntWritable index) so that the behaviour of
 * {@link SequenceFile.Writer} can be inspected in a debugger.
 */
public class TestCreateDictionaryChunks {
	/**
	 * Writes the sample dictionary to a fixed HDFS location.
	 *
	 * @param args unused
	 * @throws IOException if the file cannot be created, written or closed
	 */
	public static void main(String[] args) throws IOException {
		String output="hdfs://ubuntu:9000/user/test/test_dictionary";
		writeToPath(output);
	}

	/**
	 * Writes every key returned by {@link #getInputData()} to a SequenceFile at
	 * {@code output}, pairing each key with a running index that starts at 0 and
	 * follows the list order. The values of the input pairs are not written.
	 *
	 * @param output URI of the SequenceFile to create
	 * @throws IOException if the file cannot be created or written, or if closing
	 *         the writer fails after all records were appended successfully
	 */
	public static void writeToPath(String output) throws IOException{
		Path path=new Path(output);
		Configuration conf=new Configuration();
		conf.set("mapred.job.tracker", "ubuntu:9001");
		FileSystem fs=FileSystem.get(path.toUri(),conf);

		SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, path, Text.class, IntWritable.class);
		int i = 0;
		// Stays true until the loop completes, so the finally block can tell a
		// failed write from a successful one.
		boolean threw = true;
		try {
			for (Pair<Writable,Writable> record : getInputData()) {
				Writable key = record.getFirst();
				dictWriter.append(key, new IntWritable(i++));
			}
			threw = false;
		} finally {
			// Swallow a close() failure only when an exception is already propagating;
			// on the success path a failed close is reported to the caller instead of
			// being silently dropped.
			Closeables.close(dictWriter, threw);
		}
	}

	/**
	 * Builds the in-memory sample records: seven pairs of a Text key
	 * ("a1" .. "a7") and a LongWritable value.
	 *
	 * @return a new mutable list holding the sample pairs in insertion order
	 */
	public static List<Pair<Writable,Writable> > getInputData(){
		List<Pair<Writable,Writable> > records= Lists.newArrayList();
		records.add(new Pair<Writable, Writable>(new Text("a1"),new LongWritable(93)));
		records.add(new Pair<Writable, Writable>(new Text("a2"),new LongWritable(43)));
		records.add(new Pair<Writable, Writable>(new Text("a3"),new LongWritable(33)));
		records.add(new Pair<Writable, Writable>(new Text("a4"),new LongWritable(32)));
		records.add(new Pair<Writable, Writable>(new Text("a5"),new LongWritable(31)));
		records.add(new Pair<Writable, Writable>(new Text("a6"),new LongWritable(23)));
		records.add(new Pair<Writable, Writable>(new Text("a7"),new LongWritable(83)));
		return records;
	}
}

然后在writeToPath方法的try语句处(原代码约39行,左右也行)设置断点,即可查看dictWriter变量的属性变化。额,我设了断点,但是它的属性太多了,而且字符好像是按编码值存储的(比如a用97表示,即其ASCII码),想看懂压力山大,所以还没看懂。

撇开上面的暂时不管,继续往下看,看到这里的调用:

makePartialVectors(input, baseConf, maxNGramSize, dictionaryChunk, partialVectorOutputPath,
        maxTermDimension[0], sequentialAccess, namedVectors, numReducers);

这个是第三个Job任务了,这个任务的Mapper就是Hadoop自带的基类Mapper(默认把输入的键值对原样输出),所以Mapper应该是没有做什么任务的;reducer是TFPartialVectorReducer,额,reducer有点复杂,至少我看了一遍看的不是很明白,不知道到底代码是要干嘛的,所以我就想另辟蹊径看怎么才能知道这个reducer是干嘛的,所以。。。

所以我就直接读出上面Reducer的输出不就可以了,好吧,查看job的详细信息,额,输出的文件夹是/home/mahout/mahout-work-mahout/20news-vectors/partial-vectors-0,但是怎么找不到?额,当我看到下面的代码的时候,才发现,原来被删除了:

HadoopUtil.delete(conf, partialVectorPaths);

额,好的。那就只能设置断点了:在DictionaryVectorizer的203行设置断点就ok了,然后直接debug,ok,产生了这个文件,编写下面的代码进行读取:

package mahout.fansy.test.bayes.read;

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.math.VectorWritable;

/**
 * Debugging helper that loads the values of a partial-vectors SequenceFile
 * into memory so they can be inspected.
 */
public class ReadPartialVectors0 {

	/**
	 * Reads the hard-coded part file; the returned list is not used further and
	 * is intended to be examined from a debugger.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		Configuration conf = new Configuration();
		conf.set("mapred.job.tracker", "ubuntu:9001");
		// Alternative input: hdfs://ubuntu:9000/home/mahout/mahout-work-mahout_bak/partial-vectors-0
		String path = "hdfs://ubuntu:9000/home/mahout/mahout-work-mahout0/20news-vectors/partial-vectors-0/part-r-00000";

		getValue(path, conf);
	}

	/**
	 * Reads every value of the sequence file(s) at {@code path} into a list.
	 *
	 * @param path location of the sequence file
	 * @param conf Hadoop configuration used to open the file
	 * @return the values read, in iteration order
	 * @throws IllegalStateException if a value's class is not exactly VectorWritable
	 */
	public static List<VectorWritable> getValue(String path,Configuration conf){
		Path hdfsPath = new Path(path);
		List<VectorWritable> vectors = new ArrayList<VectorWritable>();
		Iterable<Writable> values = new SequenceFileDirValueIterable<Writable>(
				hdfsPath, PathType.LIST, PathFilters.partFilter(), conf);
		for (Writable current : values) {
			Class<? extends Writable> actualClass = current.getClass();
			// Reject anything that is not exactly a VectorWritable before casting.
			if (!actualClass.equals(VectorWritable.class)) {
				throw new IllegalStateException("Bad value class: " + actualClass);
			}
			vectors.add((VectorWritable) current);
		}
		return vectors;
	}
}

首先说下这个Job的输入是input-folder: /home/mahout/mahout-work-mahout/20news-vectors/tokenized-documents, dictionary-file: /home/mahout/mahout-work-mahout/20news-vectors/dictionary.file-0,设置断点进行第一个value的读取,读取到的VectorWritable有三个属性分别是:conf、vector、writesLaxPrecision,重点关注vector属性,点开这个属性,可以看到这个属性还包含两个属性:delegate、name,分别点击可以看到下面的属性值:

delegate:

{2860:1.0,77227:1.0,3891:1.0,6907:1.0,93219:1.0,8840:1.0,11880:1.0,52668:1.0,14985:1.0,61480:1.0,17162:1.0,17967:1.0,74642:1.0,20094:1.0,21897:1.0,23545:1.0,8768:1.0,26088:1.0,89680:2.0,27788:1.0,30497:2.0,32005:1.0,34520:1.0,24587:1.0,36683:1.0,24585:1.0,37499:1.0,39074:1.0,41504:1.0,43181:1.0,15782:1.0,44299:1.0,80863:1.0,47408:1.0,48633:1.0,50212:2.0,52684:1.0,53133:2.0,56028:1.0,57362:1.0,72034:1.0,59242:1.0,60435:1.0,62310:1.0,63329:3.0,65367:1.0,87875:3.0,67166:1.0,80837:1.0,68924:1.0,31669:1.0,70770:2.0,12231:1.0,91347:2.0,75330:1.0,68516:1.0,75991:2.0,78034:1.0,79870:1.0,56197:1.0,81770:1.0,19256:4.0,83383:1.0,86089:1.0,26422:1.0,87772:1.0,61466:1.0,42221:1.0,31560:1.0,86088:1.0,36835:3.0,12207:1.0,36832:6.0,10574:1.0,43931:1.0,10643:1.0,79071:1.0,15710:1.0,15709:1.0,52646:1.0,15705:1.0,52744:1.0,28015:1.0,10662:1.0,61561:1.0,7929:1.0,68463:1.0,64943:4.0,36948:1.0,63178:1.0,71972:1.0,80760:1.0,31687:1.0,1600:2.0,7182:1.0,80749:1.0,89671:1.0,13903:1.0,36961:1.0,71947:3.0,13899:2.0,17414:1.0,73700:2.0,71940:6.0,86011:1.0,13890:1.0,29719:4.0,8610:1.0,20916:1.0,31095:1.0,63129:1.0,87754:1.0,21259:13.0,15291:1.0,33214:2.0,89716:2.0,31450:3.0,70143:2.0,80693:1.0,41993:3.0,57822:1.0,65162:1.0,27916:1.0,68370:8.0,47257:1.0,70123:1.0,47255:1.0,85949:1.0,71871:1.0,45485:1.0,31411:1.0,68345:2.0,31405:1.0,61306:1.0,6776:1.0,36678:1.0,77482:1.0,43711:2.0,75372:1.0,80645:1.0,82667:2.0,19077:1.0,15558:2.0,31388:1.0,61285:2.0,31381:1.0,89423:1.0,38411:1.0,48963:3.0,91177:3.0,19054:1.0,45438:1.0,31365:1.0,40154:1.0,43668:8.0,91160:1.0,91159:1.0,68290:1.0,66529:3.0,45419:1.0,45418:5.0,91150:6.0,47068:1.0,59485:1.0,52448:1.0,55965:3.0,13748:2.0,8466:1.0,38944:2.0,13737:1.0,66504:1.0,31929:1.0,68261:1.0,11972:1.0,15487:1.0,19004:1.0,1413:1.0,33073:1.0,54175:4.0,55932:4.0,55931:1.0,64725:3.0,85832:1.0,27784:2.0,38971:1.0,66479:1.0,61200:1.0,31956:1.0,10722:1.0,36563:1.0,82862:2.0,8414:1.0,18961:1.0,1369:1.0,18958:2.0,68843:5.0,41467:1.0,76989:2.0,54813:2.0,82894:2.0,59393:1.0,51
309:1.0,19722:1.0,68863:1.0,64664:2.0,17168:5.0,6613:1.0,92802:1.0,25959:1.0,70000:1.0,11885:1.0,61132:1.0,92793:1.0,38263:1.0,78719:1.0,91031:1.0,21501:3.0,45290:1.0,92781:2.0,46077:2.0,84688:1.0,60120:1.0,67142:1.0,27691:3.0,40003:1.0,82217:2.0,43517:1.0,43516:1.0,71659:1.0,27682:1.0,38234:1.0,6569:1.0,20639:2.0,6562:1.0,40840:1.0,17112:2.0,10075:1.0,90987:4.0,71636:1.0,11828:1.0,61078:1.0,78667:1.0,90978:1.0,34687:1.0,57553:1.0,22366:1.0,25883:1.0,8292:2.0,31158:1.0,27639:1.0,73371:2.0,38189:1.0,90958:2.0,40876:3.0,22355:1.0,24112:1.0,48729:1.0,20583:1.0,25859:2.0,90939:1.0,43445:2.0,41413:1.0,59266:3.0,41672:1.0,75090:1.0,11080:1.0,59248:1.0,66281:1.0,80352:2.0,39893:1.0,38131:1.0,36371:1.0,38128:3.0,2947:1.0,2946:1.0,2944:1.0,80339:2.0,78577:3.0,11114:1.0,27559:1.0,45148:1.0,68012:1.0,41626:1.0,63772:1.0,83839:2.0,90874:1.0,55693:1.0,90870:1.0,90869:1.0,6436:1.0,6434:1.0,36336:1.0,78551:3.0,89104:1.0,34574:1.0,2910:1.0,74324:2.0,48637:1.0,36323:1.0,62706:2.0,55669:1.0,38077:1.0,34556:1.0,31036:1.0,34553:3.0,39829:2.0,9924:1.0,46861:2.0,53896:1.0,22232:3.0,46857:2.0,46856:3.0,45096:1.0,66201:1.0,53887:1.0,76751:1.0,32773:2.0,9904:1.0,85538:2.0,64425:1.0,5924:1.0,55628:1.0,11652:1.0,55623:1.0,66175:1.0,90173:1.0,71450:1.0,11638:1.0,76720:3.0,43295:2.0,6353:1.0,30978:2.0,16905:1.0,39297:1.0,85501:1.0,23935:1.0,48560:1.0,67906:2.0,53833:1.0,55589:1.0,53829:1.0,23923:1.0,50303:2.0,67892:1.0,86705:1.0,31087:1.0,27432:2.0,16875:1.0,3870:1.0,51612:1.0,32701:1.0,60844:2.0,64055:1.0,60842:1.0,67876:1.0,6309:1.0,67873:3.0,41486:1.0,17926:1.0,62589:2.0,44992:2.0,46746:1.0,66094:1.0,80165:1.0,87199:1.0,67847:2.0,43220:1.0,67845:1.0,90710:4.0,53769:1.0,89938:2.0,87182:1.0,41445:15.0,66070:1.0,87177:4.0,43199:1.0,56938:1.0,87171:3.0,30881:1.0,7803:1.0,78370:3.0,67811:1.0,67810:2.0,48459:1.0,44940:1.0,85393:1.0,16791:2.0,957:1.0,17085:1.0,83627:1.0,27200:1.0,36132:1.0,90652:1.0,63999:2.0,41396:1.0,9732:1.0,24119:1.0,37874:1.0,39435:2.0,27317:1.0,6207:1.0,30832:1.0,83600:1.0,14
915:1.0,55453:1.0,46656:1.0,43136:1.0,43134:1.0,71277:1.0,11469:2.0,73032:2.0,51742:2.0,74789:1.0,78304:3.0,88857:2.0,60706:2.0,64041:1.0,6127:1.0,34317:1.0,23679:1.0,48387:5.0,55422:1.0,44744:1.0,20233:1.0,34301:1.0,20227:1.0,58923:1.0,20223:8.0,87063:1.0,69333:4.0,83543:1.0,83538:1.0,39560:14.0,18449:1.0,83532:2.0,25484:1.0,27241:1.0,80010:1.0,34275:1.0,71212:1.0,34271:2.0,46542:2.0,32503:1.0,14917:2.0,53613:1.0,71118:1.0,76475:1.0,6195:1.0,69428:1.0,27209:1.0,62388:1.0,23689:1.0,27205:1.0,818:1.0,83488:1.0,14886:1.0,62371:2.0,43028:1.0,39508:1.0,44784:2.0,37747:2.0,55336:1.0,55335:2.0,14876:4.0,62365:3.0,48290:1.0,71150:1.0,53559:1.0,50533:1.0,72928:1.0,16615:1.0,27708:1.0,21889:1.0,16608:1.0,44750:1.0,20123:1.0,23801:1.0,25396:2.0,83442:1.0,25394:4.0,16598:2.0,23810:1.0,18546:1.0,25385:4.0,30661:1.0,81667:6.0,41372:1.0,37690:1.0,71107:1.0,76383:2.0,25371:1.0,8039:3.0,2496:1.0,39434:1.0,55429:1.0,6011:1.0,67716:1.0,51743:1.0,2490:1.0,60536:1.0,86920:1.0,27109:1.0,30626:1.0,23589:2.0,1038:5.0,79872:1.0,49968:3.0,14787:1.0,76350:2.0,20061:1.0,34132:2.0,27095:3.0,53479:1.0,67550:1.0,39404:2.0,2464:1.0,71064:1.0,71063:1.0,85132:1.0,18286:2.0,67536:1.0,66001:6.0,76329:1.0,71049:1.0,69516:1.0,53722:2.0,85118:1.0,64254:3.0,82194:1.0,27066:1.0,62244:1.0,64002:1.0,66017:1.0,64637:1.0,59115:1.0,83347:1.0,42889:1.0,71663:1.0,46405:1.0,8115:2.0,36548:1.0,23525:1.0,62222:2.0,39184:2.0,16484:3.0,1109:1.0,30550:1.0,37583:2.0,19991:2.0,32303:1.0,46372:1.0,67116:1.0,11186:1.0,60437:1.0,83302:1.0,11181:1.0,37989:1.0,72742:1.0,28763:1.0,28762:1.0,49868:2.0,16446:1.0,28758:1.0,39310:1.0,90320:1.0,83283:1.0,18199:1.0,72726:2.0,16436:1.0,83277:1.0,74866:2.0,30504:2.0,55565:3.0,83271:3.0,90303:1.0,72712:1.0,90299:1.0,14661:1.0,81901:1.0,7621:1.0,8195:2.0,53826:1.0,90289:1.0,64360:1.0,23996:1.0,25200:1.0,48065:1.0,80163:1.0,37508:1.0,52087:1.0,9360:1.0,51574:1.0,18747:1.0,62126:3.0,48053:1.0,28703:1.0,18146:2.0,5871:1.0,87202:2.0,26935:1.0,90257:3.0,5824:1.0,19891:1.0,32202:1.0,40994:1.
0,76173:2.0,72652:1.0,32191:1.0,36615:3.0,37466:2.0,27558:1.0,72639:1.0,58564:2.0,28660:2.0,53284:1.0,24065:1.0,69109:1.0,21615:2.0,90214:1.0,53905:1.0,25827:1.0,19849:1.0,88449:1.0,84930:1.0,49749:1.0,64446:1.0,59182:1.0,44466:6.0,39187:1.0,53257:1.0,88436:2.0,86675:3.0,9278:1.0,33902:1.0,55684:1.0,84910:1.0,35657:1.0,17078:4.0,76106:2.0,11823:2.0,44439:5.0,90166:10.0,19805:1.0,11834:2.0,19803:1.0,86643:1.0,90160:1.0,19798:3.0,30351:1.0,81360:1.0,52209:1.0,33864:1.0,18032:1.0,18031:1.0,76077:1.0,54220:1.0,49689:1.0,30336:1.0,19980:1.0,90139:2.0,70788:1.0,70787:1.0,27659:1.0,70784:1.0,76059:1.0,35599:1.0,81332:1.0,14489:4.0,10970:1.0,6610:1.0,14484:1.0,83206:2.0,88360:1.0,81323:1.0,23274:1.0,46992:1.0,90105:3.0,72514:1.0,27690:1.0,60199:1.0,17981:1.0,40847:4.0,90875:1.0,28527:1.0,53151:1.0,17970:1.0,47870:2.0,76013:1.0,27709:1.0,7409:1.0,3887:1.0,55798:1.0,60169:3.0,3878:1.0,3875:1.0,19704:1.0,3872:1.0,16184:1.0,81266:1.0,9142:1.0,3864:1.0,17935:1.0,7379:1.0,65425:1.0,33008:1.0,3855:1.0,72455:5.0,17925:2.0,40791:1.0,31993:2.0,19674:1.0,86515:1.0,75960:1.0,75958:3.0,9112:1.0,88264:2.0,67155:1.0,82985:1.0,88261:1.0,70670:1.0,56597:1.0,70664:1.0,51314:1.0,44277:1.0,21409:1.0,82973:1.0,90008:1.0,53068:1.0,88246:1.0,93522:1.0,63618:3.0,68894:1.0,88242:3.0,65370:1.0,19634:1.0,75921:1.0,31945:1.0,63605:1.0,51291:1.0,56566:1.0,15524:2.0,80460:2.0,63597:3.0,58317:2.0,19617:1.0,24893:1.0,24890:2.0,89971:1.0,17851:1.0,17850:2.0,17848:10.0,17847:11.0,30159:4.0,36604:1.0,31344:3.0,54778:2.0,12560:1.0,2005:2.0,28389:1.0,31906:3.0,79778:1.0,79397:1.0,67082:1.0,16068:1.0,70595:1.0,47726:1.0,38383:1.0,75866:1.0,92791:1.0,75247:1.0,31883:1.0,15699:3.0,51222:1.0,52979:1.0,19556:1.0,9000:2.0,3722:1.0,66498:2.0,45928:3.0,38887:1.0,45922:1.0,38428:6.0,3701:1.0,33601:1.0,36680:1.0,70538:1.0,85079:1.0,16006:1.0,54700:1.0,63016:1.0,24787:1.0,66534:1.0,86348:1.0,63028:1.0,40611:1.0,33574:1.0,10706:1.0,65234:1.0,3668:1.0,58194:1.0,81060:1.0,21251:4.0,38840:1.0,86331:3.0,44110:1.0,86325:1.0,81
047:1.0,63450:1.0,70080:1.0,56411:3.0,35301:1.0,63444:2.0,82791:4.0,15947:1.0,71847:1.0,60529:1.0,68709:1.0,56198:1.0,91161:2.0,15933:3.0,77496:1.0,15929:5.0,17687:3.0,15926:1.0,71981:1.0,17683:2.0,65172:1.0,37025:1.0,58130:1.0,17671:1.0,72198:1.0,65161:1.0,65159:3.0,13970:1.0,70432:1.0,65154:1.0,84502:1.0,14139:1.0,56351:1.0,72178:1.0,86248:2.0,51067:1.0,47543:1.0,61439:4.0,71912:1.0,19394:1.0,64955:1.0,45773:1.0,14109:1.0,26421:1.0,33351:1.0,15792:2.0,41254:1.0,77422:1.0,17615:1.0,42240:1.0,68624:1.0,65105:1.0,15779:1.0,66858:1.0,7050:1.0,8769:1.0,70195:2.0,3526:1.0,7043:1.0,10560:1.0,86196:1.0,77400:1.0,0:6.0}

name:

/alt.atheism/49960

看到这两个属性值后,大概可以猜测这个Job的作用是把输入tokenized-documents中每个文件的单词都先转换为dictionary中对应的数值(单词编号),然后再在后面加上该单词在这个文件中出现的次数,即可得到delegate的值了(格式为 编号:次数),当然这个也只是猜测,不过估计八九不离十吧。

再说吧。。。

分享,快乐,成长


转载请注明出处:http://blog.csdn.net/fansy1990 


原文地址:https://www.cnblogs.com/suncoolcat/p/3292111.html