Hadoop and Spark Processing Techniques (4): Recommendation Engine Techniques

Frequently bought together items
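The input is a set of transaction records: each line begins with a transaction id (t1, t2, ...) followed by the ids of the products bought in that transaction. The goal is to count, for every pair of products, how many transactions contain both of them; this co-occurrence count is the basic signal behind a "frequently bought together" recommendation.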

scala> var file=sc.textFile("/user/ghj/togeterBought")
file: org.apache.spark.rdd.RDD[String] = /user/ghj/togeterBought MapPartitionsRDD[28] at textFile at <console>:25

scala> file.collect
res0: Array[String] = Array(t1 p1 p2 p3, t2 p2 p3, t3 p2 p3 p4, t4 p5 p6, t5 p3 p4)

scala> var mapFile=file.map(line=>{
     |   import scala.collection.mutable.ListBuffer;
     |   var listBuff=ListBuffer[(String,String)]();
     |   // drop the transaction id, keep only the product ids
     |   var list=line.split(" ").toList;
     |   var ll=list.takeRight(list.size-1);
     |   // pair every product with every other product in the same transaction,
     |   // normalized so the smaller id comes first; each unordered pair is emitted twice
     |   for(p1<-ll){
     |     for(p2<-ll){
     |       if(ll.indexOf(p1) != ll.indexOf(p2)){
     |         if(p1<p2){
     |           listBuff=listBuff:+((p1,p2));
     |         }else{
     |           listBuff=listBuff:+((p2,p1));
     |         }
     |       }
     |     }
     |   }
     |   listBuff;
     | }).flatMap(x=>x).map(x=>(x,1)).reduceByKey(_+_).map(x=>(x._1,x._2/2));
mapFile: org.apache.spark.rdd.RDD[((String, String), Int)] = MapPartitionsRDD[30] at map at <console>:46

scala> mapFile.collect
res4: Array[((String, String), Int)] = Array(((p5,p6),1), ((p1,p3),1), ((p2,p4),1), ((p3,p4),2), ((p2,p3),3), ((p1,p2),1))
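The nested loops above emit every unordered pair twice (once in each order), which is why the final count is divided by 2. A more direct way to get the same counts is to generate each pair exactly once with combinations(2) on the sorted product list. The snippet below is a minimal sketch that can be pasted into the same spark-shell session; it assumes the same sc and input path as above.

val pairCounts = sc.textFile("/user/ghj/togeterBought")
  .map(_.split(" ").toList.tail)              // drop the transaction id, keep the product ids
  .flatMap(_.sorted.combinations(2))          // each unordered pair is generated exactly once
  .map { case List(p1, p2) => ((p1, p2), 1) }
  .reduceByKey(_ + _)                         // number of transactions containing both products

pairCounts.collect()                          // should match the counts shown above

Because each pair is emitted only once per transaction, no division by 2 is needed at the end.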
Original article: https://www.cnblogs.com/gaohuajie/p/10137368.html