scala实现wordcount方法-商品标签统计-气温统计

scala实现单词统计
---------------------
    import scala.io.Source

    /**
      * Word-count demo: reads a text file and prints how often each word occurs.
      */
    object WCApp {
        /** Location of the text file whose words are counted. */
        private val InputPath = "d:/mr/word.txt"

        /**
          * Loads the input file, counts its words and prints one
          * `(word,count)` pair per line.
          *
          * @param args command-line arguments (unused)
          */
        def main(args: Array[String]): Unit = {
            // 1. Load the file. All lines are materialised eagerly so the
            //    underlying handle can be released before any processing starts.
            val src = Source.fromFile(InputPath)
            val lines = try src.getLines().toList finally src.close()

            // 2. Print the word counts (the earlier version computed the
            //    counts but printed the raw input lines instead).
            countWords(lines).foreach(println)
        }

        /**
          * Counts the occurrences of every whitespace-separated word.
          *
          * @param lines the lines of text to scan
          * @return a map from each distinct word to its number of occurrences
          */
        private def countWords(lines: List[String]): Map[String, Int] = {
            // Flatten the lines into words. Splitting on runs of whitespace and
            // dropping empty tokens keeps blank lines and repeated spaces from
            // being counted as an empty-string "word".
            val words = lines.flatMap(_.split("\\s+")).filter(_.nonEmpty)

            // Tag every word with 1, group the pairs by word, and take the
            // size of each group as that word's count.
            val tagged = words.map(word => (word, 1))
            tagged.groupBy(_._1).map { case (word, group) => word -> group.size }
        }

    }



scala实现单词统计2
---------------------
    import scala.io.Source

    /**
      * Word-count demo, second variant: the per-word total is obtained by
      * reducing each group of `(word, 1)` pairs instead of taking the group size.
      */
    object WCApp2 {
        /** Location of the text file whose words are counted. */
        private val InputPath = "d:/mr/word.txt"

        /**
          * Loads the input file, counts its words and prints one
          * `(word,count)` pair per line.
          *
          * @param args command-line arguments (unused)
          */
        def main(args: Array[String]): Unit = {
            // 1. Load the file. All lines are materialised eagerly so the
            //    underlying handle can be released before any processing starts.
            val src = Source.fromFile(InputPath)
            val lines = try src.getLines().toList finally src.close()

            // 2. Count and print.
            countWords(lines).foreach(println)
        }

        /**
          * Counts the occurrences of every whitespace-separated word by summing
          * the 1s attached to each occurrence.
          *
          * @param lines the lines of text to scan
          * @return a map from each distinct word to its number of occurrences
          */
        private def countWords(lines: List[String]): Map[String, Int] = {
            // Flatten the lines into words. Splitting on runs of whitespace and
            // dropping empty tokens keeps blank lines and repeated spaces from
            // being counted as an empty-string "word".
            val words = lines.flatMap(_.split("\\s+")).filter(_.nonEmpty)

            // Tag every word with 1: List((hello,1), (world,1), (hello,1), ...)
            val tagged = words.map(word => (word, 1))

            // Group by word: {hello -> List((hello,1), (hello,1), ...), ...}
            // then add up the 1s of each group: {hello -> 4, ...}.
            // `reduce` is safe here because groupBy never produces an empty group.
            tagged.groupBy(_._1).map { case (word, group) =>
                word -> group.map(_._2).reduce(_ + _)
            }
        }

    }




Bitmap实现气温topN统计
------------------------
    import scala.io.Source

    /**
      * Per-year top-N temperature query implemented with a bitmap.
      *
      * Every temperature observed in a year sets one bit of a fixed-size
      * bitmap; scanning the bitmap from the highest bit downwards then yields
      * the largest temperatures first. Because a bit can only be set once,
      * duplicate readings collapse and the result lists distinct values.
      */
    object TempTopN2_Bitmap {
        /** Size of the per-year bitmap in bytes. */
        private val BitmapBytes = 128

        /** Number of bits in the bitmap; representable temperatures are 0 until BitmapBits. */
        private val BitmapBits = BitmapBytes * 8

        /** How many of the highest temperatures are reported per year. */
        private val TopN = 3

        /**
          * Reads `year temperature` lines from the data file and prints, for
          * every year in ascending order, a `(year,t1,t2,t3)` tuple holding the
          * highest distinct temperatures in descending order.
          *
          * @param args command-line arguments (unused)
          */
        def main(args: Array[String]): Unit = {
            // 1. Load the temperature file. Lines are materialised eagerly so
            //    the handle can be released before any processing starts.
            val f = Source.fromFile("d:/mr/temp.dat")
            val temps = try f.getLines().toList finally f.close()

            // 2. Turn every non-blank line into a (year, temperature) tuple,
            //    e.g. List((1900,28), ...). Fields may be separated by any run
            //    of whitespace.
            val readings = temps.map(_.trim).filter(_.nonEmpty).map { line =>
                val arr = line.split("\\s+")
                (arr(0).toInt, arr(1).toInt)
            }

            // 3. Group by year and reduce each group to its top-N string.
            val topByYear = readings.groupBy(_._1).map { case (year, group) =>
                year -> topTemps(toBitmap(group.map(_._2)), TopN)
            }

            // 4. Print the years in ascending order.
            topByYear.toList.sortBy(_._1).foreach(println(_))
        }

        /**
          * Builds a bitmap in which bit `t` is set for every temperature `t`
          * in the input.
          *
          * @param temps the temperatures to record
          * @return a `BitmapBytes`-long array; bit `t % 8` of byte `t / 8` is
          *         set when `t` occurred. Temperatures outside
          *         `0 until BitmapBits` cannot be represented and are skipped.
          */
        private def toBitmap(temps: List[Int]): Array[Byte] = {
            val bitmap = new Array[Byte](BitmapBytes)
            for (temp <- temps if temp >= 0 && temp < BitmapBits) {
                val index = temp / 8
                bitmap(index) = (bitmap(index) | (1 << (temp % 8))).toByte
            }
            bitmap
        }

        /**
          * Lists the `n` highest set bits of the bitmap.
          *
          * @param bitmap bitmap produced by [[toBitmap]]
          * @param n      maximum number of values to report
          * @return the values joined by commas in descending order; fewer than
          *         `n` values (possibly an empty string) when fewer bits are set
          */
        private def topTemps(bitmap: Array[Byte], n: Int): String = {
            // Lazily walk the bits from the most significant end so that
            // `take(n)` stops the scan as soon as enough values were found.
            val descending = for {
                byteIndex <- (bitmap.length - 1 to 0 by -1).iterator
                bit <- (7 to 0 by -1).iterator
                if ((bitmap(byteIndex) >> bit) & 1) != 0
            } yield 8 * byteIndex + bit

            descending.take(n).mkString(",")
        }
    }

scala实现商品评论标签统计
---------------------
    1.TagUtil.java
        package com.oldboy.scala.util;

        import com.alibaba.fastjson.JSON;
        import com.alibaba.fastjson.JSONArray;
        import com.alibaba.fastjson.JSONObject;

        import java.util.ArrayList;
        import java.util.List;

        /**
         * Utility for extracting review tags from a JSON record.
         */
        public class TagUtil {
            /**
             * Extracts the review tags from a JSON string.
             *
             * <p>The tags are read from the {@code values} array of the first
             * element of the top-level {@code extInfoList} array; any further
             * elements of {@code extInfoList} are ignored.
             *
             * @param json JSON text of a single record
             * @return the tags in document order; an empty list (never
             *         {@code null}) when the input parses to nothing or when
             *         {@code extInfoList} / {@code values} is missing or empty
             */
            public static List<String> extractTags(String json){
                // Collected tags
                List<String> tags = new ArrayList<String>() ;

                // Parse the text into a JSON object
                JSONObject obj = JSON.parseObject(json) ;

                // parseObject may return null (e.g. for a null input);
                // treat that as "no tags" instead of failing below.
                if(obj == null){
                    return tags ;
                }

                // Fetch the outer array
                JSONArray array = obj.getJSONArray("extInfoList");

                // Only proceed when the array is present and non-empty
                if(array != null && array.size() > 0){
                    JSONObject obj2 = array.getJSONObject(0);
                    // The first element itself may be a JSON null
                    JSONArray arr2 = (obj2 == null) ? null : obj2.getJSONArray("values") ;
                    if(arr2 != null && arr2.size() > 0 ){
                        for(int i = 0 ; i < arr2.size() ; i ++){
                            tags.add(arr2.getString(i));
                        }
                    }
                }

                return tags ;
            }
        }

    2.TaggenDemo.scala
        import javax.swing.text.html.HTML.Tag

        import com.oldboy.scala.util.TagUtil

        import scala.io.Source

        /**
          * Tag statistics: counts how often each review tag occurs per business
          * and prints the most frequent tags of every business.
          */
        object TaggenDemo {

            /** Maximum number of tags reported per business. */
            private val TagsPerBusiness = 5

            /**
              * Reads tab-separated `busid<TAB>json` lines, extracts the review
              * tags of every line via `TagUtil.extractTags`, and prints one
              * `busid==>(tag,count);(tag,count);...` line per business.
              * Businesses are ordered by the count of their most frequent tag,
              * highest first.
              *
              * @param args command-line arguments (unused)
              */
            def main(args: Array[String]): Unit = {
                // Explicit Java -> Scala conversion (`.asScala`) instead of the
                // deprecated implicit JavaConversions.
                import scala.collection.JavaConverters._

                // 1. Load the file. Lines are materialised eagerly so the
                //    handle can be released before any processing starts.
                val file = Source.fromFile("d:/mr/temptags.txt")
                val lines = try file.getLines().toList finally file.close()

                // 2. Flatten every line into (busid, tag) pairs. Lines that do
                //    not contain both a business id and a JSON field are skipped.
                val pairs: List[(String, String)] = lines.flatMap { line =>
                    line.split("\t") match {
                        case Array(busid, json, _*) =>
                            TagUtil.extractTags(json).asScala.toList.map(tag => (busid, tag))
                        case _ => Nil
                    }
                }

                // 3. Count identical pairs: {(busid, tag) -> 300, ...}
                val counts: Map[(String, String), Int] =
                    pairs.groupBy(identity).map { case (key, group) => key -> group.size }

                // 4. Regroup by business and keep each business's most frequent
                //    tags in descending order of count:
                //    {busid -> List((tag, 59), ...), ...}
                val perBusiness: Map[String, List[(String, Int)]] =
                    counts.toList
                        .map { case ((busid, tag), cnt) => (busid, (tag, cnt)) }
                        .groupBy(_._1)
                        .map { case (busid, entries) =>
                            busid -> entries.map(_._2).sortBy { case (_, cnt) => -cnt }.take(TagsPerBusiness)
                        }

                // 5. Order the businesses by their highest tag count, descending.
                //    `head` is safe: groupBy never yields an empty group, so every
                //    business has at least one tag.
                val ranked = perBusiness.toList.sortBy { case (_, tags) => -tags.head._2 }

                ranked.foreach { case (busid, tags) =>
                    println(busid + "==>" + tags.mkString(";"))
                }
            }
        }
原文地址:https://www.cnblogs.com/zyde/p/9004770.html