Spark 机器学习 —— TF-IDF

package Spark_MLlib

import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.sql.SparkSession

/**
  * TF-IDF feature-extraction example built on Spark ML.
  *
  * Steps performed by `main`:
  *  1. `Tokenizer` turns the `sentence` column into a `words` column.
  *  2. `HashingTF` hashes each word list into a 1000-dimensional
  *     term-frequency vector stored in `rawsFeatures`.
  *  3. `IDF` is fitted on those vectors and rescales them into `features`.
  *
  * Every intermediate DataFrame is printed with `show(false)` so the whole
  * cell content is visible.
  */
object 特征抽取 {

  /** Local-mode session used by the example; built when the object is initialised. */
  val spark: SparkSession =
    SparkSession.builder().master("local").appName("TF-IDF").getOrCreate()

  /**
    * Runs the tokenise -> hashing TF -> IDF pipeline on four hard-coded
    * sentences and prints each stage to stdout.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    try {
      val sourceData = spark.createDataFrame(Seq(
        (0, "soyo spark like spark hadoop spark and spark like spark"),
        (1, "i wish i can like java i"),
        (2, "but i dont know how to soyo"),
        (3, "spark is good spark tool")
      )).toDF("label", "sentence")

      // Tokenise: split every sentence into its words.
      val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
      val wordsData = tokenizer.transform(sourceData)
      // truncate = false: print every word instead of an abbreviated cell.
      wordsData.show(false)

      // Term frequencies via feature hashing into 1000 buckets.
      val hashTF = new HashingTF()
        .setInputCol("words")
        .setOutputCol("rawsFeatures")
        .setNumFeatures(1000)
      // Build the raw feature vectors.
      val featuredData = hashTF.transform(wordsData)
      featuredData.show(false)

      // IDF is an estimator: fit it on the corpus first, then rescale the TF vectors.
      val idf = new IDF().setInputCol("rawsFeatures").setOutputCol("features")
      val idfModel = idf.fit(featuredData)
      val result = idfModel.transform(featuredData)
      result.show(false)
      result.select("label", "features").show(false)
    } finally {
      // Release the local Spark context even when a stage above throws;
      // the original never stopped the session.
      spark.stop()
    }
  }
}

运行结果(仅列出最后一步只选取 label 与 features 两列的输出):

+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                  |
+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |(1000,[105,107,181,330,333],[2.5541281188299534,0.5108256237659907,0.9162907318741551,1.0216512475319814,0.9162907318741551])                                             |
|1    |(1000,[329,330,495,833,967],[1.5324768712979722,0.5108256237659907,0.9162907318741551,0.9162907318741551,0.9162907318741551])                                             |
|2    |(1000,[83,107,237,329,388,779,977],[0.9162907318741551,0.5108256237659907,0.9162907318741551,0.5108256237659907,0.9162907318741551,0.9162907318741551,0.9162907318741551])|
|3    |(1000,[105,111,168,281],[1.0216512475319814,0.9162907318741551,0.9162907318741551,0.9162907318741551])                                                                    |
+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


原文地址:https://www.cnblogs.com/soyo/p/7725404.html