Spark MLlib 基础内容

Spark MLlib 基础内容

　　　　（一），多种数据类型

　　　　　　　　1.1 本地向量集　　　　　　　　

/**
 * Builds one dense and one sparse MLlib local vector and prints a few of
 * their elements to show how indexing works for each representation.
 */
def testVetor: Unit = {
  // Dense vector: every element is stored; indices start at 0.
  val denseVec: Vector = Vectors.dense(2.0, 3.0, 6.0)
  println(denseVec(2)) // prints 6.0

  // Sparse vector of size 10: only the listed indices carry a value,
  // every other position reads as 0.0.
  val positions = Array(1, 3, 5, 8)
  val entries = Array(1.0, 2.0, 3.0, 4.0)
  val sparseVec: Vector = Vectors.sparse(10, positions, entries)
  println(sparseVec(8)) // prints 4.0, the value paired with index 8
  println(sparseVec(4)) // prints 0.0, index 4 is not stored
}

View Code

　　　　　　　　1.2 带标签的向量（LabeledPoint）的使用

/**
 * Wraps a dense and a sparse vector in LabeledPoint instances and prints the
 * label and the feature vector of each.
 */
def testLablePoint: Unit = {
  val denseFeatures: Vector = Vectors.dense(2.0, 3.0, 6.0)
  val densePoint = LabeledPoint(1.0, denseFeatures)
  println(densePoint.label)    // prints 1.0
  println(densePoint.features) // prints [2.0,3.0,6.0]

  // Sparse features: values are assigned at the listed indices only,
  // all remaining positions default to 0.
  val sparseFeatures: Vector =
    Vectors.sparse(10, Array(1, 3, 5, 8), Array(1.0, 2.0, 3.0, 4.0))
  val sparsePoint = LabeledPoint(2.0, sparseFeatures)
  println(sparsePoint.label)    // prints 2.0
  println(sparsePoint.features) // prints (10,[1,3,5,8],[1.0,2.0,3.0,4.0])
}

View Code

　　　　　　　　　　LibSVM 格式文件的加载　　　

// Input text format, one record per line: label followed by index:value pairs.
// Sample file contents (feature indices start at 1 in the file):
//   7 1:1 2:1 3:1 4:9 5:2 6:1 7:2 8:0 9:0 10:1 11:3
//   8 1:4 2:4 3:0 4:3 5:4 6:2 7:1 8:3 9:0 10:0 11:0
val svmFile = MLUtils.loadLibSVMFile(sc, "svmFile")
// Every record is printed as a label plus a sparse feature vector.
svmFile.foreach(record => println(record))
// Output for the sample file (indices are shown starting at 0 after loading):
//   (7.0,(11,[0,1,2,3,4,5,6,7,8,9,10],[1.0,1.0,1.0,9.0,2.0,1.0,2.0,0.0,0.0,1.0,3.0]))
//   (8.0,(11,[0,1,2,3,4,5,6,7,8,9,10],[4.0,4.0,0.0,3.0,4.0,2.0,1.0,3.0,0.0,0.0,0.0]))

View Code

　　　　　　　　1.3 矩阵的使用

　　　　　　　　　　本地矩阵

// Turn a flat array of six values into a local matrix with 2 rows and 3 columns.
// As the printed result shows, the values fill the matrix column by column.
val mx = Matrices.dense(2, 3, Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0))
println(mx)
// Result:
//   1.0  3.0  5.0
//   2.0  4.0  6.0

View Code

　　　　　　　　1.4 分布式矩阵

　　　　　　　　　　1.4.1 行矩阵

// Contents of the input file "test" (tab-separated):
//   1.0    3.0    5.0
//   2.0    4.0    6.0
// Each line is split on tabs, parsed into doubles and wrapped in a dense
// vector; the resulting RDD of vectors backs the RowMatrix.
val rdd = sc.textFile("test")
  .map(_.split("\t").map(_.toDouble))
  .map(line => Vectors.dense(line))
val row = new RowMatrix(rdd)
println(row.numRows()) // 2
println(row.numCols()) // 3

View Code

　　　　　　　　　　1.4.2 带索引的行矩阵

// Build an IndexedRowMatrix from the tab-separated file "test".
// Every row needs its own index: zipWithIndex pairs each vector with a
// distinct Long position, which becomes the IndexedRow index. Using the
// vector length as the index would give every row the same index.
val rdd = sc.textFile("test")
  .map(_.split("\t").map(_.toDouble))
  .map(line => Vectors.dense(line))
  .zipWithIndex()
  .map { case (vec, idx) => IndexedRow(idx, vec) }
val indexRow = new IndexedRowMatrix(rdd)
indexRow.rows.foreach(println(_))
// Expected result for the two-line sample file:
//   IndexedRow(0,[1.0,3.0,5.0])
//   IndexedRow(1,[2.0,4.0,6.0])

　　　　　　　　　　1.4.3 坐标矩阵

// Each line of the tab-separated file "test" is read as one matrix entry:
// first field = row index, second field = column index, third field = value.
val rdd = sc.textFile("test")
  .map(_.split("\t").map(_.toDouble))
  .map { fields =>
    new MatrixEntry(fields(0).toLong, fields(1).toLong, fields(2))
  }
val comRow = new CoordinateMatrix(rdd)
comRow.entries.foreach(entry => println(entry))
// Result:
//   MatrixEntry(1,3,5.0)
//   MatrixEntry(2,4,6.0)

　　　　（二），数理统计概念

　　　　　　　　　皮尔逊相关系数：https://segmentfault.com/q/1010000000094674

// Sample data: the values 1..6, each wrapped in a one-element dense vector.
val Data_test = sc.parallelize(Seq(1, 2, 3, 4, 5, 6))
  .map(n => Vectors.dense(n.toDouble))
// The same values as labeled points: each value is both the label and the
// single feature of its point.
val Data_test2 = sc.parallelize(Seq(1, 2, 3, 4, 5, 6))
  .map { n =>
    val x = n.toDouble
    LabeledPoint(x, Vectors.dense(x))
  }
val stat = Statistics.colStats(Data_test)
println(stat.normL1)   // L1 norm of the column (Manhattan norm)
println(stat.normL2)   // L2 norm of the column (Euclidean norm)
println(stat.variance) // variance of the column (this is not the mean)
val correlation = Statistics.corr(Data_test) // Pearson correlation matrix
println(correlation)
val vd = Statistics.chiSqTest(Data_test2) // chi-squared test per feature
vd.foreach(result => println(result))
// Results:
//   [21.0]
//   [9.539392014169456]
//   [3.5]
//   1.0
//   Chi squared test summary:
//   method: pearson
//   degrees of freedom = 25
//   statistic = 30.000000000000014
//   pValue = 0.22428900483440284
//   No presumption against null hypothesis: the occurrence of the outcomes is statistically independent..