今天原来是情人节,但想到情侣不能约会,就超级开心,也成功地做完了全部的实验项目,收获很大,对新的语言有了一定的了解。
下面是实验的部分代码(Spark shell 会话记录):
scala> val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(result)
labelIndexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_6721796011c5

scala> labelIndexer.labels.foreach(println)
<=50K
>50K

scala> val featureIndexer = new VectorIndexer().setInputCol("pcaFeatures").setOutputCol("indexedFeatures").fit(result)
featureIndexer: org.apache.spark.ml.feature.VectorIndexerModel = vecIdx_7b6672933fc3

scala> println(featureIndexer.numFeatures)
3

scala> val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
labelConverter: org.apache.spark.ml.feature.IndexToString = idxToStr_d0c9321aaaa9

scala> val lr = new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(100)
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_06812b41b118

scala> val lrPipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, lr, labelConverter))
lrPipeline: org.apache.spark.ml.Pipeline = pipeline_b6b87b6e8cd5

scala> val lrPipelineModel = lrPipeline.fit(result)
lrPipelineModel: org.apache.spark.ml.PipelineModel = pipeline_b6b87b6e8cd5

scala> val lrModel = lrPipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]
lrModel: org.apache.spark.ml.classification.LogisticRegressionModel = logreg_06812b41b118

scala> println("Coefficients: " + lrModel.coefficientMatrix+"Intercept: "+lrModel.interceptVector+"numClasses: "+lrModel.numClasses+"numFeatures: "+lrModel.numFeatures)
Coefficients: -1.9828586428133616E-7 -3.5090924715811705E-4 -8.451506276498941E-4 Intercept: [-1.4525982557843347]numClasses: 2numFeatures: 3

scala> val lrPredictions = lrPipelineModel.transform(testdata)
lrPredictions: org.apache.spark.sql.DataFrame = [features: vector, label: string ... 7 more fields]

scala> val evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction")
evaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_38ac5c14fa2a

scala> val lrAccuracy = evaluator.evaluate(lrPredictions)
lrAccuracy: Double = 0.7764235163053484

scala> println("Test Error = " + (1.0 - lrAccuracy))
Test Error = 0.22357648369465155