Spark 机器学习

多层感知器(MLP)

 1 from __future__ import print_function
 2 from pyspark.ml.classification import MultilayerPerceptronClassifier
 3 from pyspark.ml.evaluation import MulticlassClassificationEvaluator
 4 from pyspark.sql import SparkSession
 5 
 6 spark = SparkSession
 7     .builder.appName("multilayer_perceptron_classification_example").getOrCreate()
 8 
 9 # 加载数据
10 data = spark.read.format("libsvm")
11     .load("data/mllib/sample_multiclass_classification_data.txt")
12 
13 # 切分训练集和测试集
14 splits = data.randomSplit([0.6, 0.4], 1234)
15 train = splits[0]
16 test = splits[1]
17 
18 # 输入、隐层、隐层、输出个数
19 layers = [4, 5, 4, 3]
20 
21 # 创建多层感知器
22 trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
23 
24 # 训练模型
25 model = trainer.fit(train)
26 
27 # 预测和计算准确度
28 result = model.transform(test)
29 result.show()
30 predictionAndLabels = result.select("prediction", "label")
31 evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
32 print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
33 
34 spark.stop()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|(4,[0,1,2,3],[-0....|       2.0|
|  0.0|(4,[0,1,2,3],[-0....|       0.0|
|  0.0|(4,[0,1,2,3],[-0....|       0.0|
|  0.0|(4,[0,1,2,3],[-0....|       2.0|
|  0.0|(4,[0,1,2,3],[-0....|       2.0|
|  0.0|(4,[0,1,2,3],[-1....|       2.0|
|  0.0|(4,[0,1,2,3],[0.1...|       0.0|
|  0.0|(4,[0,1,2,3],[0.2...|       0.0|
|  0.0|(4,[0,1,2,3],[0.3...|       0.0|
|  0.0|(4,[0,1,2,3],[0.3...|       0.0|
|  0.0|(4,[0,1,2,3],[0.3...|       0.0|
|  0.0|(4,[0,1,2,3],[0.4...|       0.0|
|  0.0|(4,[0,1,2,3],[0.5...|       0.0|
|  0.0|(4,[0,1,2,3],[0.7...|       0.0|
|  0.0|(4,[0,1,2,3],[0.8...|       0.0|
|  0.0|(4,[0,1,2,3],[1.0...|       0.0|
|  0.0|(4,[0,2,3],[0.166...|       0.0|
|  0.0|(4,[0,2,3],[0.388...|       0.0|
|  1.0|(4,[0,1,2,3],[-0....|       1.0|
|  1.0|(4,[0,1,2,3],[-0....|       1.0|
+-----+--------------------+----------+
only showing top 20 rows

Test set accuracy = 0.901960784314