垃圾邮件分类(Scala 版本)

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by DengNi on 2016/9/21.
  * 邮件分类 scala
  */
object spam_normal {

  /**
   * Spam classification example (Scala, Spark MLlib).
   *
   * Reads two local text files (one email per line): `spam.txt` with spam
   * examples and `normal.txt` with normal (ham) examples, hashes each email
   * into a 10,000-feature term-frequency vector, trains a logistic-regression
   * model with SGD, and prints predictions for two hand-written test emails.
   */
  def main(args: Array[String]): Unit = {

    // Keep the console readable: only surface Spark errors.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)

    val conf = new SparkConf().setAppName("scala").setMaster("local[*]")
    val sc = new SparkContext(conf)

    val spam = sc.textFile("spam.txt")
    // Fixed filename typo: was "noraml.txt"; the ham examples live in normal.txt.
    val norm = sc.textFile("normal.txt")

    // HashingTF maps each email's words to a 10,000-dimensional TF vector.
    val tf = new HashingTF(numFeatures = 10000)
    // Each email is split on spaces; every word becomes a hashed feature.
    val spamFeatures = spam.map(email => tf.transform(email.split(" ")))
    val normFeatures = norm.map(email => tf.transform(email.split(" ")))

    // Build LabeledPoint datasets: spam is labelled 1.0, normal is 0.0.
    val positiveExamples = spamFeatures.map(features => LabeledPoint(1, features))
    val negativeExamples = normFeatures.map(features => LabeledPoint(0, features))

    val trainingData = positiveExamples.union(negativeExamples)
    // Logistic regression is iterative, so cache the training set.
    trainingData.cache()

    // Train logistic regression with SGD.
    val model = new LogisticRegressionWithSGD().run(trainingData)

    // Sanity-check the model on one spam-like and one normal-looking email.
    val posTest = tf.transform("fuck you love sex cheap by sending money fund".split(" "))
    val negTest = tf.transform("Hi hwo do you good to see you want to spark".split(" "))

    println(model.predict(posTest)) // expected 1.0 (spam)
    println(model.predict(negTest)) // expected 0.0 (normal)

    // Fix: shut the SparkContext down cleanly instead of leaking it.
    sc.stop()
  }
}


"C:\Program Files\Java\jdk1.7.0_80\bin\java" -Didea.launcher.port=7533 "-Didea.launcher.bin.path=C:\Program Files (x86)\JetBrains\IntelliJ IDEA Community Edition 2016.1.3\bin" -Dfile.encoding=UTF-8 -classpath "C:\Program Files\Java\jdk1.7.0_80\jre\lib\charsets.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\deploy.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\access-bridge-64.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\dnsns.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\jaccess.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\localedata.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\sunec.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\sunjce_provider.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\sunmscapi.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\zipfs.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\javaws.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\jce.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\jfr.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\jfxrt.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\jsse.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\management-agent.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\plugin.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\resources.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\rt.jar;D:\bigdata\workspaces\recommder\out\production\recommder;F:\scala\lib\scala-actors-migration.jar;F:\scala\lib\scala-actors.jar;F:\scala\lib\scala-library.jar;F:\scala\lib\scala-reflect.jar;F:\scala\lib\scala-swing.jar;D:\bigdata\workspaces\recommder\lib\spark-assembly-1.6.0-hadoop2.6.0.jar;C:\Program Files (x86)\JetBrains\IntelliJ IDEA Community Edition 2016.1.3\lib\idea_rt.jar" com.intellij.rt.execution.application.AppMain spam_normal
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
16/09/21 22:16:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
16/09/21 22:16:15 INFO Slf4jLogger: Slf4jLogger started
16/09/21 22:16:15 INFO Remoting: Starting remoting
16/09/21 22:16:16 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriverActorSystem@192.168.184.1:3070]
16/09/21 22:16:18 WARN : Your hostname, root resolves to a loopback/non-reachable address: fe80:0:0:0:0:5efe:c0a8:8c01%17, but we couldn't find any external IP address!
16/09/21 22:16:19 INFO FileInputFormat: Total input paths to process : 1
16/09/21 22:16:19 INFO FileInputFormat: Total input paths to process : 1
16/09/21 22:16:20 INFO deprecation: mapred.tip.id is deprecated. Instead, use mapreduce.task.id
16/09/21 22:16:20 INFO deprecation: mapred.task.id is deprecated. Instead, use mapreduce.task.attempt.id
16/09/21 22:16:20 INFO deprecation: mapred.task.is.map is deprecated. Instead, use mapreduce.task.ismap
16/09/21 22:16:20 INFO deprecation: mapred.task.partition is deprecated. Instead, use mapreduce.task.partition
16/09/21 22:16:20 INFO deprecation: mapred.job.id is deprecated. Instead, use mapreduce.job.id
16/09/21 22:16:20 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
16/09/21 22:16:20 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
1.0
0.0

16/09/21 22:16:24 INFO RemoteActorRefProvider$RemotingTerminator: Shutting down remote daemon.

Process finished with exit code 0

原文地址:https://www.cnblogs.com/TendToBigData/p/10501370.html