決策樹進行鳶尾花分類的案例
背景說明:
通過 IDEA + Spark 3.4.1 + sbt 1.9.3 + Spark MLlib 構建鳶尾花決策樹分類預測模型。這是一個分類模型案例,通過該案例,可以快速了解 Spark MLlib 分類預測模型的使用方法。文章來源:http://www.zghlxwxcb.cn/news/detail-634385.html
依賴
// sbt build definition for the SparkLearning project.
ThisBuild / version := "0.1.0-SNAPSHOT"
ThisBuild / scalaVersion := "2.13.11"

// One place to change the version shared by every Spark module listed below.
val sparkVersion = "3.4.1"

lazy val root = (project in file("."))
  .settings(
    name := "SparkLearning",
    // Package prefix matching the `package cn.lh.spark` declaration used by the sources.
    idePackagePrefix := Some("cn.lh.spark"),
    // `%%` appends the Scala binary version to the artifact name; `%` uses the name as written.
    libraryDependencies ++= Seq(
      "org.apache.spark" %% "spark-sql" % sparkVersion,
      "org.apache.spark" %% "spark-core" % sparkVersion,
      "org.apache.hadoop" % "hadoop-auth" % "3.3.6",
      "org.apache.spark" %% "spark-streaming" % sparkVersion,
      "org.apache.spark" %% "spark-streaming-kafka-0-10" % sparkVersion,
      "org.apache.spark" %% "spark-mllib" % sparkVersion,
      "mysql" % "mysql-connector-java" % "8.0.30"
    )
  )
完整代碼
package cn.lh.spark
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, StringIndexerModel, VectorIndexer, VectorIndexerModel}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * Decision-tree classifier example: trains and evaluates a Spark ML pipeline that classifies
 * the iris data set.
 *
 * The `Iris` case class is not declared in this file. Per the original author's note it already
 * exists in this package (next to `MLlibLogisticRegression`); if your package does not contain
 * it, declare it as:
 * {{{
 * case class Iris(features: org.apache.spark.ml.linalg.Vector, label: String)
 * }}}
 */
object MLlibDecisionTreeClassifier {

  /** Input file used when no path is supplied on the command line. */
  private val DefaultIrisPath = "F:\\niit\\2023\\2023_2\\Spark\\codes\\data\\iris.txt"

  /**
   * Program entry point.
   *
   * @param args optional; `args(0)` overrides the path of the comma-separated iris file
   *             (four numeric feature columns followed by the label). When absent, the
   *             original hard-coded path is used.
   */
  def main(args: Array[String]): Unit = {
    val irisPath: String = args.headOption.getOrElse(DefaultIrisPath)

    val spark: SparkSession = SparkSession.builder()
      .master("local[2]")
      .appName("Spark MLlib DecisionTreeClassifier")
      .getOrCreate()

    // Stop the session even when loading, training or evaluation throws.
    try run(spark, irisPath)
    finally spark.stop()
  }

  /**
   * Reads the iris file into an RDD of `Iris` records.
   *
   * Blank lines are skipped so that they do not abort the job with a `NumberFormatException`.
   * Lines with non-numeric features or fewer than five fields still fail, as before.
   */
  private def loadIris(spark: SparkSession, path: String): RDD[Iris] =
    spark.sparkContext.textFile(path)
      .filter(_.trim.nonEmpty)
      .map(_.split(","))
      .map { p =>
        Iris(Vectors.dense(p(0).toDouble, p(1).toDouble, p(2).toDouble, p(3).toDouble), p(4))
      }

  /** Builds the pipeline, trains it on 70% of the data and reports results on the rest. */
  private def run(spark: SparkSession, irisPath: String): Unit = {
    import spark.implicits._

    val data: DataFrame = loadIris(spark, irisPath).toDF()
    data.show()
    data.createOrReplaceTempView("iris")
    val df: DataFrame = spark.sql("select * from iris")

    println("鳶尾花原始數(shù)據(jù)如下:")
    // Column 1 is the label and column 0 the feature vector. Interpolation replaces `Any + String`,
    // which relies on the `any2stringadd` conversion deprecated in Scala 2.13.
    df.map(t => s"${t(1)}:${t(0)}").collect().foreach(println)

    // Index labels and features. Both indexers are fitted on the full data set.
    val labelIndexer: StringIndexerModel = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(df)
    val featureIndexer: VectorIndexerModel = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(4)
      .fit(df)

    // Converts the numeric prediction back into the original string label.
    val labelConverter: IndexToString = new IndexToString()
      .setInputCol("prediction")
      .setOutputCol("predictedLabel")
      .setLabels(labelIndexer.labels)

    // Random 70% / 30% split into training and test sets (unseeded, so it varies per run).
    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))

    val dtClassifier: DecisionTreeClassifier = new DecisionTreeClassifier()
      .setLabelCol("indexedLabel")
      .setFeaturesCol("indexedFeatures")

    // Assemble the stages into a pipeline.
    val pipelinedClassifier: Pipeline = new Pipeline()
      .setStages(Array(labelIndexer, featureIndexer, dtClassifier, labelConverter))

    // Train the decision-tree model.
    val modelClassifier: PipelineModel = pipelinedClassifier.fit(trainingData)

    // Predict on the held-out data.
    val predictionsClassifier: DataFrame = modelClassifier.transform(testData)
    predictionsClassifier.select("predictedLabel", "label", "features").show(5)

    // Evaluate the classifier by accuracy.
    val evaluatorClassifier: MulticlassClassificationEvaluator =
      new MulticlassClassificationEvaluator()
        .setLabelCol("indexedLabel")
        .setPredictionCol("prediction")
        .setMetricName("accuracy")
    val accuracy: Double = evaluatorClassifier.evaluate(predictionsClassifier)
    println(s"Test Error = ${1.0 - accuracy}")

    // Locate the fitted tree by type rather than by position plus an unchecked cast.
    modelClassifier.stages
      .collectFirst { case tree: DecisionTreeClassificationModel => tree }
      .foreach(tree => println(s"Learned classification tree model:\n${tree.toDebugString}"))
  }
}
文章來源地址http://www.zghlxwxcb.cn/news/detail-634385.html
到了這裡,關於【IDEA + Spark 3.4.1 + sbt 1.9.3 + Spark MLlib 構建鳶尾花決策樹分類預測模型】的文章就介紹完了。如果您還想了解更多內容,請在右上角搜索TOY模板網以前的文章或繼續瀏覽下面的相關文章,希望大家以後多多支持TOY模板網!