case class Adult(features: org.apache.spark.ml.linalg.Vector, label: String)
val df = sc.textFile("/export/server/spark-3.0.0-bin-hadoop3.2/adult.data.txt").map(_.split(",")).map(p => Adult(Vectors.dense(p(0).toDouble,p(2).toDouble,p(4).toDouble, p(10).toDouble, p(11).toDouble, p(12).toDouble), p(14).toString())).toDF()
复制代码
(2)读取数据集和测试集,进行主成分分析(PCA)
val test = sc.textFile("/export/server/spark-3.0.0-bin-hadoop3.2/adult.test.txt").map(_.split(",")).map(p => Adult(Vectors.dense(p(0).toDouble,p(2).toDouble,p(4).toDouble, p(10).toDouble, p(11).toDouble, p(12).toDouble), p(14).toString())).toDF()
val pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(3).fit(df)
val result = pca.transform(df)
val testdata = pca.transform(test)
result.show(false)
testdata.show(false)
复制代码
可以看到数据集和测试集导入成功,如下图所示:
数据集
测试集
(3)训练分类模型并猜测住民收入
val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(result)
labelIndexer.labels.foreach(println)
val featureIndexer = new VectorIndexer().setInputCol("pcaFeatures").setOutputCol("indexedFeatures").fit(result)
println(featureIndexer.numFeatures)
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
val lr = new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(100)
val lrPipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, lr, labelConverter))
val lrPipelineModel = lrPipeline.fit(result)
val lrModel = lrPipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]
val lrPredictions = lrPipelineModel.transform(testdata)
val evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction")
val lrAccuracy = evaluator.evaluate(lrPredictions)
println("Test Error = " + (1.0 - lrAccuracy))
复制代码
猜测的错误率如下图所示:
(4)超参数调优
val pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures")
val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(df)
val featureIndexer = new VectorIndexer().setInputCol("pcaFeatures").setOutputCol("indexedFeatures")
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
val lr = new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(100)
val lrPipeline = new Pipeline().setStages(Array(pca, labelIndexer, featureIndexer, lr, labelConverter))
val paramGrid = new ParamGridBuilder().addGrid(pca.k, Array(1,2,3,4,5,6)).addGrid(lr.elasticNetParam, Array(0.2,0.8)).addGrid(lr.regParam, Array(0.01, 0.1, 0.5)).build()
val cv = new CrossValidator().setEstimator(lrPipeline).setEvaluator(new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction")).setEstimatorParamMaps(paramGrid).setNumFolds(3)
val cvModel = cv.fit(df)
val lrPredictions=cvModel.transform(test)
val evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction")
val lrAccuracy = evaluator.evaluate(lrPredictions)
println("准确率为"+lrAccuracy)
val bestModel= cvModel.bestModel.asInstanceOf[PipelineModel]
val lrModel = bestModel.stages(3).asInstanceOf[LogisticRegressionModel]