The ml API is built on DataFrames, whereas the older mllib API is built on RDDs.
Spark's BinaryClassificationEvaluator supports two metrics: Area Under the Receiver Operating Characteristic curve (AUROC) and Area Under the Precision-Recall Curve (AUPRC). The Scala example below trains a random forest on the sample LIBSVM data set and evaluates it with both metrics.
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
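// Load the sample data set in LIBSVM format as a DataFrame.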
val data = sqlContext.read.format("libsvm").load("D:/Sources/spark/data/mllib/sample_libsvm_data.txt")
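// Index the string labels as a numeric column, fitting on the whole data set so every label is seen.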
val labelIndexer = new StringIndexer()
  .setInputCol("label")
  .setOutputCol("indexedLabel")
  .fit(data)
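// Automatically identify categorical features and index them; features with more than 4 distinct values are treated as continuous.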
val featureIndexer = new VectorIndexer()
  .setInputCol("features")
  .setOutputCol("indexedFeatures")
  .setMaxCategories(4)
  .fit(data)
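// Split the data into a training set (70%) and a test set (30%).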
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
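// A random forest classifier with 10 trees, trained on the indexed label and feature columns.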
val rf = new RandomForestClassifier()
  .setLabelCol("indexedLabel")
  .setFeaturesCol("indexedFeatures")
  .setNumTrees(10)
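// Convert the indexed predictions back to the original label strings.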
val labelConverter = new IndexToString()
  .setInputCol("prediction")
  .setOutputCol("predictedLabel")
  .setLabels(labelIndexer.labels)
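// Chain the indexers, the random forest, and the label converter into a single Pipeline.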
val pipeline = new Pipeline()
  .setStages(Array(labelIndexer, featureIndexer, rf, labelConverter))
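// Fit the pipeline on the training data, then score the held-out test data.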
val model = pipeline.fit(trainingData)
val predictions = model.transform(testData)
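// Show the indexed label, the per-class raw scores, and the predicted index for the first rows.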
predictions
  .select("indexedLabel", "rawPrediction", "prediction")
  .show()
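// The evaluator compares the raw prediction column against the indexed label column.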
val binaryClassificationEvaluator = new BinaryClassificationEvaluator()
  .setLabelCol("indexedLabel")
  .setRawPredictionCol("rawPrediction")
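// Helper that sets the requested metric on the evaluator and prints its value for the test predictions.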
def printlnMetric(metricName: String): Unit = {
  println(metricName + " = " + binaryClassificationEvaluator.setMetricName(metricName).evaluate(predictions))
}
printlnMetric("areaUnderROC")
printlnMetric("areaUnderPR")