// Load the score file as an RDD of raw text lines; later statements split each
// line on "," and read fields 0, 1 and 2.
val scorePath = "file:///home/spark/score.txt"
val rdd = sc.textFile(scorePath)
// 1. Count the distinct values of the first field (student names).
// Number of distinct values found in the first comma-separated field.
val count = rdd
  .map { line =>
    val fields = line.split(",")
    fields(0)
  }
  .distinct()
  .count()
// 2. Count the distinct values of the second field (course names).
// Number of distinct values found in the second comma-separated field.
val countCourse = rdd
  .map { line =>
    val fields = line.split(",")
    fields(1)
  }
  .distinct()
  .count()
// 3. Average score of the student "Tom".
// `sum` holds the raw lines whose first field is exactly "Tom" (despite its
// name it is not a numeric sum).
val sum = rdd.filter(line => line.split(",")(0) == "Tom")

// Average of the third field (parsed as Int) per name, collected to the driver
// as Array[(name, average)].
// Each line is split once and mapped to (name, (score, 1)); the pairs are then
// summed component-wise to get (total, count) per name.
// The division is done in Double: the previous Int / Int division silently
// truncated the fractional part of the average.
val avg = sum
  .map { line =>
    val fields = line.split(",")
    (fields(0), (fields(2).toInt, 1))
  }
  .reduceByKey { case ((total1, n1), (total2, n2)) => (total1 + total2, n1 + n2) }
  .mapValues { case (total, n) => total.toDouble / n }
  .collect()
// 4. Number of course records per student.
// Number of lines (course records) per value of the first field.
// `countC` now holds the (name, count) pair RDD itself; previously it was bound
// to the Unit result of `foreach(println)` and carried no data.
val countC = rdd
  .map(row => (row.split(",")(0), 1))
  .reduceByKey(_ + _)

// Bring the (small) per-student result to the driver before printing:
// RDD.foreach(println) runs on the executors, so on a cluster its output does
// not appear on the driver console.
countC.collect().foreach(println)
// 5. Number of records for the "DataBase" course.
// Number of lines whose second comma-separated field is exactly "DataBase".
val countPeople = rdd
  .filter { line =>
    val course = line.split(",")(1)
    course == "DataBase"
  }
  .count()
// Load the mid-semester score table as raw text lines.
val source =
  sc.textFile("file:///home/hadoop/primary_midsemester.txt")

// The first line of the file is treated as the header row.
val headerLine = source.first()

// Data rows: every line that differs from the header text. Note that any later
// line identical to the header is dropped as well, not only the first line.
val remainingLines = source.filter { line =>
  line != headerLine
}
// Split every data row on runs of whitespace exactly once and cache the result.
// The three column RDDs below are each consumed by several actions further
// down (sum, count, max, min); without caching, every one of those actions
// would re-read the file and re-split every line.
val scoreRows = remainingLines.map(_.split("\\s+")).cache()

// Integer values of the whitespace-separated fields at index 2, 3 and 4.
// (Despite the shared "thirdColumn" prefix these are three different columns.)
val thirdColumn = scoreRows.map(fields => fields(2).toInt)
val thirdColumn1 = scoreRows.map(fields => fields(3).toInt)
val thirdColumn2 = scoreRows.map(fields => fields(4).toInt)
// Mean of each score column: integer total divided (as Double) by the row count.
val avg1: Double = {
  val total = thirdColumn.reduce((a, b) => a + b)
  val rows = thirdColumn.count()
  total.toDouble / rows
}
val avg2: Double = {
  val total = thirdColumn1.reduce((a, b) => a + b)
  val rows = thirdColumn1.count()
  total.toDouble / rows
}
val avg3: Double = {
  val total = thirdColumn2.reduce((a, b) => a + b)
  val rows = thirdColumn2.count()
  total.toDouble / rows
}
// Column titles taken from the header row.
// The header is split with the same "\\s+" delimiter used for the data rows so
// that indexes 2, 3 and 4 refer to the same columns in both; splitting on a
// single space would misalign the names if the header uses tabs or repeated
// spaces.
val columns = headerLine.split("\\s+")
val name1 = columns(2)
val name2 = columns(3)
val name3 = columns(4)
// Highest and lowest value of each score column.
// `aggregate` is seeded with Int.MinValue / Int.MaxValue, so an empty column
// yields the seed value rather than throwing.
val maxScore = thirdColumn.aggregate(Int.MinValue)((acc, v) => math.max(acc, v), (l, r) => math.max(l, r))
val minScore = thirdColumn.aggregate(Int.MaxValue)((acc, v) => math.min(acc, v), (l, r) => math.min(l, r))

val maxScore1 = thirdColumn1.aggregate(Int.MinValue)((acc, v) => math.max(acc, v), (l, r) => math.max(l, r))
val minScore1 = thirdColumn1.aggregate(Int.MaxValue)((acc, v) => math.min(acc, v), (l, r) => math.min(l, r))

val maxScore2 = thirdColumn2.aggregate(Int.MinValue)((acc, v) => math.max(acc, v), (l, r) => math.max(l, r))
val minScore2 = thirdColumn2.aggregate(Int.MaxValue)((acc, v) => math.min(acc, v), (l, r) => math.min(l, r))
// Report the highest and lowest score of each of the three columns on one line.
println(s"${name1}的最高分是: $maxScore; 最低分是:$minScore;${name2}的最高分是: $maxScore1; 最低分是:$minScore1;${name3}的最高分是: $maxScore2; 最低分是:$minScore2")
// Report the average score of each column (no trailing newline, as before).
// The label previously read "均匀分" ("uniform score"), which is the wrong word
// for the mean value printed here; it is corrected to "平均分" ("average score").
print(s"${name1}的平均分是$avg1;${name2}的平均分是$avg2;${name3}的平均分是$avg3")