因为一直在搞算法开发,所以还是对算法比较感兴趣,在学习Spark的过程中,也顺带练习一下自己没用过的小算法。没有Java经验,没有Hadoop经验,没有SQL经验,对一些运行机制的理解还不深;系统学习太枯燥,就通过例程慢慢感悟吧!这次的参考书目是《Spark MLlib机器学习实践》,这本书也很好,没有太深奥的理论和繁琐的解释,有很多实用的小程序,初学者上手很棒!
之前做相似度计算时用得比较多的是欧氏距离;余弦相似性也是相似度的一种度量,更适合稀疏数据,经常应用于协同过滤算法中。具体公式很简单:$\cos(A, B) = \dfrac{A \cdot B}{\lVert A \rVert \, \lVert B \rVert}$,即两个评分向量的点积除以各自模长的乘积,更多细节随便查一查就好。代码如下:
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import scala.collection.mutable.Map
/**
 * Toy collaborative-filtering demo: scores how similar one user is to every
 * other user by the cosine similarity of their film ratings.
 *
 * Ratings are kept in the mutable map `source`, which stays empty until
 * [[getSource]] has been called.
 */
object collaborativeFilter {
  // Local master: the whole job runs inside this JVM.
  val conf: SparkConf = new SparkConf().setAppName("collaborativeFilter").setMaster("local")
  val sc: SparkContext = new SparkContext(conf)

  // User names.
  val users = sc.parallelize(Array("Rami Said Malek", "Olivia Colman", "Mahershala Ali", "Regina King", "Brian"))
  // Film titles.
  val films = sc.parallelize(Array("Green Book","Roma", "Bohemian Rhapsody", "The Favourite", "BlacKkKlansman"))

  // Outer key is the user name; the nested map goes from film title to that user's score.
  val source = Map[String, Map[String, Int]]()
  // Film title -> score storage (not used by the code in this object).
  val filmSource = Map[String, Int]()

  /**
   * Fills `source` with the hard-coded sample ratings and returns it.
   *
   * Calling it again simply overwrites the same five entries.
   *
   * @return the populated `source` map (the same instance as the field)
   */
  def getSource : Map[String, Map[String, Int]] = {
    source ++= List(
      "Rami Said Malek" -> Map("Green Book" -> 2, "Roma" -> 3, "Bohemian Rhapsody" -> 1, "The Favourite" -> 0, "BlacKkKlansman" -> 1),
      "Olivia Colman" -> Map("Green Book" -> 1, "Roma" -> 2, "Bohemian Rhapsody" -> 2, "The Favourite" -> 1, "BlacKkKlansman" -> 4),
      "Mahershala Ali" -> Map("Green Book" -> 2, "Roma" -> 1, "Bohemian Rhapsody" -> 0, "The Favourite" -> 1, "BlacKkKlansman" -> 4),
      "Regina King" -> Map("Green Book" -> 3, "Roma" -> 2, "Bohemian Rhapsody" -> 0, "The Favourite" -> 5, "BlacKkKlansman" -> 3),
      "Brian" -> Map("Green Book" -> 5, "Roma" -> 3, "Bohemian Rhapsody" -> 1, "The Favourite" -> 1, "BlacKkKlansman" -> 2)
    )
    source
  }

  /** Looks up one user's ratings, failing with a descriptive message when the user is unknown. */
  private def ratingsOf(user: String): Map[String, Int] =
    source.getOrElse(
      user,
      throw new NoSuchElementException(s"no ratings stored for user '$user' (was getSource called?)")
    )

  /** Euclidean length of a rating vector. */
  private def norm(ratings: Map[String, Int]): Double =
    math.sqrt(ratings.valuesIterator.map(score => score.toDouble * score).sum)

  /**
   * Cosine similarity between the rating vectors of two users.
   *
   * Scores are paired by film title, not by map iteration order, so the
   * result does not depend on how the hash maps happen to lay out their
   * entries. A film rated by only one of the two users contributes 0 to the
   * dot product.
   *
   * @param user1 name of the first user; must be a key of `source`
   * @param user2 name of the second user; must be a key of `source`
   * @return the cosine similarity, or 0.0 when either user's rating vector
   *         has zero length (the cosine is undefined in that case)
   * @throws NoSuchElementException if either user has no entry in `source`
   */
  def getCollaborateSource(user1: String, user2: String): Double = {
    val ratings1 = ratingsOf(user1)
    val ratings2 = ratingsOf(user2)

    // Numerator of the cosine formula: dot product over films, matched by title.
    val dot: Int = ratings1.foldLeft(0) { case (acc, (film, score)) =>
      acc + score * ratings2.getOrElse(film, 0)
    }

    // Denominator of the cosine formula: product of the two vector lengths.
    val denominator = norm(ratings1) * norm(ratings2)

    // Guard against 0/0 (NaN) for users whose ratings are all zero or missing.
    if (denominator == 0.0) 0.0 else dot.toDouble / denominator
  }

  /**
   * Loads the sample ratings and prints the similarity of "Brian" to every
   * user, then stops the Spark context.
   */
  def main(args: Array[String]): Unit = {
    try {
      getSource
      val name = "Brian"
      // Bring the names back to the driver before printing: the ratings in
      // `source` are filled in by getSource in this JVM only, and a println
      // inside RDD.foreach would run (and print) on the executors instead.
      users.collect().foreach { user =>
        println(s"${name}相对于${user}的相似性分数是:${getCollaborateSource(name, user)}")
      }
    } finally {
      // Release the context even if the computation above fails.
      sc.stop()
    }
  }
}
输出结果:
Brian相对于Rami Said Malek的相似性分数是:0.8981462390204985
Brian相对于Olivia Colman的相似性分数是:0.6821910402406466
Brian相对于Mahershala Ali的相似性分数是:0.7416198487095662
Brian相对于Regina King的相似性分数是:0.738024966423108
Brian相对于Brian的相似性分数是:0.9999999999999998