前提:已安装 Java、Scala、Hadoop 和 IntelliJ IDEA
下载 Spark:http://spark.apache.org/downloads.html
intellij新建scala项目
配置并引入依赖库(需同时引入 Hadoop 的库),最终效果如下:
本地测试:生成测试文件
/**
* Created by Administrator on 2017/2/10.
*/
package com.soecode.SparkDemo
import java.io.PrintWriter
/**
 * Generates a synthetic "city population" data file for the Spark demos.
 *
 * Every output line has the form `id,name,birth,sex`:
 *  - `id` counts up from 1,
 *  - `name` is 1 to 5 random lowercase letters,
 *  - `birth` is `year-month-day` (not zero-padded),
 *  - `sex` is 1 or 0.
 */
object CreateTestFile {
  /**
   * Writes 20,000,000 random records to `d://renkou.txt` and prints the elapsed time.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val start = System.currentTimeMillis()
    val out = new PrintWriter("d://renkou.txt")
    // try/finally guarantees the file handle is released (and the buffer flushed)
    // even if record generation fails part-way through.
    try {
      for (i <- 1 to 20000000) {
        out.println(s"$i,${getName},${getBirth},${getSex}")
      }
    } finally {
      out.close()
    }
    val end = System.currentTimeMillis()
    print("任务结束,耗时:" + (end - start) + "ms")
  }

  /**
   * Produces a random name.
   *
   * @return a string of 1 to 5 characters drawn uniformly from 'a'..'z'
   */
  def getName: String = {
    val chs = "abcdefghijklmnopqrstuvwxyz"
    val len = (1 + 5 * Math.random()).toInt
    // Scale by the alphabet length: the former factor of 25 only produced
    // indices 0..24, so 'z' could never be generated.
    Seq.fill(len)(chs((chs.length * Math.random()).toInt)).mkString
  }

  /**
   * Produces a random birth date.
   *
   * The year is in 1949..2015, the month in 1..12 and the day in 1..30, each drawn
   * independently. NOTE: the day is not checked against the month, so impossible
   * dates such as `2-30` can appear and day 31 never does.
   *
   * @return the date formatted as `year-month-day` without zero padding
   */
  def getBirth: String = {
    val year = (1949 + 67 * Math.random()).toInt
    val month = (1 + 12 * Math.random()).toInt
    val day = (1 + 30 * Math.random()).toInt
    s"$year-$month-$day"
  }

  /**
   * Produces a random sex flag.
   *
   * @return 1 with probability of about 0.7, otherwise 0
   */
  def getSex: Integer = if (Math.random() > 0.3) 1 else 0
}
统计性别
/**
* Created by Administrator on 2017/2/10.
*/
package com.soecode.SparkDemo
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Analyses the male/female distribution of the generated population file.
 */
object StatBG {
  /**
   * Runs a local Spark job over `d://renkou.txt`, counts the occurrences of each
   * value in the 4th comma-separated field (the sex flag) and prints the elapsed
   * time followed by the resulting map.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Demo").setMaster("local") // Spark configuration
    val sc = new SparkContext(conf) // Spark context
    // Always stop the context so its resources are released, even when the job throws.
    try {
      println("任务开始")
      val start = System.currentTimeMillis()
      val lines = sc.textFile("d://renkou.txt") // build an RDD from the local file
      // Map every line to its sex field (0 = female, 1 = male) and count per distinct value.
      val result = lines.map { line =>
        val fields = line.split(",")
        fields(3)
      }.countByValue()
      val end = System.currentTimeMillis()
      println("任务结束,耗时:" + (end - start) + "ms") /* sample run: 32128ms */
      println(result) /* sample run: Map(0 -> 6000325, 1 -> 13999675) */
    } finally {
      sc.stop()
    }
  }
}
统计星座
/**
* Created by Administrator on 2017/2/10.
*/
package com.soecode.SparkDemo
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Analyses the zodiac-sign (constellation) distribution of the generated population file.
 */
object StatBirth extends Serializable {
  // For month m (1-based), dayArr(m - 1) is the day of that month on which the next sign begins.
  val dayArr = Array[Integer](20, 19, 21, 20, 21, 22, 23, 23, 23, 24, 23, 22)
  // Sign names indexed so that entry m - 1 covers the start of month m and entry m covers its end;
  // the first sign is repeated as the 13th entry so that December can index `month` directly.
  val constellationArr = Array[String]("摩羯座", "水瓶座", "双鱼座", "白羊座", "金牛座", "双子座", "巨蟹座", "狮子座", "处女座", "天秤座", "天蝎座", "射手座", "摩羯座")

  /**
   * Runs a local Spark job over `d://renkou.txt`, converts the birth date in the 3rd
   * comma-separated field of every line to a zodiac sign, counts the records per sign
   * and prints the elapsed time followed by one `sign:count` line per sign.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Demo").setMaster("local")
    val sc = new SparkContext(conf)
    // Always stop the context so its resources are released, even when the job throws.
    try {
      println("任务开始")
      val start = System.currentTimeMillis()
      val lines = sc.textFile("d://renkou.txt")
      // Split each line, parse month and day out of the `year-month-day` field,
      // translate them to a sign name, then count per distinct sign.
      val result = lines.map { line =>
        val fields = line.split(",")
        val birthParts = fields(2).split("-")
        val month = birthParts(1).toInt
        val day = birthParts(2).toInt
        getConstellation(month, day)
      }.countByValue()
      val end = System.currentTimeMillis()
      println("任务结束,耗时:" + (end - start) + "ms")
      for ((sign, count) <- result) println(sign + ":" + count)
      /* Sample run:
      任务结束,耗时:50054ms
      巨蟹座:1719635
      射手座:1610575
      双鱼座:1778776
      白羊座:1613289
      处女座:1665877
      双子座:1723186
      金牛座:1722383
      狮子座:1669192
      天秤座:1721118
      水瓶座:1610483
      天蝎座:1611749
      摩羯座:1553737*/
    } finally {
      sc.stop()
    }
  }

  /**
   * Looks up the zodiac sign for a calendar date.
   *
   * @param month month of the year, 1..12 (values outside that range make the array lookup fail)
   * @param day   day of the month
   * @return the sign name from `constellationArr`: the earlier sign of the month when `day`
   *         is before that month's switch-over day in `dayArr`, otherwise the later one
   */
  def getConstellation(month: Integer, day: Integer): String = {
    if (day < dayArr(month - 1)) constellationArr(month - 1) else constellationArr(month)
  }
}