数据如下
[root@master conf]# cat ~/relations
Zhangsan Wangwu 01:01:01
Zhangsan Zhaoliu 00:11:21
Zhangsan Yuqi 00:19:01
Zhangsan Jingba 00:21:01
Zhangsan Wuxi 01:31:17
Wangwu Zhaoliu 00:51:01
Wangwu Zhaoliu 01:11:19
Wangwu Yuqi 00:00:21
Wangwu Yuqi 00:23:01
Yuqi Zhaoliu 01:18:01
Yuqi Wuxi 00:18:00
Jingba Wangwu 00:01:01
Jingba Wangwu 00:00:06
Jingba Wangwu 00:02:04
Jingba Wangwu 00:02:54
Wangwu Yuqi 01:00:13
Wangwu Yuqi 00:01:01
Wangwu Zhangsan 00:01:01
要求:两个人统计相互通话总时长,并从高到低排列形成result1表,包括通话人和总时长两个字段
解答
字段转换
[root@master conf]# cat test_mapper.py
import sys
for line in sys.stdin:
line = line.strip()
fromstr, tostr, time = line.split('\t')
hours, minutes, secondes = time.split(':')
newtime = int(hours)*60*60 + int(minutes)*60 + int(secondes)
if cmp(fromstr, tostr) == -1:
#print(cmp(fromstr, tostr))
fromstr, tostr = tostr, fromstr
print ' '.join([fromstr + tostr, str(newtime)])
hive脚本
[root@master conf]# cat test.hive
CREATE TABLE relations_new (
fromtostr STRING,
duration INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ' ';
add FILE test_mapper.py;
INSERT OVERWRITE TABLE relations_new
SELECT
TRANSFORM (fromstr, tostr, duration)
USING 'python test_mapper.py'
AS (fromtostr, duration)
FROM relations;
SELECT fromtostr, SUM(duration)
FROM relations_new
GROUP BY fromtostr;
运行hive脚本
[root@master conf]# hive -f test.hive