1. Requirements
(1) The log fields are described below:
| Log field | Description |
| --- | --- |
| remote_addr | Client IP address (if there is a proxy in between, this is the proxy server's IP) |
| remote_user | User name of the remote client (usually "-") |
| time_local | Access time and time zone |
| request | Request URL and request method |
| status | Response status code |
| body_bytes_sent | Size of the response body sent to the client |
| request_body | POST request body data |
| http_referer | The link the user came from |
| http_user_agent | The client's user agent (usually a browser) |
| http_x_forwarded_for | Client IP as recorded through proxy servers |
| host | Server host name |
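For orientation, a line in this style of log (space-separated, with most fields double-quoted) might look like the following. This sample is purely illustrative, not a row from the actual dataset; its field layout merely mirrors the regular expression used in section 4 below.
"27.38.5.159" "-" "31/Aug/2015:00:04:53 +0800" "GET /course/view.php?id=27 HTTP/1.1" "200" "7877" - "http://learn.example.com/user.php" "Mozilla/5.0 (Windows NT 6.1)" - "learn.example.com"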
(2) Requirement 1: IP address
Determine the user's region from the IP address, for targeted marketing.
(3) Requirement 2: access time
Analyze the time periods during which users visit the site.
For the sales team, this helps schedule shifts sensibly and time course promotions.
2. Requirements Analysis
(1) Prepare a raw source table.
(2) Create a separate child table for each business need, using:
* ORC/Parquet storage format
* Snappy data compression
* Snappy compression for map output
* External tables
* Partitioned tables (not demonstrated; see the sketch after this list)
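For reference, a partitioned variant of a log table might look like the minimal sketch below. The table name, column set, and daily partition column are assumptions for illustration only; this variant is not used in the rest of the walkthrough.
-- Hypothetical sketch: an external, partitioned log table (not part of this walkthrough)
create external table if not exists default.victor_log_part (
remote_addr string,
time_local string,
request string,
http_referer string
)
partitioned by (day string)
row format delimited fields terminated by '\t';
-- Loading would then target a specific partition, e.g.:
-- load data local inpath '/opt/module/datas/victor.log'
--     into table default.victor_log_part partition (day='20150831');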
3. Input Data
https://github.com/liufengji/hive_data.git
4. Processing the Raw Table
1) Create the table
create table IF NOT EXISTS default.victor_log (
remote_addr string,
remote_user string,
time_local string,
request string,
status string,
body_bytes_sent string,
request_body string,
http_referer string,
http_user_agent string,
http_x_forwarded_for string,
host string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
stored as textfile ;
2) Load data into the table
load data local inpath '/opt/module/datas/victor.log' into table default.victor_log ;
3) Check how many rows were loaded
select count(*) from victor_log ;
4) Preview a few rows of data
select * from victor_log limit 5 ;
5) Consult the official RegexSerDe example
Several log fields (the timestamp, the request line, the user agent) themselves contain spaces, so a plain space delimiter splits rows incorrectly; a regex-based SerDe is needed instead. See:
https://cwiki.apache.org/confluence/display/Hive/GettingStarted
https://issues.apache.org/jira/browse/HIVE-662
CREATE TABLE serde_regex(
host STRING,
identity STRING,
user STRING,
time STRING,
request STRING,
status STRING,
size STRING,
referer STRING,
agent STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
)
STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH "../data/files/apache.access.log"
INTO TABLE serde_regex;
LOAD DATA LOCAL INPATH "../data/files/apache.access.2.log"
INTO TABLE serde_regex;
SELECT * FROM serde_regex ORDER BY time;
Validate that the regular expression is written correctly (the match is correct when every field is highlighted yellow):
http://wpjam.qiniudn.com/tool/regexpal/
6) Recreate the table using the regular expression
drop table if exists default.victor_log ;
create table IF NOT EXISTS default.victor_log (
remote_addr string,
remote_user string,
time_local string,
request string,
status string,
body_bytes_sent string,
request_body string,
http_referer string,
http_user_agent string,
http_x_forwarded_for string,
host string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "(\"[^ ]*\") (\"-|[^ ]*\") (\"[^\]]*\") (\"[^\"]*\") (\"[0-9]*\") (\"[0-9]*\") (-|[^ ]*) (\"[^ ]*\") (\"[^\"]*\") (-|[^ ]*) (\"[^ ]*\")"
)
STORED AS TEXTFILE;
7) Load data into the table
load data local inpath '/opt/module/datas/victor.log'
into table default.victor_log ;
8) Preview a few rows of data
select * from victor_log limit 5 ;
5. Create Different Child Tables for Different Business Needs
1) Create a table for this business requirement (ORC storage, Snappy compression)
drop table if exists default.victor_comm ;
create table IF NOT EXISTS default.victor_comm (
remote_addr string,
time_local string,
request string,
http_referer string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc tblproperties ("orc.compress"="SNAPPY");
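The requirements also call for Snappy-compressed map output. These standard Hive/Hadoop session settings can be applied before running the INSERT in the next step:
-- Compress intermediate (map) output with Snappy
SET hive.exec.compress.intermediate=true;
SET mapreduce.map.output.compress=true;
SET mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;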
2) Populate the table from the raw table
insert into table default.victor_comm select remote_addr, time_local,
request,http_referer from default.victor_log ;
3) Check that the data looks correct
select * from victor_comm limit 5 ;
6. Data Cleaning: a UDF to Strip Double Quotes
1) Write the UDF
package com.victor.hive;

import org.apache.hadoop.hive.ql.exec.UDF;

public class RemoveQuotes extends UDF {
    public String evaluate(final String s) {
        if (s == null) {
            return null;
        }
        // Strip all double quotes from the field value
        return s.replace("\"", "");
    }
}
2) Export the jar and copy it to the server
3) Add the UDF jar in Hive:
add jar /opt/module/jars/removequotes.jar ;
4) Register the custom UDF:
create temporary function my_removequotes as "com.victor.hive.RemoveQuotes" ;
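A quick sanity check of the function (the quoted literal here is just an illustrative input):
-- Expected output: hello
select my_removequotes('"hello"') ;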
5) Call the UDF to strip the double quotes from the table fields
insert overwrite table default.victor_comm select my_removequotes(remote_addr),
my_removequotes(time_local), my_removequotes(request), my_removequotes(http_referer)
from default.victor_log ;
6) Query a few rows to verify
select * from victor_comm limit 5 ;
7. Data Cleaning: a UDF to Convert the Date/Time
1) Write the date-conversion UDF
package com.victor.hive;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

import org.apache.hadoop.hive.ql.exec.UDF;

public class DateTransform extends UDF {

    // Input format, e.g. "31/Aug/2015:00:04:37"; parse() ignores any trailing
    // text such as a time-zone offset
    private final SimpleDateFormat inputFormat =
            new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
    // Output format, e.g. "20150831000437"
    private final SimpleDateFormat outputFormat = new SimpleDateFormat("yyyyMMddHHmmss");

    public String evaluate(final String inputDate) {
        if (inputDate == null) {
            return null;
        }
        Date parseDate;
        String outputDate = null;
        try {
            // Parse the input date string
            parseDate = inputFormat.parse(inputDate);
            // Re-format into the compact output form
            outputDate = outputFormat.format(parseDate);
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return outputDate;
    }
}
2) Export the jar and copy it to the server
3) Add the UDF jar in Hive:
add jar /opt/module/jars/DateTransform.jar ;
4) Register the custom UDF:
create temporary function my_datetransform as "com.victor.hive.DateTransform" ;
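A quick sanity check, assuming the input looks like the time_local field after quote removal (illustrative value):
-- Expected output: 20150831000437
select my_datetransform('31/Aug/2015:00:04:37 +0800') ;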
5) Call the UDFs to convert the date/time
insert overwrite table default.victor_comm select my_removequotes(remote_addr),
my_datetransform(my_removequotes(time_local)), my_removequotes(request),
my_removequotes(http_referer) from default.victor_log ;
6) Query a few rows to verify
select * from victor_comm limit 5 ;
8. Business Analysis
1) Analyze the hours when users watch videos (after the date transform, time_local is in yyyyMMddHHmmss form, so substring(time_local,9,2) extracts the hour)
select t.hour, count(*) cnt from
(select substring(time_local,9,2) hour from victor_comm ) t
group by t.hour order by cnt desc ;
2) Analyze IP addresses to determine the region
select t.prex_ip, count(*) cnt from
(
select substring(remote_addr,1,7) prex_ip from victor_comm
) t
group by t.prex_ip order by cnt desc limit 20 ;
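The seven-character IP prefix is only a rough proxy for region. In practice you would join against an IP-to-region lookup table; a minimal sketch follows, where ip_region(prex_ip, region) is a hypothetical table that is not part of this dataset:
-- Hypothetical: ip_region(prex_ip string, region string) is assumed to exist
select r.region, count(*) cnt from
(select substring(remote_addr,1,7) prex_ip from victor_comm) t
join ip_region r on t.prex_ip = r.prex_ip
group by r.region order by cnt desc ;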