350. 连续登陆大于等于7天的用户,及连续登陆的开始,结束时间
create table my_data.log_user_login(
user_name varchar(10),
time_login DATETIME
);
insert into log_user_login value('aaa','2022-06-01 11:30:45');
insert into log_user_login value('aaa','2022-06-01 12:30:45');
insert into log_user_login value('aaa','2022-06-02 11:30:45');
insert into log_user_login value('aaa','2022-06-03 11:30:45');
insert into log_user_login value('aaa','2022-06-06 11:30:45');
insert into log_user_login value('aaa','2022-06-07 11:30:45');
insert into log_user_login value('aaa','2022-06-07 17:30:45');
insert into log_user_login value('aaa','2022-06-08 11:30:45');
insert into log_user_login value('aaa','2022-06-08 19:30:45');
insert into log_user_login value('aaa','2022-06-09 11:30:45');
insert into log_user_login value('aaa','2022-06-10 11:30:45');
insert into log_user_login value('aaa','2022-06-11 11:30:45');
insert into log_user_login value('aaa','2022-06-12 11:30:45');
insert into log_user_login value('bbb','2022-06-02 11:30:45');
insert into log_user_login value('bbb','2022-06-03 11:30:45');
insert into log_user_login value('bbb','2022-06-04 11:30:45');
insert into log_user_login value('bbb','2022-06-05 11:30:45');
insert into log_user_login value('bbb','2022-06-06 11:30:45');
insert into log_user_login value('bbb','2022-06-07 11:30:45');
insert into log_user_login value('bbb','2022-06-08 11:30:45');
insert into log_user_login value('bbb','2022-06-09 11:30:45');
insert into log_user_login value('bbb','2022-06-29 11:30:45');
非常经典的题目,有重复数据先去重,然后row_number()开窗解决
select user_name,min(time_login) as start_time_login ,max(time_login) as end_time_login ,count(1) as login_day_cnt
from (
select user_name, time_login, date_sub(time_login, INTERVAL rn DAY) as tag
from (
select user_name,
time_login,
row_number() over (partition by user_name order by time_login) as rn
from (
select user_name, cast(time_login as date) as time_login
from log_user_login
group by user_name, cast(time_login as date)
) t1
) t2
) t3
group by user_name,tag
having count(1) >= 7 ;
351. 连续点击大于等于三次的用户数,中间不能有别人的点击
create table my_data.log_user_click_tf(
user_name varchar(10),
time_login DATETIME
);
insert into log_user_click_tf value('aaa','2022-06-01 11:30:45');
insert into log_user_click_tf value('aaa','2022-06-01 12:30:45');
insert into log_user_click_tf value('bbb','2022-06-01 13:30:45');
insert into log_user_click_tf value('aaa','2022-06-01 14:30:45');
insert into log_user_click_tf value('aaa','2022-06-01 15:30:45');
insert into log_user_click_tf value('ccc','2022-06-01 16:30:45');
insert into log_user_click_tf value('ccc','2022-06-01 18:30:45');
insert into log_user_click_tf value('ccc','2022-06-01 19:30:45');
insert into log_user_click_tf value('ccc','2022-06-01 20:30:45');
insert into log_user_click_tf value('ddd','2022-06-01 21:30:45');
insert into log_user_click_tf value('ddd','2022-06-01 22:30:45');
insert into log_user_click_tf value('ddd','2022-06-01 23:30:45');
偏移量函数的使用
select count(distinct user_name) as dis_cnt_user
from (
select user_name,
lag(user_name, 1) over (order by time_login) as before_1_user_name,
lag(user_name, 2) over (order by time_login) as before_2_user_name
from log_user_click_tf
) t1 where user_name = before_1_user_name and user_name = before_2_user_name
分组的使用
select
count(distinct user_name) as dis_cnt_user
from (
select user_name
from (
select user_name,
tag - group_tag as rn
from (
select user_name,
row_number() over (partition by user_name order by time_login) as group_tag,
row_number() over (order by time_login) as tag
from log_user_click_tf
) t1
) t2
group by user_name, rn
having count(1) >= 3
) t3
352. 计算除去部门最高工资和最低工资的平均工资
create table deptno_salary_hr(
user_name varchar(10),
deptno int,
salart double
);
insert into deptno_salary_hr value('a',1,10000);
insert into deptno_salary_hr value('b',1,20000);
insert into deptno_salary_hr value('c',1,30000);
insert into deptno_salary_hr value('d',1,40000);
insert into deptno_salary_hr value('e',1,50000);
insert into deptno_salary_hr value('f',1,60000);
insert into deptno_salary_hr value('a1',2,10000);
insert into deptno_salary_hr value('b1',2,10000);
insert into deptno_salary_hr value('c1',2,30000);
insert into deptno_salary_hr value('d1',2,40000);
正排和倒排的使用,求中位数也能用
select sum(salart) * 1.0 / count(1) as avg_salary
from (
select deptno,
salart,
dense_rank() over (partition by deptno order by salart) as asc_rn,
dense_rank() over (partition by deptno order by salart desc ) as desc_rn
from deptno_salary_hr
) t1
where asc_rn > 1 and desc_rn > 1
group by deptno
353. 留存率
自联结的应用,有没有更好的思路???
create table log_user_view_retained_df(
user_name varchar(10),
time_login DATETIME
);
insert into log_user_view_retained_df value('a0','2022-06-01 23:30:45');
insert into log_user_view_retained_df value('a1','2022-06-01 23:30:45');
insert into log_user_view_retained_df value('a2','2022-06-01 23:30:45');
insert into log_user_view_retained_df value('a3','2022-06-01 23:30:45');
insert into log_user_view_retained_df value('a4','2022-06-01 23:30:45');
insert into log_user_view_retained_df value('a5','2022-06-01 23:30:45');
insert into log_user_view_retained_df value('a6','2022-06-01 23:30:45');
insert into log_user_view_retained_df value('a7','2022-06-01 23:30:45');
insert into log_user_view_retained_df value('a8','2022-06-01 23:30:45');
insert into log_user_view_retained_df value('a9','2022-06-01 23:30:45');
insert into log_user_view_retained_df value('a0','2022-06-02 23:30:45');
insert into log_user_view_retained_df value('a1','2022-06-02 23:30:45');
insert into log_user_view_retained_df value('a2','2022-06-02 23:30:45');
insert into log_user_view_retained_df value('a3','2022-06-02 23:30:45');
insert into log_user_view_retained_df value('a4','2022-06-02 23:30:45');
insert into log_user_view_retained_df value('a5','2022-06-02 23:30:45');
insert into log_user_view_retained_df value('a6','2022-06-02 23:30:45');
insert into log_user_view_retained_df value('a7','2022-06-02 23:30:45');
insert into log_user_view_retained_df value('a0','2022-06-03 23:30:45');
insert into log_user_view_retained_df value('a1','2022-06-03 23:30:45');
insert into log_user_view_retained_df value('a2','2022-06-03 23:30:45');
insert into log_user_view_retained_df value('a3','2022-06-03 23:30:45');
insert into log_user_view_retained_df value('a4','2022-06-03 23:30:45');
insert into log_user_view_retained_df value('a5','2022-06-03 23:30:45');
insert into log_user_view_retained_df value('a6','2022-06-03 23:30:45');
insert into log_user_view_retained_df value('a7','2022-06-03 23:30:45');
insert into log_user_view_retained_df value('a0','2022-06-04 23:30:45');
insert into log_user_view_retained_df value('a1','2022-06-04 23:30:45');
insert into log_user_view_retained_df value('a0','2022-06-05 23:30:45');
insert into log_user_view_retained_df value('a1','2022-06-05 23:30:45');
insert into log_user_view_retained_df value('a0','2022-06-06 23:30:45');
insert into log_user_view_retained_df value('a1','2022-06-06 23:30:45');
insert into log_user_view_retained_df value('a1','2022-06-07 23:30:45');
select
time_login,
count(distinct user_name) as '活跃用户',
count(distinct case when interval_login =1 then user_name else null end ) as '次日留存用户',
count(distinct case when interval_login =1 then user_name else null end ) / count(distinct user_name) as '次日留存率',
count(distinct case when interval_login =3 then user_name else null end ) as '三日留存用户',
count(distinct case when interval_login =3 then user_name else null end ) / count(distinct user_name) as '三日留存率'
from (
select t1.user_name,
cast(t1.time_login as date) as time_login,
datediff(cast(t2.time_login as date), cast(t1.time_login as date)) as interval_login
from log_user_view_retained_df t1
join log_user_view_retained_df t2
on cast(t1.time_login as date) >= '2022-06-01' and cast(t1.time_login as date) <= '2022-06-30'
and cast(t2.time_login as date) >= '2022-06-01' and
cast(t2.time_login as date) <= '2022-06-30'
and t1.user_name = t2.user_name
) t3
group by time_login
354. 流失回归率
分析统计6.1日活跃玩家的流失率回归率,公式:流失3日回归率 6/2-6/3未登陆但是6/4日登陆的玩家 / 6/2-6/3未登陆的玩家总数
with interval_cnt as (select count(1) as cnt from log_user_view_retained_df where cast(time_login as date) = '2022-07-01'),
interval_user as (
select user_name from log_user_view_retained_df where cast(time_login as date) = '2022-07-01'
)
select start_login_time,user_cnt,user_cnt * 1.0 / if( cnt - sum(user_cnt) over (order by start_login_time rows between unbounded preceding and 1 preceding) is null ,0,cnt - sum(user_cnt) over (order by start_login_time rows between unbounded preceding and 1 preceding))
from (
select cast(start_login_timestamp as date) as start_login_time,count(1) as user_cnt,cnt
from (
# 取用户在区间最早的登录时间
select t1.user_name, min(t1.time_login) as start_login_timestamp,cnt
from log_user_view_retained_df t1
join interval_user t2
on t1.user_name = t2.user_name
and cast(t1.time_login as date) > '2022-07-01' and
cast(t1.time_login as date) <= '2022-07-30'
join interval_cnt tmp_cnt
group by t1.user_name,cnt
) t3
group by cast(start_login_timestamp as date),cnt ) t4;
355. AB球队得分流水表,得到连续三次得分的队员名字和每次赶超对手的球员名字
连续三次得分的队员名字:和连续点击大于等于三次的用户数,中间不能有别人的点击这题是一个思路
select team,name
from (
select team,
name,
lag(name, 1) over (partition by team order by score_time) as before_1_team_name,
lag(name, 2) over (partition by team order by score_time) as before_2_team_name
from team_score_detail
) t1
where name = before_1_team_name and name=before_2_team_name
group by team,name
思路二:两两分组,打标签
select team, name
from (
select team, name, if(lag(name, 1) over (partition by team order by score_time) = name, 1, 0) as tag
from team_score_detail
) t1
where tag = 1
group by team,name,tag
having count(1) >= 3
# 也可以if判断中打标签换过来,然后开窗累计,不用过滤tag=1这种方式
每次赶超对手的球员名字,case when的使用
select team,
name,
A_sum_score ,
B_sum_score
from (
select team,
name,
A_sum_score ,
B_sum_score,
A_sum_score - B_sum_score as c_score,
lag(A_sum_score - B_sum_score, 1) over (order by score_time) as before_c_score
from (
select team,
name,
score_time,
sum(A_score) over (order by score_time) as A_sum_score,
sum(B_score) over ( order by score_time) as B_sum_score
from (
select team,
name,
score_time,
case when team = 'A' then score else 0 end as A_score,
case when team = 'B' then score else 0 end as B_score
from team_score_detail
) t1
) t2
) t3 where c_score * before_c_score < 0
# c_score * before_c_score < 0说明就是反超
第一次不算,算的话,case when 判断一下,不用c_score * before_c_score < 0
case when diff_score > 0 and lag(diff_score) over (order by score_time asc) < 0 then 1
when diff_score < 0 and lag(diff_score) over (order by score_time asc) > 0 then 1
when diff_score is not null and lag(diff_score) over (order by score_time asc) is null then 1
else 0
end
356. 拉链表
数据仓库建模的一个体现,非常经典啊,我的经验是只要是一个维度的分析(参与时间),基本都能用拉链表,多个维度分析,万物皆可bitmap,哈哈哈
create table user_act_range(
first_dt date,
guid varchar(10),
range_start date,
range_end date
);
# 初始化
insert into user_act_range value('2020-01-01','X0001','2020-01-01','2022-01-01');
insert into user_act_range value('2020-01-01','X0001','2020-03-01','9999-12-31');
insert into user_act_range value('2020-06-01','X0002','2022-06-01','9999-12-31');
insert into user_act_range value('2020-01-01','X0003','2020-01-01','2022-01-01');
# 用户登陆表(就是今天有哪些用户登陆了)
create table user_session_login(
guid varchar(10),
login date
);
insert into user_session_login value('X0002','2022-06-26');
insert into user_session_login value('X0003','2022-06-26');
insert into user_session_login value('X0004','2022-06-26');
两部分,第一部分今天登陆了,至少昨天没有登陆的用户单独union all,第二部分两张表full join 分类讨论,封闭区间,9999-12-31,新用户的情况
357. 认识的组合数
快手面试题,非常经典的题目,再写一遍
需求:现有城市网吧访问数据,字段:网吧id,访客id(身份证号),上线时间,下线时间
规则1、如果有两个用户在一家网吧的前后上下线时间在10分钟以内,则两人可能认识
规则2、如果这两个用户在三家以上网吧出现【规则1】的情况,则两人一定认识
需求:
该城市上网用户中两人一定认识的组合数
自联结的应用
设表名:table0
字段:wid , uid ,ontime ,offtime
select
id,
count(distinct wid) c
from
(select
wid,
concat(t0.uid,t1.uid) as id
from
(select
wid,
uid,
unix_timestamp(ontime,'yyyyMMdd HH:mm:ss') as ontime,
unix_timestamp(offtime,'yyyyMMdd HH:mm:ss') as offtime
from
table0
)t0
join
(select
wid,
uid,
unix_timestamp(ontime,'yyyyMMdd HH:mm:ss') as ontime,
unix_timestamp(offtime,'yyyyMMdd HH:mm:ss') as offtime
from
table0
)t1
on t0.wid=t1.wid
and t0.uid>t1.uid
and (abs(t0.ontime-t1.ontime)<10*60 or abs(t0.offtime-t1.offtime)<10*60)
)t0
group by
id
having
c>=3
思路是对的,没测试...
select count(uid) as com_cnt
from(
select uid,pre_uid
,count(distinct wid) as flag
from(
select wid
,uid
,abs(unix_timestamp(ontime,'yyyy-MM-dd HH:mm:ss')-unix_timestamp(lag(ontime,1,'1970-01-01 08:00:00') over(partition by wid order by ontime),'yyyy-MM-dd HH:mm:ss')) / 60 ontime_diff
,abs(unix_timestamp(offtime,'yyyy-MM-dd HH:mm:ss')-unix_timestamp(lag(offtime,1,'1970-01-01 08:00:00') over(partition by wid order by offtime),'yyyy-MM-dd HH:mm:ss')) / 60 offtime_diff
,lag(uid) over (partition by wid order by offtime) as pre_uid
from table0
) m
where (ontime_diff<=10 or offtime_diff<=10)
group by uid,pre_uid
) n
where flag>=3
358. 中位数
面试题,非常经典的题目
表中保存了数字的值以及其个数,求取中位数,在此表中,数字为0,0,0,0,0,0,0,1,2,2,2.3,所以中位数为(0+0)/2
create table if not exists c0629(
Number int,Frequency INT
);
insert into c0629 values (0,7),(1,1),(2,3),(3,1);
请编写一个查询来查找所有数字的中位数并将结果命名为 median 。注意:什么是中位数?当一串数字是奇数个时,例如8,3,5,1,4。我们按顺序排列后为:1,3,4,5,8。那么4就是中位数
当一串数字为偶数个时,例如8,3,5,1,4,2。我们按顺序排列后为:1,2,3,4,5,8。那么(3+4)/2=3.5就是中位数。
从位置上理解中位数,正序逆序的应用
select avg(number) as median
from
(select Number, frequency,
sum(frequency) over(order by number asc) as total,
sum(frequency) over(order by number desc) as total1
from c0629
order by number asc)as a
where total>=(select sum(frequency) from c0629)/2
and total1>=(select sum(frequency) from c0629)/2
看一下这种写法...我已经看不懂了..测试了一下....参考自连接比较法,第一步其实求取的比多少大,比多少小的问题
这里的中位数定义是,比中位数大的数 - 比中位数下的数的绝对值是最小的(前提是没有重复值),有重复值的情况下,选出equal大于或等于margin绝对值的num,为什么要这么做,如果(不大于或等于),那么就一定不是中位数呀
SQL笔面试题:如何求取中位数?
select
Number, (select sum(Frequency) from c0629 where Number <= n.Number), (select sum(Frequency) from c0629 where Number >= n.Number)
from
c0629 as n
select
avg(n.Number) as median
from
c0629 as n
where
n.Frequency >= abs(
(select sum(Frequency) from c0629 where Number <= n.Number) -
(select sum(Frequency) from c0629 where Number >= n.Number)
);
359. 混合排序
非常经典
name 是店铺名称,名称中带有“-”表示分店,score 是销售额。出题人希望能依据城市、销售额查看各个店铺的销售数据,并且当存在分店时,分店能紧挨在总店后面按照 id 排序
create table order_data (
id int ,
city varchar(1),
name varchar(4),
score int
);
insert into order_data values (1,'a','A',100);
insert into order_data values (2,'a','A-1',80);
insert into order_data values (3,'b','C',70);
insert into order_data values (4,'a','A-2',90);
insert into order_data values (5,'b','D',85);
insert into order_data values (6,'b','B',75);
insert into order_data values (7,'b','E',30);
insert into order_data values (8,'b','B-1',50);
insert into order_data values (9,'a','F',95);
insert into order_data values (10,'b','G',65);
with x as (
select
id, city, name, score,
if( instr(name,'-'),substr(name,1,1),name ) as base_name
from order_data
)
select
x.id,x.city,x.name,
x.score,x.base_name,order_data.score
from order_data join x on order_data.name = x.base_name
order by x.city,order_data.score desc,x.id;