取N天 第次、7、30留存
如何设置n的变量,取1月整月 次 7 30留存
select
site `端口`,count(distinct aa.user_id) as`用户数`,
count(distinct bb.user_id) / count(distinct aa.user_id) as`次日留存率`,
count(distinct cc.user_id) / count(distinct aa.user_id) as`7日留存率`,
count(distinct dd.user_id) / count(distinct aa.user_id) as`30日留存率`,aa.dt `日期`
from
( select
distinct dt,
user_id
from 表明
where
dt='N'
) aa
left join (
select
distinct dt,
user_id,
from
表名
where
dt='N+1'
) bb on aa.user_id = bb.user_id
and aa.site = bb.site
left join (
select
distinct dt,
user_id,
from
表名
where
dt='N+7'
) cc on aa.user_id = bb.user_id
and aa.site = cc.site
left join (
select
distinct dt,
user_id,
from
表名
where
dt='N+30'
) dd on aa.user_id = dd.user_id
and aa.site = bb.site
group by
aa.dt,
aa.site
用开窗函数的滑动窗口,可以实现取当前行的下N行,
先提前统计一个数据, dt、site、人数count,3个字段作为子查询,
然后按site分组,按日期排序,取当前行往下数的第7行,比如下面这个,就能在同一行取到第7天的数据了
first_value(人数count) over(partition by site order by dt rows between 7 following and 7 following)
看上去没我之前想的那么简单,还需要排除新用户的影响,那么这个时候就需要先得到一个数据,即每个用户每天在次日、7日、30日时的登录状态,这个可以使用开窗函数中的range between来处理,排序用日期差,这样1天就是一个range
--测试表
create table test_20220318b (dt string,user_id string, site string);
insert into test_20220318b values('20220201','a','app01');
insert into test_20220318b values('20220202','a','app01');
insert into test_20220318b values('20220203','a','app01');
insert into test_20220318b values('20220208','a','app01');
insert into test_20220318b values('20220228','a','app01');
insert into test_20220318b values('20220301','a','app01');
insert into test_20220318b values('20220302','a','app01');
insert into test_20220318b values('20220303','a','app01');
insert into test_20220318b values('20220308','a','app01');
insert into test_20220318b values('20220201','b','app01');
insert into test_20220318b values('20220202','b','app01');
insert into test_20220318b values('20220228','b','app01');
insert into test_20220318b values('20220301','b','app01');
insert into test_20220318b values('20220308','b','app01');
--查询sql
select dt,site,
sum(d1)/count( user_id) `次日留存率`,
sum(d7)/count( user_id) `7日留存率`,
sum(d30)/count( user_id) `30日留存率`
from (
select aa.dt ,
aa.site ,
user_id,
count(distinct user_id) over(partition by user_id order by df
range between current row and 1 following )-1 d1,
count(distinct user_id) over(partition by user_id order by df
range between 6 following and 7 following ) d7,
count(distinct user_id) over(partition by user_id order by df
range between 29 following and 30 following ) d30
from (select distinct dt,
user_id ,
site ,
datediff(from_unixtime(unix_timestamp(dt,'yyyyMMdd'),'yyyy-MM-dd'),
from_unixtime(unix_timestamp('20220201','yyyyMMdd'),'yyyy-MM-dd')) df
from test_20220318b
where dt >= '20220201'
) aa ) bb
group by dt,site