目錄
一、原創(chuàng)文章被引用次數(shù)
0 問題描述
1 數(shù)據(jù)準(zhǔn)備
2 數(shù)據(jù)分析
?編輯
3 小結(jié)
二、學(xué)生退費人數(shù)
0 問題描述
1 數(shù)據(jù)準(zhǔn)備
2 數(shù)據(jù)分析
3 小結(jié)
一、原創(chuàng)文章被引用次數(shù)
0 問題描述
? ?求原創(chuàng)文章被引用的次數(shù),注意本題不能用關(guān)聯(lián)的形式求解。
1 數(shù)據(jù)準(zhǔn)備
? ?id表示文章id,oid表示引用的其他文章id,當(dāng)oid為0時表示當(dāng)前文章為原創(chuàng)文章。
create table if not exists table18
(
id int comment '文章id',
oid int comment '引用的其他文章id'
) comment '文章信息表';
insert overwrite table table18 values
(1,0),
(2,0),
(3,1),
(4,1),
(5,2),
(6,0),
(7,3);
2 數(shù)據(jù)分析
? ? 題目要求的是原創(chuàng)文章被引用的次數(shù),其中原創(chuàng)文章為oid等于0的文章,即求解文章id為【1,2,6】被引用的次數(shù)。常見的思路是用關(guān)聯(lián)方式求解,具體SQL如下圖所示:
思路一:用左連接 left join?
--思路一:用左連接 left join
select
t1.id,
count(t2.oid) as cnt
from (select * from table18 where oid = 0) t1
left join
(select * from table18 where oid <> 0) t2
on t1.id = t2.oid
group by t1.id
order by t1.id;
?輸出結(jié)果為:
?題意要求不能使用join等關(guān)聯(lián)形式求解,其實該題本質(zhì)是存在性計數(shù)問題。
思路二:借助array_contains(array,element) 函數(shù)
select
new_id,
sum(flag)as cnt
from (
select
id,
oid,
contains,
-- 第二步:利用array_contains()函數(shù)判斷引用的oid是否在原創(chuàng)文章id集合中,ture則記為1,false則記為0
if(array_contains(contains, oid), 1, 0) flag,
-- 第三步:清洗數(shù)據(jù),補(bǔ)充完整的原創(chuàng)文章
if(array_contains(contains, oid), oid, if(oid = 0, id, null)) new_id
from ( -- 第一步:構(gòu)建原創(chuàng)文章id集合,作為輔助列
select
id,
oid,
collect_set(if(oid = 0, id, null)) over () contains
from table18
) tmp1
) tmp2
where new_id is not null
group by new_id;
? ? 上述代碼解析:通過array_contains(array,column) 函數(shù)進(jìn)行存在性檢測,如果array中包含column 則記為1,不存在記為0,關(guān)鍵公式: sum(if(array_contains(array,column),1,0))
上述代碼解析:
第一步:構(gòu)建原創(chuàng)文章id集合contains,將contains作為輔助列。
select
id,
oid,
collect_set(if(oid = 0, id, null)) over () contains
from table18;
第二步:利用array_contains()函數(shù),判斷非原創(chuàng)的oid是否在原創(chuàng)文章id集合中,存在則計數(shù)為1,否則計數(shù)為0。
select
id,
oid,
contains,
if(array_contains(contains, oid), 1, 0) as flag
from (
select
id,
oid,
collect_set(if(oid = 0, id, null)) over () contains
from table18
) tmp1;
第三步:清洗數(shù)據(jù),對原創(chuàng)文章id補(bǔ)充完整
select
id,
oid,
contains,
if(array_contains(contains, oid), 1, 0) flag,
--清洗數(shù)據(jù),對原創(chuàng)文章id補(bǔ)充完整
if(array_contains(contains, oid), oid, if(oid = 0, id, null)) new_id
from (
select
id,
oid,
collect_set(if(oid = 0, id, null)) over () contains
from table18
) tmp1;
?
? ?ps: 此處需要對原創(chuàng)文章id補(bǔ)充完整,否則會丟失記錄。具體是:通過array_contains(contains,oid)去判斷,代碼為?if(array_contains(contains, oid), oid, if(oid = 0, id, null)) as? new_id? ?-->? 代表的意思是 :如果oid存在于原創(chuàng)文章id構(gòu)建的集合中,就取得該oid,如果不存在,再判斷oid是否為0,如果是0,則取得id,否則記為null。
第四步:將new_id 為null的數(shù)據(jù)濾掉,并對new_id分組,求出各原創(chuàng)文章被引用的次數(shù)sum(flag)as cnt
select
new_id,
sum(flag)as cnt
from (
select
id,
oid,
contains,
-- 第二步:利用array_contains()函數(shù)判斷引用的oid是否在原創(chuàng)文章id集合中,ture則記為1,false則記為0
if(array_contains(contains, oid), 1, 0) flag,
-- 第三步:清洗數(shù)據(jù),補(bǔ)充完整的原創(chuàng)文章id
if(array_contains(contains, oid), oid, if(oid = 0, id, null)) new_id
from ( -- 第一步:構(gòu)建原創(chuàng)文章id集合,作為輔助列
select
id,
oid,
collect_set(if(oid = 0, id, null)) over () contains
from table18
) tmp1
) tmp2
-- 第四步:將為null的數(shù)值過濾掉,并對new_id分組,求出各原創(chuàng)文章被引用的次數(shù)sum(flag)as cnt
where new_id is not null
group by new_id;
3 小結(jié)
?上述例子中利用array_contains(array,column)進(jìn)行存在性檢測,如果存在則記為1,不存在則記為0,核心計算公式為 sum(if(array_contains(array,value),1,0))
二、學(xué)生退費人數(shù)
0 問題描述
求截止當(dāng)前月的學(xué)生退費總?cè)藬?shù)【當(dāng)月的學(xué)生退費人數(shù):上月存在,這月不存在的學(xué)生個數(shù)】。
1 數(shù)據(jù)準(zhǔn)備
create table if not exists test19( dt string comment '日期',
stu_id string comment '學(xué)生id');
insert overwrite table test19
values ('2020-01-02','1001'),
('2020-01-02','1002'),
('2020-02-02','1001'),
('2020-02-02','1002'),
('2020-02-02','1003'),
('2020-02-02','1004'),
('2020-03-02','1001'),
('2020-03-02','1002'),
('2020-04-02','1005'),
('2020-05-02','1006');
2 數(shù)據(jù)分析
完整的代碼如下:
select month,
sum(month_cnt) over(order by month) as result
from(
select month,
lag(next_month_cnt,1,0) over(order by month) as month_cnt
from(
select distinct
t0.month as month,
sum(if(!array_contains(t1.lead_stu_id_arr,t0.stu_id),1,0)) over(partition by t0.month) as next_month_cnt
from
(select
date_format(dt,'yyyy-MM') as month,
stu_id
from test19) t0
left join
(
select month,
lead(stu_id_arr,1) over(order by month) as lead_stu_id_arr
from(
select date_format(dt,'yyyy-MM') as month,
collect_list(stu_id) as stu_id_arr
from test19
group by date_format(dt,'yyyy-MM')
) tmp1
) t1
on t0.month = t1.month
) tmp2
) tmp3;
第一步:聚合每個月的stu_id,利用collect_list()函數(shù)(不去重)合并,具體sql如下:
select date_format(dt,'yyyy-MM') as month,
collect_list(stu_id) as stu_id_arr
from test19
group by date_format(dt,'yyyy-MM')
計算結(jié)果如下:
2020-01 [1001,1002]
2020-02 [1001,1002,1003,1004]
2020-03 [1001,1002]
2020-04 [1005]
2020-05 [1006]
第二步:按照月份排序,獲取下一月合并之后的值,sql如下:
select month,
stu_id_arr,
lead(stu_id_arr,1) over(order by month) as lead_stu_id_arr
from(
select
date_format(dt,'yyyy-MM') as month,
collect_list(stu_id) as stu_id_arr
from test19
group by date_format(dt,'yyyy-MM')
) tmp1;
計算結(jié)果如下:
2020-01 [1001,1002] [1001,1002,1003,1004]
2020-02 [1001,1002,1003,1004] [1001,1002]
2020-03 [1001,1002] [1005]
2020-04 [1005] [1006]
2020-05 [1006] NULL
? ? ?ps:總體思路是利用數(shù)組差集函數(shù)求出差值集合后,再利用size()求出具體的個數(shù),最后sum聚合即可。hive中的數(shù)組函數(shù)array_contains可以實現(xiàn)這個需求,該函數(shù)表示在數(shù)組中查詢某個元素是否存在。在該題目中,借助此函數(shù)判斷 當(dāng)月某個學(xué)生id是否在下月(數(shù)據(jù)集合 -->數(shù)組)中存在,如果存在就為0,不存在標(biāo)記為1。
?第三步:利用步驟2的結(jié)果與原表進(jìn)行關(guān)聯(lián),獲取當(dāng)前學(xué)生id
select
t0.*,
t1.*
from (select
date_format(dt, 'yyyy-MM') as month,
stu_id
from test19) t0
left join ( select
month,
lead(stu_id_arr, 1) over (order by month) as lead_stu_id_arr
from ( select
date_format(dt, 'yyyy-MM') as month,
collect_list(stu_id) as stu_id_arr
from test19
group by date_format(dt, 'yyyy-MM')
) tmp1
) t1
on t0.month = t1.month;
結(jié)果如下:
2020-01 1001 2020-01 [1001,1002,1003,1004]
2020-01 1002 2020-01 [1001,1002,1003,1004]
2020-02 1001 2020-02 [1001,1002]
2020-02 1002 2020-02 [1001,1002]
2020-02 1003 2020-02 [1001,1002]
2020-02 1004 2020-02 [1001,1002]
2020-03 1001 2020-03 [1005]
2020-03 1002 2020-03 [1005]
2020-04 1005 2020-04 [1006]
2020-05 1006 2020-05 NULL
第四步:利用array_contains()函數(shù)判斷當(dāng)月的stu_id是否在下個月array數(shù)組中,如果存在標(biāo)記0,不存在標(biāo)記1。具體sql如下:
select t0.month,
t0.stu_id,
if(!array_contains(t1.lead_stu_id_arr,t0.stu_id),1,0) as flag
from
(select
date_format(dt,'yyyy-MM') as month,
stu_id
from test19) t0
left join
(
select month,
lead(stu_id_arr,1) over(order by month) as lead_stu_id_arr
from(
select date_format(dt,'yyyy-MM') as month,
collect_list(stu_id) as stu_id_arr
from test19
group by date_format(dt,'yyyy-MM')
) tmp1
) t1
on t0.month = t1.month
結(jié)果如下:
2020-01 1001 0
2020-01 1002 0
2020-02 1001 0
2020-02 1002 0
2020-02 1003 1
2020-02 1004 1
2020-03 1001 1
2020-03 1002 1
2020-04 1005 1
2020-05 1006 1
第五步:基于步驟四的結(jié)果,按照月份分組,對flag求和,得到下個月的學(xué)生退費人數(shù)
select distinct t0.month,
-- 求解下個月的退費人數(shù)
sum(if(!array_contains(t1.lead_stu_id_arr,t0.stu_id),1,0)) over(partition by t0.month) as next_month_cnt
from (select
date_format(dt,'yyyy-MM') as month,
stu_id
from test19) t0
left join
( select month,
lead(stu_id_arr,1) over(order by month) as lead_stu_id_arr
from( select
date_format(dt,'yyyy-MM') as month,
collect_list(stu_id) as stu_id_arr
from test19
group by date_format(dt,'yyyy-MM')
) tmp1
) t1
on t0.month = t1.month;
計算結(jié)果如下:
注意:第二列求是下個月的退費人數(shù)。
2020-01 0
2020-02 2
2020-03 2
2020-04 1
第六步:計算當(dāng)前月的退費人數(shù)
? ? 步驟五計算的是下一個月的學(xué)生退費人數(shù),再利用?lag(next_month_cnt,1,0) over(order by month) 向上偏移一行,就得到當(dāng)前月的退費人數(shù)。
sql代碼如下:
select month,
--基于下月的退費人數(shù)month_cnt字段,向上偏移一行,就得到當(dāng)前月的退費人數(shù)
lag(next_month_cnt,1,0) over(order by month) as month_cnt
from(
select distinct t0.month as month,
sum(if(!array_contains(t1.lead_stu_id_arr,t0.stu_id),1,0)) over(partition by t0.month) as next_month_cnt
from
(select
date_format(dt,'yyyy-MM') as month,
stu_id
from test19) t0
left join
(
select month,
lead(stu_id_arr,1) over(order by month) as lead_stu_id_arr
from(
select date_format(dt,'yyyy-MM') as month,
collect_list(stu_id) as stu_id_arr
from test19
group by date_format(dt,'yyyy-MM')
) tmp1
) t1
on t0.month = t1.month
) tmp2;
計算結(jié)果如下:
2020-01 0
2020-02 0
2020-03 2
2020-04 2
2020-05 1
計算截止到當(dāng)前月的退費人數(shù),sql代碼如下:
select month,
-- sum() over(order by ..) 窗口計算范圍:上無邊界(起始行)到當(dāng)前行
sum(month_cnt) over(order by month) as result
from(
select month,
lag(next_month_cnt,1,0) over(order by month) as month_cnt
from(
select distinct t0.month as month,
sum(if(!array_contains(t1.lead_stu_id_arr,t0.stu_id),1,0)) over(partition by t0.month) as next_month_cnt
from
(select
date_format(dt,'yyyy-MM') as month,
stu_id
from test19) t0
left join
(
select month,
lead(stu_id_arr,1) over(order by month) as lead_stu_id_arr
from(
select date_format(dt,'yyyy-MM') as month,
collect_list(stu_id) as stu_id_arr
from test19
group by date_format(dt,'yyyy-MM')
) tmp1
) t1
on t0.month = t1.month
) tmp2
) tmp3;
計算結(jié)果為:
2020-01 0
2020-02 0
2020-03 2
2020-04 4
2020-05 5
3 小結(jié)
? ?針對存在性問題,一般的求解思路是:1.利用collect_set()或者 collect_list()函數(shù)進(jìn)行聚合,將數(shù)據(jù)集轉(zhuǎn)換成數(shù)據(jù)組。2.再利用array_contains()等函數(shù)判斷集合(數(shù)組)中是否存在某元素,針對結(jié)果打上標(biāo)簽。3.再根據(jù)標(biāo)簽進(jìn)行之后的分組聚合計算等。
ps:以上文章參考:文章來源:http://www.zghlxwxcb.cn/news/detail-856540.html
https://blog.csdn.net/godlovedaniel/article/details/119388498?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522167921970316800184142859%2522%252C%2522scm%2522%253A%252220140713.130102334..%2522%257D&request_id=167921970316800184142859&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~sobaiduend~default-1-119388498-null-null.142^v74^control_1,201^v4^add_ask,239^v2^insert_chatgpt&utm_term=%E5%AD%98%E5%9C%A8%E6%80%A7%E9%97%AE%E9%A2%98&spm=1018.2226.3001.4187文章瀏覽閱讀741次。本文對存在性問題進(jìn)行了探討和研究,此類問題往往需要對不同的記錄做對比分析,我們可以先將符合條件的數(shù)據(jù)域按照collect_set()或collect_list()函數(shù)進(jìn)行聚合轉(zhuǎn)換成數(shù)組,然后獲取歷史的數(shù)據(jù)域放入當(dāng)前行,最后利用hive中數(shù)組的相關(guān)處理手段進(jìn)行對比分析。常用的hive數(shù)組處理函數(shù)如expode()、size()、array()、array_contains()等函數(shù),本題就借助于hive ,array_contains()函數(shù)進(jìn)行存在性問題分析。_sql 求截止當(dāng)前月退費總?cè)藬?shù)https://blog.csdn.net/godlovedaniel/article/details/119388498?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522167921970316800184142859%2522%252C%2522scm%2522%253A%252220140713.130102334..%2522%257D&request_id=167921970316800184142859&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~sobaiduend~default-1-119388498-null-null.142%5Ev74%5Econtrol_1,201%5Ev4%5Eadd_ask,239%5Ev2%5Einsert_chatgpt&utm_term=%E5%AD%98%E5%9C%A8%E6%80%A7%E9%97%AE%E9%A2%98&spm=1018.2226.3001.4187文章來源地址http://www.zghlxwxcb.cn/news/detail-856540.html
到了這里,關(guān)于HiveSQL題——array_contains函數(shù)的文章就介紹完了。如果您還想了解更多內(nèi)容,請在右上角搜索TOY模板網(wǎng)以前的文章或繼續(xù)瀏覽下面的相關(guān)文章,希望大家以后多多支持TOY模板網(wǎng)!