数据开发跑路了,只能自己去 hive 导数据,需求是:
-
xfrom='sugg'的全要
-
其它 xfrom 的按 sub_region 分组根据 pubtime 一个月前取 xx 条,一个月内取 xx 条,都要按 click 倒序排
-
汇总到一起
渣渣业务开发只能写出这种 sql 了,跑了下执行一次要 49 块钱,求个优化🙏
WITH total AS (
SELECT * FROM xx.xx WHERE log_date='20220515'
),
other AS (
SELECT * FROM total WHERE xfrom != 'sugg'
),
final AS (
SELECT * FROM total WHERE xfrom='sugg'
UNION ALL
SELECT * FROM (SELECT * FROM other WHERE sub_region='a' AND pubtime < '${yyyyMMdd,-1m}' ORDER BY click DESC LIMIT 1000) a1
UNION ALL
SELECT * FROM (SELECT * FROM other WHERE sub_region='a' AND pubtime >= '${yyyyMMdd,-1m}' ORDER BY click DESC LIMIT 1000) a2
UNION ALL
SELECT * FROM (SELECT * FROM other WHERE sub_region='b' AND pubtime < '${yyyyMMdd,-1m}' ORDER BY click DESC LIMIT 1000) b1
UNION ALL
SELECT * FROM (SELECT * FROM other WHERE sub_region='b' AND pubtime >= '${yyyyMMdd,-1m}' ORDER BY click DESC LIMIT 1000) b2
UNION ALL
SELECT * FROM (SELECT * FROM other WHERE sub_region='c' AND pubtime < '${yyyyMMdd,-1m}' ORDER BY click DESC LIMIT 250) c1
UNION ALL
SELECT * FROM (SELECT * FROM other WHERE sub_region='c' AND pubtime >= '${yyyyMMdd,-1m}' ORDER BY click DESC LIMIT 250) c2
UNION ALL
SELECT * FROM (SELECT * FROM other WHERE sub_region='d' AND pubtime < '${yyyyMMdd,-1m}' ORDER BY click DESC LIMIT 500) d1
UNION ALL
SELECT * FROM (SELECT * FROM other WHERE sub_region='d' AND pubtime >= '${yyyyMMdd,-1m}' ORDER BY click DESC LIMIT 500) d2
UNION ALL
SELECT * FROM (SELECT * FROM other WHERE sub_region='e' AND pubtime < '${yyyyMMdd,-1m}' ORDER BY click DESC LIMIT 500) e1
UNION ALL
SELECT * FROM (SELECT * FROM other WHERE sub_region='e' AND pubtime >= '${yyyyMMdd,-1m}' ORDER BY click DESC LIMIT 500) e2
UNION ALL
SELECT * FROM (SELECT * FROM other WHERE sub_region='f' AND pubtime < '${yyyyMMdd,-1m}' ORDER BY click DESC LIMIT 50) f1
UNION ALL
SELECT * FROM (SELECT * FROM other WHERE sub_region='f' AND pubtime >= '${yyyyMMdd,-1m}' ORDER BY click DESC LIMIT 50) f2
UNION ALL
SELECT * FROM (SELECT * FROM other WHERE sub_region='g' AND pubtime < '${yyyyMMdd,-1m}' ORDER BY click DESC LIMIT 50) g1
UNION ALL
SELECT * FROM (SELECT * FROM other WHERE sub_region='g' AND pubtime >= '${yyyyMMdd,-1m}' ORDER BY click DESC LIMIT 50) g2
UNION ALL
-- 这个 sub_region 有特殊要求,其它都是一样的条件,只是 limit 数量不同
SELECT * FROM (SELECT * FROM other WHERE sub_region='h' AND pubtime < '${yyyyMMdd,-1m}' AND (tag LIKE '%th%' OR tag LIKE '%ph%') ORDER BY click DESC LIMIT 150) h1
UNION ALL
SELECT * FROM (SELECT * FROM other WHERE sub_region='h' AND pubtime >= '${yyyyMMdd,-1m}' AND (tag LIKE '%th%' OR tag LIKE '%ph%') ORDER BY click DESC LIMIT 150) h2
)
INSERT OVERWRITE TABLE yy.yy SELECT * FROM final;