滴滴 x StarRocks:极速多维分析创造更大的业务价值(三)


select user_id, new_user_id, (max(new_user_id) over()) as max_id from
global_dict_by_userid_hive_table
) t2
on
t1.user_id = t2.user_id
where t2.newuser_id is null
3、原始表和更新后的全局字典表进行left join,将新增用户的ID和编码后的整型用户ID插入到原始表中:
-- Rewrite the fact table so that each user_id is paired with its integer
-- code from the global dictionary table.
-- LEFT JOIN keeps every fact row; new_user_id is NULL for any user_id that
-- has no matching dictionary entry.
INSERT OVERWRITE fact_log_user_hive_table
SELECT
    fact_log.user_id,
    user_dict.new_user_id
FROM fact_log_user_hive_table AS fact_log
LEFT JOIN global_dict_by_userid_hive_table AS user_dict
    ON fact_log.user_id = user_dict.user_id
4、创建Spark离线同步任务,完成Hive原始表到StarRocks明细表的数据同步。StarRocks表fact_log_user_doris_table的定义如下(Hive表fact_log_user_hive_table与该表的结构一致):
-- Detail table (DUPLICATE KEY model) holding one row per user log record,
-- including the integer-encoded user id (new_user_id) next to the original
-- string user_id. Range-partitioned by day on dt, hash-distributed on
-- (new_user_id, user_id).
--
-- Fixes relative to the previously published DDL:
--   * DUPLICATE KEY / DISTRIBUTED BY referenced `index_id`, which is not a
--     column of this table; the integer id column is `new_user_id`.
--   * Stray non-SQL text inside the PROPERTIES list was removed.
--
-- NOTE(review): StarRocks expects the key columns to be the leading columns
-- of the table, in the same order as declared. `biz_channel_name`, `pro_id`
-- and `city_id` are not adjacent to `user_id` here, so either the column
-- order or the key list may need adjusting -- confirm against the deployed
-- schema before running.
CREATE TABLE `fact_log_user_doris_table` (
    `new_user_id` bigint(20) NULL COMMENT "整型用户id",
    `user_id` varchar(65533) NULL COMMENT "用户id",
    `event_source` varchar(65533) NULL COMMENT "端(1:商城小程序 2:团长小程序 3:独立APP 4:主端)",
    `is_new` varchar(65533) NULL COMMENT "是否新用户",
    `identity` varchar(65533) NULL COMMENT "用户身份(团长或者普通用户)",
    `biz_channel_name` varchar(65533) NULL COMMENT "当天首次落地页渠道名称",
    `pro_id` varchar(65533) NULL COMMENT "省ID",
    `pro_name` varchar(65533) NULL COMMENT "省名称",
    `city_id` varchar(65533) NULL COMMENT "城市ID",
    `city_name` varchar(65533) NULL COMMENT "城市名称",
    `dt` date NULL COMMENT "分区",
    `period_type` varchar(65533) NULL DEFAULT "daily" COMMENT ""
) ENGINE=OLAP
DUPLICATE KEY(`new_user_id`, `user_id`, `biz_channel_name`, `pro_id`, `city_id`)
-- One partition per day; each range is half-open: [start, end).
PARTITION BY RANGE(`dt`)(
    PARTITION p20210731 VALUES [('2021-07-31'), ('2021-08-01')),
    PARTITION p20210801 VALUES [('2021-08-01'), ('2021-08-02')),
    PARTITION p20210802 VALUES [('2021-08-02'), ('2021-08-03')),
    PARTITION p20210803 VALUES [('2021-08-03'), ('2021-08-04')),
    PARTITION p20210804 VALUES [('2021-08-04'), ('2021-08-05')),
    PARTITION p20210805 VALUES [('2021-08-05'), ('2021-08-06')),
    PARTITION p20210806 VALUES [('2021-08-06'), ('2021-08-07')),
    PARTITION p20210807 VALUES [('2021-08-07'), ('2021-08-08')),
    PARTITION p20210808 VALUES [('2021-08-08'), ('2021-08-09')))
DISTRIBUTED BY HASH(`new_user_id`, `user_id`) BUCKETS 10
PROPERTIES (
    "replication_num" = "3",
    -- Dynamic partitioning: daily partitions named with prefix "p".
    "dynamic_partition.enable" = "true",
    "dynamic_partition.time_unit" = "DAY",
    "dynamic_partition.time_zone" = "Asia/Shanghai",
    -- Presumably the minimum-integer sentinel meaning historical partitions
    -- are never dropped -- confirm against the dynamic partitioning docs.
    "dynamic_partition.start" = "-2147483648",
    "dynamic_partition.end" = "1",
    "dynamic_partition.prefix" = "p",
    "dynamic_partition.replication_num" = "-1",
    -- NOTE(review): differs from BUCKETS 10 above; confirm this is intended
    -- for dynamically created partitions.
    "dynamic_partition.buckets" = "3",
    "in_memory" = "false",
    "storage_format" = "DEFAULT"
);
在这里,我们使用了StarRocks的明细模型来建表,满足用户查询漏斗明细数据的使用场景;在明细表上根据不同的多维漏斗分析查询需求创建相应的物化视图,来满足用户选择不同维度查看漏斗模型每一步骤用户精确去重数量的使用场景。

推荐阅读