Hudi数据湖与Hive集成示例
作者:快盘下载 人气:38-- 1. 创建数据库database
CREATE DATABASE IF NOT EXISTS db_hudi ; -- 2. 使用数据库 USE db_hudi ; -- 3. 创建表 CREATE EXTERNAL TABLE IF NOT EXISTS tbl_hudi_didi( order_id bigint , product_id int , city_id int , district int , county int , type int , combo_type int , traffic_type int , passenger_count int , driver_product_id int , start_dest_distance int , arrive_time string , departure_time string , pre_total_fee double , normal_time string , bubble_trace_id string , product_1level int , dest_lng double , dest_lat double , starting_lng double , starting_lat double , partitionpath string , ts bigint ) PARTITIONED BY (date_str string) ROW FORMAT SERDE ;org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; STORED AS INPUTFORMAT ;org.apache.hudi.hadoop.HoodieParquetInputFormat; OUTPUTFORMAT ;org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; LOCATION ;/hudi-warehouse/tbl_didi_haikou; ; -- 查看分区表分区 SHOW PARTITIONS db_hudi.tbl_hudi_didi ; -- 5. 手动添加分区信息 ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-5-22;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-5-22; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-5-23;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-5-23; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-5-24;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-5-24; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-5-25;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-5-25; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-5-26;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-5-26; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-5-27;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-5-27; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-5-28;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-5-28; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-5-29;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-5-29; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-5-30;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-5-30; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-5-31;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-5-31; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-6-1;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-6-1; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-6-2;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-6-2; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-6-3;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-6-3; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-6-4;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-6-4; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-6-5;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-6-5; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-6-6;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-6-6; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-6-7;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-6-7; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-6-8;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-6-8; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-6-9;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-6-9; ; ALTER TABLE db_hudi.tbl_hudi_didi ADD IF NOT EXISTS PARTITION (date_str = ;2017-6-10;) LOCATION ;/hudi-warehouse/tbl_didi_haikou/2017-6-10; ; -- 测试;查询数据 SET hive.mapred.mode = nonstrict ; SELECT order_id, product_id, type, pre_total_fee, traffic_type, start_dest_distance FROM db_hudi.tbl_hudi_didi LIMIT 20; -- 开发测试;设置运行模式为本地模式 set hive.exec.mode.local.auto=true; set hive.exec.mode.local.auto.tasks.max=10; set hive.exec.mode.local.auto.inputbytes.max=50000000; -- 指标一;订单类型统计 WITH tmp AS ( SELECT product_id, COUNT(1) AS total FROM db_hudi.tbl_hudi_didi GROUP BY product_id ) SELECT CASE product_id WHEN 1 THEN ;滴滴专车; WHEN 2 THEN ;滴滴企业专车; WHEN 3 THEN ;滴滴快车; WHEN 4 THEN ;滴滴企业快车; END AS order_type, total FROM tmp ; -- 指标二;订单时效性统计 WITH tmp AS ( SELECT type, COUNT(1) AS total FROM db_hudi.tbl_hudi_didi GROUP BY type ) SELECT CASE type WHEN 0 THEN ;实时; WHEN 1 THEN ;预约; END AS order_type, total FROM tmp ; -- 指标三;订单交通类型统计 SELECT traffic_type, COUNT(1) AS total FROM db_hudi.tbl_hudi_didi GROUP BY traffic_type ; -- 指标五;订单价格统计;先将价格划分区间;再统计;此处使用 WHEN函数和SUM函数 SELECT SUM( CASE WHEN pre_total_fee BETWEEN 0 AND 15 THEN 1 ELSE 0 END ) AS 0_15, SUM( CASE WHEN pre_total_fee BETWEEN 16 AND 30 THEN 1 ELSE 0 END ) AS 16_30, SUM( CASE WHEN pre_total_fee BETWEEN 31 AND 50 THEN 1 ELSE 0 END ) AS 31_50, SUM( CASE WHEN pre_total_fee BETWEEN 50 AND 100 THEN 1 ELSE 0 END ) AS 51_100, SUM( CASE WHEN pre_total_fee > 100 THEN 1 ELSE 0 END ) AS 100_ FROM db_hudi.tbl_hudi_didi ;
加载全部内容