0% found this document useful (0 votes)
74 views3 pages

Appstore Homework

The document contains sample log files from May 25 and 26, 2022, recording user searches in an app store. It creates an external Hive staging table to structure the raw pipe-delimited log data and loads each file into its own date partition. A dimension table is then created and populated from the staging data, merging new rows with existing ones to avoid duplicates and splitting the login ID into separate sign and IMEI fields.

Uploaded by

Debasish Mishra
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF or read online on Scribd
0% found this document useful (0 votes)
74 views3 pages

Appstore Homework

The document contains sample log files from May 25 and 26, 2022, recording user searches in an app store. It creates an external Hive staging table to structure the raw pipe-delimited log data and loads each file into its own date partition. A dimension table is then created and populated from the staging data, merging new rows with existing ones to avoid duplicates and splitting the login ID into separate sign and IMEI fields.

Uploaded by

Debasish Mishra
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF or read online on Scribd
You are on page 1/ 3

vi appstore_data_25may.txt
123@123999|32345|entertainment|2022-05-25|2|xxx222|wifi|12.13.134|src1
123@124000|32346|entertainment|2022-05-25|3|xxx223|wifi|12.13.135|src2
123@124001|32347|entertainment|2022-05-25|4|xxx224|wifi|12.13.136|src3
123@124002|32348|entertainment|2022-05-25|5|xxx225|wifi|12.13.137|src4
123@124003|32349|entertainment|2022-05-25|6|xxx226|wifi|12.13.138|src5
cat appstore_data_25may.txt
vi appstore_data_26may.txt
123@123999|32345|finance|2022-05-26|10|xxx222|wifi|12.13.178|src1
123@124000|32346|love|2022-05-26|2|xxx223|wifi|12.13.135|src2
123@124001|32347|Tesla|2022-05-26|2|xxx224|wifi|12.13.136|src3
123@124002|32348|Kids|2022-05-26|5|xxx225|boradband|12.13.137|src4
123@124003|32349|entertainment|2022-05-26|6|xxx226|wifi|12.13.138|src5
cat appstore_data_26may.txt

-- Create the working database and make it the current one.
-- IF NOT EXISTS keeps the script re-runnable: a second run is a no-op
-- instead of failing because the database is already there.
CREATE DATABASE IF NOT EXISTS appstore;

USE appstore;

-- Staging table laid over the raw, pipe-delimited app store search logs.
-- EXTERNAL: dropping the table leaves the files under LOCATION untouched.
-- Column order follows the field order of the log lines; most column
-- COMMENTs carry a sample value rather than a description.
-- part_dt is the load-date partition supplied by each LOAD DATA statement.
CREATE EXTERNAL TABLE IF NOT EXISTS stage_appstore_search_dm_astore (
    login_id        STRING COMMENT '123',
    device_id       STRING COMMENT 'dev123',
    search_term     STRING COMMENT 'finance',
    oper_date       STRING COMMENT '2020-11-11 15:30:12',
    result_count    INT    COMMENT '5',
    biz_channel_id  STRING COMMENT 'xxx999',
    network_type_id STRING COMMENT '5G',
    ip_address      STRING COMMENT 'The ip address of appstore 22.23.124.30',
    source          STRING COMMENT 'game center'
)
PARTITIONED BY (part_dt STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/bigdata/appstore/hadoop/datacenter/stage/stage_appstore_search_dm';
-- Load each day's local log file into its own part_dt partition of the
-- staging table. INTO TABLE (without OVERWRITE) appends to the partition,
-- so running these statements again adds the same rows a second time.
LOAD DATA LOCAL INPATH '/home/cloudera/appstore_data_25may.txt'
INTO TABLE stage_appstore_search_dm_astore
PARTITION (part_dt='2022-05-25');

LOAD DATA LOCAL INPATH '/home/cloudera/appstore_data_26may.txt'
INTO TABLE stage_appstore_search_dm_astore
PARTITION (part_dt='2022-05-26');

-- Session settings for a dynamic-partition INSERT.
-- Allow partition values to be taken from the query result.
SET hive.exec.dynamic.partition=true;
-- nonstrict: permit an insert in which every partition column is dynamic.
SET hive.exec.dynamic.partition.mode=nonstrict;
-- Upper bound on dynamic partitions created by one statement in total.
SET hive.exec.max.dynamic.partitions=2000;
-- Upper bound on dynamic partitions created by a single mapper/reducer node.
SET hive.exec.max.dynamic.partitions.pernode=2000;

-- Dimension table holding the cleaned app store search events,
-- partitioned by part_dt.
-- EXTERNAL: dropping the table leaves the files under LOCATION untouched.
-- No ROW FORMAT is declared, so Hive's default text-file delimiters apply.
-- The network_type_id COMMENT is written as a single-line literal; a line
-- break inside the quotes would be stored as part of the comment text.
CREATE EXTERNAL TABLE IF NOT EXISTS dim_appstore_search_dm (
    sign            STRING COMMENT 'User sign',
    imei            STRING COMMENT 'Encrypted imei',
    search_term     STRING COMMENT 'The searched term',
    result_count    INT    COMMENT 'The search result count',
    source          STRING COMMENT '',
    client_ver_cd   STRING COMMENT '',
    network_type_id INT    COMMENT 'Values: 1 WLAN,2 2G,3 3G, 4 4G, 0, -1 Unknown',
    eventdate       STRING COMMENT 'The search time'
)
PARTITIONED BY (part_dt STRING)
STORED AS TEXTFILE
LOCATION '/bigdata/appstore/hadoop/datacenter/DIM/dim_appstore_search_dm';

-- Merge the 2022-05-26 staging partition into dim_appstore_search_dm.
--
-- existing_rows: rows already stored in the dimension table from 20220525 on.
-- new_rows:      staging rows for 2022-05-26 reshaped to the dimension layout
--                (login_id split on '@' into sign / imei, client_ver_cd fixed
--                to '99', network_type_id cast to INT).
--
-- The FULL OUTER JOIN keeps rows found on either side and collapses a pair
-- that agrees on every join key into one row, preferring the new values.
-- INSERT OVERWRITE with a dynamic part_dt then rewrites each partition that
-- receives rows.
--
-- Two details matter for correctness:
--   * Dimension partitions are named yyyyMMdd (see part_dt below), so the
--     filter on the dimension table must use that format as well.
--   * Non-numeric staging network types (e.g. 'wifi') cast to NULL while the
--     dimension table stores -1; the -1 default is therefore applied BEFORE
--     the join, otherwise NULL = -1 never matches and every re-run would
--     insert the same rows again.
--
-- NOTE(review): imei is not part of the match key, so rows that differ only
-- by imei are merged into one - confirm this is intended.
INSERT OVERWRITE TABLE dim_appstore_search_dm
PARTITION (part_dt)
SELECT
    merged.sign,
    merged.imei,
    merged.search_term,
    merged.result_count,
    merged.source,
    merged.client_ver_cd,
    merged.network_type_id,
    merged.eventdate,
    REGEXP_REPLACE(SUBSTR(merged.eventdate, 1, 10), '-', '') AS part_dt
FROM (
    SELECT
        COALESCE(new_rows.sign, existing_rows.sign)                       AS sign,
        COALESCE(new_rows.imei, existing_rows.imei)                       AS imei,
        COALESCE(new_rows.search_term, existing_rows.search_term)         AS search_term,
        COALESCE(new_rows.result_count, existing_rows.result_count)       AS result_count,
        COALESCE(new_rows.network_type_id, existing_rows.network_type_id, -1) AS network_type_id,
        COALESCE(new_rows.source, existing_rows.source)                   AS source,
        COALESCE(new_rows.client_ver_cd, existing_rows.client_ver_cd)     AS client_ver_cd,
        COALESCE(new_rows.eventdate, existing_rows.eventdate)             AS eventdate
    FROM (
        SELECT
            sign,
            imei,
            search_term,
            result_count,
            network_type_id,
            source,
            client_ver_cd,
            eventdate
        FROM appstore.dim_appstore_search_dm
        WHERE part_dt >= '20220525'
    ) existing_rows
    FULL OUTER JOIN (
        SELECT
            SPLIT(login_id, '@')[0]                    AS sign,
            SPLIT(login_id, '@')[1]                    AS imei,
            search_term,
            result_count,
            COALESCE(CAST(network_type_id AS INT), -1) AS network_type_id,
            source,
            '99'                                       AS client_ver_cd,
            oper_date                                  AS eventdate
        FROM appstore.stage_appstore_search_dm_astore
        WHERE part_dt = '2022-05-26'
    ) new_rows
        ON  new_rows.sign            = existing_rows.sign
        AND new_rows.search_term     = existing_rows.search_term
        AND new_rows.result_count    = existing_rows.result_count
        AND new_rows.network_type_id = existing_rows.network_type_id
        AND new_rows.source          = existing_rows.source
        AND new_rows.eventdate       = existing_rows.eventdate
) merged
;

You might also like