0% found this document useful (0 votes)
111 views6 pages

HIVE Codes

This document contains HiveQL queries that: 1) Create external tables and databases to store and query e-commerce data from files on S3 and the local filesystem. 2) Perform ETL processes like loading and transforming the data. 3) Write analytical queries on the data to calculate metrics like total revenue, sales by month, top products and brands. 4) Experiment with different data storage formats like partitioned and bucketed tables.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
111 views6 pages

HIVE Codes

This document contains HiveQL queries that: 1) Create external tables and databases to store and query e-commerce data from files on S3 and the local filesystem. 2) Perform ETL processes like loading and transforming the data. 3) Write analytical queries on the data to calculate metrics like total revenue, sales by month, top products and brands. 4) Experiment with different data storage formats like partitioned and bucketed tables.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 6

-- External, text-backed table user_info.
-- Fields are separated by "|" and rows by newlines.
CREATE EXTERNAL TABLE IF NOT EXISTS user_info (
    id         INT,
    age        INT,
    gender     STRING,
    profession STRING,
    reviews    INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY "|"
    LINES TERMINATED BY "\n"
STORED AS TEXTFILE;

aws s3 cp s3://ml-cloud-dataset/u.user .

-- Create the demo database (no-op when it already exists).
-- NOTE(review): no USE statement follows, so the tables below are created in
-- whatever database is current, not necessarily in demo -- confirm intent.
CREATE DATABASE IF NOT EXISTS demo;

-- Print column headers in CLI query output (the original issued this twice).
SET hive.cli.print.header=true;

-- External, text-backed table user_info; fields split on "|", rows on newline.
-- Fixed: the storage clause keyword is STORED AS (the original had "store as",
-- which does not parse).
CREATE EXTERNAL TABLE IF NOT EXISTS user_info (
    id         INT,
    age        INT,
    gender     STRING,
    profession STRING,
    reviews    INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY "|"
    LINES TERMINATED BY "\n"
STORED AS TEXTFILE;

DESCRIBE user_info;

-- Load the user file from the local filesystem into user_info.
LOAD DATA LOCAL INPATH '/home/hadoop/u.user' INTO TABLE user_info;

-- Two-column projection of user_info (id and profession only).
CREATE TABLE secondTable (
    user_id         INT,
    user_profession STRING
)
STORED AS TEXTFILE;

INSERT INTO TABLE secondTable
SELECT
    id,
    profession
FROM user_info;

-- Target tables for splitting user_info by gender.
CREATE TABLE IF NOT EXISTS male_users (
    id         INT,
    gender     STRING,
    profession STRING
);

CREATE TABLE IF NOT EXISTS female_users (
    id         INT,
    gender     STRING,
    profession STRING
);

-- Multi-insert: a single FROM clause feeds both INSERT branches.
FROM user_info
INSERT INTO TABLE male_users
    SELECT id, gender, profession
    WHERE gender = 'M'
INSERT INTO TABLE female_users
    SELECT id, gender, profession
    WHERE gender = 'F';

-- All male users whose profession is 'writer'.
SELECT *
FROM male_users
WHERE profession='writer';

-- Head-count per profession, ordered with ORDER BY.
SELECT
    COUNT(*) AS count,
    profession
FROM male_users
GROUP BY profession
ORDER BY count DESC;

-- Same aggregation, ordered with SORT BY instead.
-- NOTE(review): Hive documents SORT BY as per-reducer ordering rather than a
-- total order -- confirm that contrast with ORDER BY is what is being shown.
SELECT
    COUNT(*) AS count,
    profession
FROM male_users
GROUP BY profession
SORT BY count DESC;

-- Same aggregation with no ordering clause.
SELECT
    COUNT(*) AS count,
    profession
FROM male_users
GROUP BY profession;

Dynamic Partitioning

-- Enable dynamic partitioning so the partition value can come from the data.
SET hive.exec.dynamic.partition=true;
-- Fixed: the property name was misspelled "hive.exec.dyanamic.partition.mode",
-- so nonstrict mode was never actually configured.
SET hive.exec.dynamic.partition.mode=nonstrict;

-- Partitioned (by gender) and bucketed (by age, 7 buckets) text table.
-- Fixed: the field delimiter was written as ' |' with a stray leading space;
-- it is now the single character '|', matching user_info.
CREATE TABLE IF NOT EXISTS buck_user_info (
    id         INT,
    age        INT,
    profession STRING,
    ratings    INT
)
PARTITIONED BY (gender STRING)
CLUSTERED BY (age) INTO 7 BUCKETS
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

DESC buck_user_info;

-- Populate buck_user_info from user_info. The partition column (gender) is
-- listed last in the SELECT, and user_info.reviews fills the ratings column.
INSERT INTO TABLE buck_user_info PARTITION (gender)
SELECT
    id,
    age,
    profession,
    reviews,
    gender
FROM user_info;

-- Total ratings per gender for artists younger than 35.
SELECT
    gender,
    SUM(ratings) AS total_ratings
FROM buck_user_info
WHERE age<35
  AND profession='artist'
GROUP BY gender;
create table if not exists fl_info (Entry_id int, year int, month int,

CASE STUDY

wget https://e-commerce-events-ml.s3.amazonaws.com/2019-Oct.csv
wget https://e-commerce-events-ml.s3.amazonaws.com/2019-Nov.csv

hadoop fs -mkdir /user/casestudy


hadoop fs -put 2019-Oct.csv /user/casestudy
hadoop fs -put 2019-Nov.csv /user/casestudy
hadoop fs -ls /user/casestudy

hive

-- Raw event table read through OpenCSVSerde: comma-separated, double-quote
-- quoting, backslash escaping, first line of each file skipped as a header.
-- NOTE(review): OpenCSVSerde is documented as exposing every column as STRING
-- regardless of the declared type -- confirm the types below behave as intended.
CREATE TABLE IF NOT EXISTS Sales (
    event_time    TIMESTAMP,
    event_type    STRING,
    product_id    STRING,
    category_id   STRING,
    category_code STRING,
    brand         STRING,
    price         FLOAT,
    user_id       BIGINT,
    user_session  STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
    "separatorChar" = ",",
    "quoteChar"     = "\"",
    "escapeChar"    = "\\"
)
STORED AS TEXTFILE
TBLPROPERTIES (
    'serialization.null.format' = '',
    'skip.header.line.count'    = '1'
);

-- Print column headers in CLI query output.
-- (The original repeated this identical SET twice on one line; once is enough.)
SET hive.cli.print.header=true;

-- Load both monthly CSV files from /user/casestudy into Sales.
LOAD DATA INPATH '/user/casestudy/2019-Oct.csv' INTO TABLE Sales;
LOAD DATA INPATH '/user/casestudy/2019-Nov.csv' INTO TABLE Sales;

-- Session settings used by the partitioned / bucketed loads.
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.exec.dynamic.partition=true;
SET hive.enforce.bucketing=true;

-- Copy of the Sales columns, partitioned by event_type, with no bucketing.
CREATE TABLE IF NOT EXISTS ecom_no_cls (
    event_time    TIMESTAMP,
    product_id    STRING,
    category_id   STRING,
    category_code STRING,
    brand         STRING,
    price         FLOAT,
    user_id       BIGINT,
    user_session  STRING
)
PARTITIONED BY (event_type STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
    "separatorChar" = ",",
    "quoteChar"     = "\"",
    "escapeChar"    = "\\"
)
STORED AS TEXTFILE;

-- Dynamic-partition load: event_type is selected last so it supplies the
-- partition value.
INSERT INTO TABLE ecom_no_cls PARTITION (event_type)
SELECT
    event_time,
    product_id,
    category_id,
    category_code,
    brand,
    price,
    user_id,
    user_session,
    event_type
FROM Sales;

-- Compact index on ecom_no_cls(event_type).
-- NOTE(review): event_type is the table's partition column -- confirm the
-- target Hive version accepts an index on it. Hive 3.0 removed index support
-- altogether, so this only applies to older releases.
CREATE INDEX opt_index
ON TABLE ecom_no_cls (event_type)
AS 'COMPACT'
WITH DEFERRED REBUILD;

-- Added: WITH DEFERRED REBUILD creates the index empty; it holds no entries
-- until it is rebuilt explicitly.
ALTER INDEX opt_index ON ecom_no_cls REBUILD;

-- Copy of the Sales columns, partitioned by event_type and bucketed on price
-- into 10 buckets.
CREATE TABLE IF NOT EXISTS ecom_bucket (
    event_time    TIMESTAMP,
    product_id    STRING,
    category_id   STRING,
    category_code STRING,
    brand         STRING,
    price         FLOAT,
    user_id       BIGINT,
    user_session  STRING
)
PARTITIONED BY (event_type STRING)
CLUSTERED BY (price) INTO 10 BUCKETS
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
    "separatorChar" = ",",
    "quoteChar"     = "\"",
    "escapeChar"    = "\\"
)
STORED AS TEXTFILE;

-- Dynamic-partition load: event_type is selected last so it supplies the
-- partition value.
INSERT INTO TABLE ecom_bucket PARTITION (event_type)
SELECT
    event_time,
    product_id,
    category_id,
    category_code,
    brand,
    price,
    user_id,
    user_session,
    event_type
FROM Sales;

Queries:

Find the total revenue generated due to purchases made in


October.

-- Sum of price over purchase events whose month is '10', run against each of
-- the three tables. The leading * / ** on some statements are annotations
-- carried over from the original text, not HiveQL.
SELECT SUM(price)
FROM Sales
WHERE event_type='purchase'
  AND date_format(event_time, 'MM')='10';

* SELECT SUM(price)
FROM ecom_bucket
WHERE event_type='purchase'
  AND date_format(event_time, 'MM')='10';

** SELECT SUM(price) AS Total_Revenue_in_Oct
FROM ecom_no_cls
WHERE event_type='purchase'
  AND date_format(event_time, 'MM')='10';

Write a query to yield the total sum of purchases per month


in a single output.

-- Number of purchase events per month, run against each of the three tables.
-- The leading * / ** on some statements are annotations carried over from the
-- original text, not HiveQL.
SELECT
    date_format(event_time, 'MM') AS month,
    COUNT(event_type)             AS total_sales
FROM Sales
WHERE event_type='purchase'
GROUP BY date_format(event_time, 'MM');

** SELECT
    date_format(event_time, 'MM') AS month,
    COUNT(event_type)             AS total_sales
FROM ecom_bucket
WHERE event_type='purchase'
GROUP BY date_format(event_time, 'MM');

* SELECT
    date_format(event_time, 'MM') AS month,
    COUNT(event_type)             AS total_sales
FROM ecom_no_cls
WHERE event_type='purchase'
GROUP BY date_format(event_time, 'MM');

Write a query to find the change in revenue generated due to


purchases from October to November.

-- Purchase revenue for October and November plus the difference
-- (November - October), run against each of the three tables.
-- Changed: date_format() returns a string, so the month is now compared with
-- the string literals '10' / '11' (as the other month queries in this file do)
-- instead of relying on implicit string-to-number coercion. Column aliases
-- use an explicit AS.
-- The leading * / ** on some statements are annotations carried over from the
-- original text, not HiveQL.
SELECT
    October,
    November,
    November - October AS Difference
FROM (
    SELECT
        SUM(CASE WHEN date_format(event_time, 'MM') = '10' THEN price ELSE 0 END) AS October,
        SUM(CASE WHEN date_format(event_time, 'MM') = '11' THEN price ELSE 0 END) AS November
    FROM Sales
    WHERE date_format(event_time, 'MM') IN ('10', '11')
      AND event_type = 'purchase'
) s;

* SELECT
    October,
    November,
    November - October AS Difference
FROM (
    SELECT
        SUM(CASE WHEN date_format(event_time, 'MM') = '10' THEN price ELSE 0 END) AS October,
        SUM(CASE WHEN date_format(event_time, 'MM') = '11' THEN price ELSE 0 END) AS November
    FROM ecom_bucket
    WHERE date_format(event_time, 'MM') IN ('10', '11')
      AND event_type = 'purchase'
) s;

** SELECT
    November - October AS Difference_in_Revenue
FROM (
    SELECT
        SUM(CASE WHEN date_format(event_time, 'MM') = '10' THEN price ELSE 0 END) AS October,
        SUM(CASE WHEN date_format(event_time, 'MM') = '11' THEN price ELSE 0 END) AS November
    FROM ecom_no_cls
    WHERE date_format(event_time, 'MM') IN ('10', '11')
      AND event_type = 'purchase'
) s;
Find distinct categories of products. Categories with null
category code can be ignored.

-- Categories with a non-empty category_code. The first two variants list each
-- category with its row count; the third lists the distinct codes only.
-- Fixed: in the third query the "!=" operator was split across lines
-- ("category_code!" ... "=''"), which does not parse.
-- The leading * / ** on some statements are annotations carried over from the
-- original text, not HiveQL.
** SELECT
    category_code,
    COUNT(product_id) AS total_products
FROM Sales
WHERE category_code != ''
GROUP BY category_code
ORDER BY total_products DESC;

* SELECT
    category_code,
    COUNT(product_id) AS total_products
FROM ecom_bucket
WHERE category_code != ''
GROUP BY category_code
ORDER BY total_products DESC;

SELECT DISTINCT
    category_code AS categories
FROM ecom_no_cls
WHERE category_code != ''
ORDER BY categories DESC;

Find the total number of products available under


each category.

-- Number of products under each non-empty category, largest first, run
-- against each of the three tables.
-- Fixed: these tables hold one row per event, so COUNT(product_id) counted
-- event rows; COUNT(DISTINCT product_id) counts each product once.
-- The leading * / ** on some statements are annotations carried over from the
-- original text, not HiveQL.
** SELECT
    category_code,
    COUNT(DISTINCT product_id) AS Total_Products
FROM Sales
WHERE category_code != ''
GROUP BY category_code
ORDER BY Total_Products DESC;

* SELECT
    category_code,
    COUNT(DISTINCT product_id) AS Total_Products
FROM ecom_bucket
WHERE category_code != ''
GROUP BY category_code
ORDER BY Total_Products DESC;

SELECT
    category_code,
    COUNT(DISTINCT product_id) AS Total_Products
FROM ecom_no_cls
WHERE category_code != ''
GROUP BY category_code
ORDER BY Total_Products DESC;

Which brand had the maximum sales in October and


November combined?

-- Brand with the highest total sales across the loaded data (October and
-- November), run against each of the three tables.
-- Fixed: (1) the ecom_no_cls variant had LIMIT 1 with no ORDER BY, so it
-- returned an arbitrary brand rather than the top one; (2) all variants now
-- restrict to event_type = 'purchase', as the brand sales-increase query in
-- this file does, so rows for other event types are not summed as sales.
-- The leading * / ** on some statements are annotations carried over from the
-- original text, not HiveQL.
** SELECT
    brand,
    SUM(price) AS Total_Sales
FROM Sales
WHERE brand != ''
  AND event_type = 'purchase'
GROUP BY brand
ORDER BY Total_Sales DESC
LIMIT 1;

* SELECT
    brand,
    SUM(price) AS Total_Sales
FROM ecom_bucket
WHERE brand != ''
  AND event_type = 'purchase'
GROUP BY brand
ORDER BY Total_Sales DESC
LIMIT 1;

SELECT
    brand,
    SUM(price) AS Total_Sales
FROM ecom_no_cls
WHERE brand != ''
  AND event_type = 'purchase'
GROUP BY brand
ORDER BY Total_Sales DESC
LIMIT 1;

Which brands increased their sales from October to


November?

-- Brands whose November purchase revenue exceeds their October purchase
-- revenue, with the increase, largest first; run against each of the three
-- tables.
-- Changed: the month string from date_format() is compared with '10' / '11'
-- instead of integer literals (no implicit coercion); the outer SELECT names
-- its columns instead of using *; the empty-string literal uses single quotes
-- like the rest of the file.
-- The leading * / ** on some statements are annotations carried over from the
-- original text, not HiveQL.
WITH sale AS (
    -- Per-brand purchase revenue split into October and November columns.
    SELECT
        brand,
        SUM(CASE WHEN date_format(event_time, 'MM') = '10' THEN price ELSE 0 END) AS oct_s,
        SUM(CASE WHEN date_format(event_time, 'MM') = '11' THEN price ELSE 0 END) AS nov_s
    FROM Sales
    WHERE event_type = 'purchase'
      AND brand != ''
    GROUP BY brand
)
SELECT
    brand,
    oct_s,
    nov_s,
    nov_s - oct_s AS sales_increase
FROM sale
WHERE nov_s > oct_s
ORDER BY sales_increase DESC;

* WITH sale AS (
    -- Per-brand purchase revenue split into October and November columns.
    SELECT
        brand,
        SUM(CASE WHEN date_format(event_time, 'MM') = '10' THEN price ELSE 0 END) AS oct_s,
        SUM(CASE WHEN date_format(event_time, 'MM') = '11' THEN price ELSE 0 END) AS nov_s
    FROM ecom_bucket
    WHERE event_type = 'purchase'
      AND brand != ''
    GROUP BY brand
)
SELECT
    brand,
    oct_s,
    nov_s,
    nov_s - oct_s AS sales_increase
FROM sale
WHERE nov_s > oct_s
ORDER BY sales_increase DESC;

** WITH sale AS (
    -- Per-brand purchase revenue split into October and November columns.
    SELECT
        brand,
        SUM(CASE WHEN date_format(event_time, 'MM') = '10' THEN price ELSE 0 END) AS oct_s,
        SUM(CASE WHEN date_format(event_time, 'MM') = '11' THEN price ELSE 0 END) AS nov_s
    FROM ecom_no_cls
    WHERE event_type = 'purchase'
      AND brand != ''
    GROUP BY brand
)
SELECT
    brand,
    oct_s,
    nov_s,
    nov_s - oct_s AS sales_increase
FROM sale
WHERE nov_s > oct_s
ORDER BY sales_increase DESC;

Your company wants to reward the top 10 users of its website with a
Golden Customer plan. Write a query to generate a list of top 10 users who spend
the most.

-- Top 10 users by total purchase spend, with a dense rank on that total; run
-- against each of the three tables.
-- Fixed: the original applied LIMIT 10 with no outer ORDER BY, so the ten
-- rows returned were not guaranteed to be the highest spenders. The result
-- is now explicitly ordered by total spend before limiting.
-- The leading * / ** on some statements are annotations carried over from the
-- original text, not HiveQL.
SELECT
    user_id,
    SUM(price)                                   AS TPrice,
    DENSE_RANK() OVER (ORDER BY SUM(price) DESC) AS Rank
FROM Sales
WHERE event_type = 'purchase'
GROUP BY user_id
ORDER BY TPrice DESC
LIMIT 10;

* SELECT
    user_id,
    SUM(price)                                   AS TPrice,
    DENSE_RANK() OVER (ORDER BY SUM(price) DESC) AS Rank
FROM ecom_bucket
WHERE event_type = 'purchase'
GROUP BY user_id
ORDER BY TPrice DESC
LIMIT 10;

** SELECT
    user_id,
    SUM(price)                                   AS TPrice,
    DENSE_RANK() OVER (ORDER BY SUM(price) DESC) AS Rank
FROM ecom_no_cls
WHERE event_type = 'purchase'
GROUP BY user_id
ORDER BY TPrice DESC
LIMIT 10;

You might also like