0% found this document useful (0 votes)
72 views1 page

Hive-Hands On - Bucketing Table

The document creates a Hive database called hive_bucket and a staging table called temp to load transaction data from a CSV file. It then creates a partitioned and clustered table called transaction_bucket to bucket the data by customer ID. Data is inserted from temp into transaction_bucket. An additional table, bucket1, is then created as a bucketed ORC table with transactional support enabled, allowing rows to be inserted and updated.

Uploaded by

Story Telling
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
72 views1 page

Hive-Hands On - Bucketing Table

The document creates a Hive database called hive_bucket and a staging table called temp to load transaction data from a CSV file. It then creates a partitioned and clustered table called transaction_bucket to bucket the data by customer ID. Data is inserted from temp into transaction_bucket. An additional table, bucket1, is then created as a bucketed ORC table with transactional support enabled, allowing rows to be inserted and updated.

Uploaded by

Story Telling
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 1

-- Create a dedicated database for the bucketing exercise and switch to it.
-- IF NOT EXISTS makes the script safe to re-run (the original failed on a
-- second run because the database already existed).
CREATE DATABASE IF NOT EXISTS hive_bucket;

USE hive_bucket;

-- Staging table for raw transaction rows read from a comma-delimited file.
-- "skip.header.line.count" = "1" drops the CSV header line on read.
CREATE TABLE temp (
    transaction_id   STRING,
    cust_id          INT,
    tran_date        STRING,
    prod_subcat_code INT,
    prod_cat_code    INT,
    Qty              INT,
    Rate             INT,
    Tax              DOUBLE,
    total_amt        DOUBLE,
    Store_type       STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
TBLPROPERTIES ("skip.header.line.count" = "1");

-- Load the source CSV from the local filesystem into the staging table.
LOAD DATA LOCAL INPATH '/projects/challenge/Transactions.csv' INTO TABLE temp;

-- Show column headers in CLI query output.
set hive.cli.print.header=true;
-- Enable dynamic partition inserts (partition values taken from the data).
set hive.exec.dynamic.partition=true;
-- nonstrict: allow inserts where ALL partition columns are dynamic.
set hive.exec.dynamic.partition.mode=nonstrict;
-- NOTE(review): relaxes strict-mode query restrictions; the documented key
-- is hive.mapred.mode (strict/nonstrict) -- confirm this is the intended one.
set hive.mapred.mode=nonstrict;

-- Bucketed, partitioned copy of the staging data:
-- one partition per Store_type, rows hashed into 3 buckets on cust_id.
CREATE TABLE transaction_bucket (
    transaction_id STRING,
    cust_id        INT,
    tran_date      STRING,
    Qty            INT,
    Rate           INT,
    Tax            DOUBLE,
    total_amt      DOUBLE
)
PARTITIONED BY (Store_type STRING)
CLUSTERED BY (cust_id) INTO 3 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
TBLPROPERTIES ("skip.header.line.count" = "1");

-- Populate the bucketed table from staging via dynamic partitioning;
-- the partition column (Store_type) must be selected LAST.
-- Only rows with a positive quantity are kept.
INSERT OVERWRITE TABLE transaction_bucket PARTITION (Store_type)
SELECT
    transaction_id,
    cust_id,
    tran_date,
    Qty,
    Rate,
    Tax,
    total_amt,
    Store_type
FROM temp
WHERE Qty > 0;

-- ACID-capable bucketed table (Hive transactional tables require ORC
-- storage plus bucketing).
-- Fixes vs. the original DDL:
--   * dropped ROW FORMAT DELIMITED ... -- field/line delimiters are ignored
--     for ORC, which is a self-describing binary format;
--   * dropped "skip.header.line.count" -- only meaningful for text files;
--   * "orc.compress" normalized to the documented value "ZLIB".
CREATE TABLE bucket1 (
    transaction_id STRING,
    cust_id        INT,
    tran_date      STRING,
    Qty            INT,
    Rate           INT,
    Tax            DOUBLE,
    total_amt      DOUBLE,
    Store_type     STRING
)
CLUSTERED BY (cust_id) INTO 3 BUCKETS
STORED AS ORC
TBLPROPERTIES ("orc.compress" = "ZLIB");

-- Turn on ACID support (INSERT/UPDATE/DELETE) for the table.
ALTER TABLE bucket1 SET TBLPROPERTIES ('transactional' = 'true');

-- Session settings required for Hive ACID (transactional) tables.
-- FIX: the transaction manager class is DbTxnManager -- the original value
-- "DBTxnManager" names a class that does not exist and fails at runtime.
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
-- Lock-based concurrency is required by DbTxnManager.
set hive.support.concurrency=true;
-- Allow fully dynamic partition inserts (repeated from above; harmless).
set hive.exec.dynamic.partition.mode=nonstrict;
-- Run the compaction initiator with one worker thread on this instance so
-- delta files produced by ACID writes get compacted.
set hive.compactor.initiator.on=true;
set hive.compactor.worker.threads=1;

-- Insert return transactions (negative quantities and amounts) into the
-- ACID table, one tuple per line for readability.
INSERT INTO TABLE bucket1 VALUES
    ("80712190438", 270351, "28-02-2014", -5, -772, 405.3, -4265.3, "e-Shop"),
    ("29258453508", 270384, "27-02-2014", -5, -1497, 785.925, -8270.925, "e-Shop"),
    ("93274880719", 271509, "24-02-2014", -3, -1363, 429.345, -4518.345, "e-Shop"),
    ("97439039119", 272357, "23-02-2014", -2, -824, 173.04, -1821.04, "TeleShop"),
    ("45649838090", 273667, "22-04-2014", -1, -1450, 152.25, -1602.25, "e-Shop"),
    ("22643667930", 271489, "22-02-2014", -1, -1225, 128.625, -1353.625, "TeleShop"),
    ("79792372943", 275108, "22-02-2014", -3, -908, 286.02, -3010.02, "MBR"),
    ("50076728598", 269014, "21-02-2014", -4, -581, 244, -2568.02, "e-Shop");

-- ACID UPDATE (requires the transactional settings above): rewrite the
-- transaction date for every e-Shop row.
update bucket1 set tran_date = "25-02-2014" where Store_type = "e-Shop";

You might also like