Hive Practical 2
Sample CSV input (id, name, colon-delimited skills):
101,Amit,HADOOP:HIVE:SPARK:BIG-DATA
102,Sumit,HIVE:OOZIE:HADOOP:SPARK:STORM
103,Rohit,KAFKA:CASSANDRA:HBASE
USE itunes_fuse_semantic_app;
------------------------------------------------------------------------------------------------------------------
Working with Array operators
------------------------------------------------------------------------------------------------------------------
-- Demonstrates Hive's built-in array operators on employee.skills
-- (an ARRAY&lt;STRING&gt; column loaded from the colon-delimited CSV field).
-- Each expression is aliased; otherwise Hive emits anonymous _c0.._c3 columns.
SELECT
    size(skills)                     AS skill_count,   -- number of array elements
    array_contains(skills, 'HADOOP') AS knows_hadoop,  -- membership test -> boolean
    sort_array(skills)               AS skills_sorted, -- ascending natural order
    concat_ws('|', skills)           AS skills_joined  -- flatten array to 'A|B|C'
FROM employee;
------------------------------------------------------------------------------------------------------------------
Exploding contents of an array
------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------
Expanding contents of an array with other columns
------------------------------------------------------------------------------------------------------------------
Here skill_set is the alias of the table generated by the lateral view; it contains a single column aliased as skill.
==========================================
The two queries look almost identical, but if more than one reducer is invoked, the output will be
sorted differently.
-- Force two reducers so the effect of SORT BY is visible: SORT BY orders
-- rows only WITHIN each reducer, so the final output is not globally sorted.
-- (Keyword uppercased for consistency with the matching statement below.)
SET mapred.reduce.tasks=2;
SELECT * FROM users SORT BY name ASC;
========================================
SET mapred.reduce.tasks=2;
-- DISTRIBUTE BY sends every row with the same unit to the same reducer;
-- SORT BY then orders names within each reducer's partition, so each
-- unit's rows come out grouped together and name-sorted.
SELECT *
FROM users
DISTRIBUTE BY unit
SORT BY name ASC;
==================================================
-bash-4.1$ vi users.txt
1 Amit 100 DNA
2 Sumit 200 DNA
3 Yadav 300 DNA
4 Sunil 500 FCS
5 Kranti 100 FCS
6 Mahoor 200 FCS
8 Chandra 500 DNA
-bash-4.1$ vi locations.txt
1 UP
2 BIHAR
3 MP
4 AP
5 MAHARASHTRA
6 GOA
7 JHARKHAND
-- Work in the default database for the bucketing/join exercises.
USE default;
-- Make INSERTs honor a bucketed table's CLUSTERED BY clause so the declared
-- number of bucket files is actually produced (not enforced by default in
-- older Hive versions).
SET hive.enforce.bucketing=true;
===============================================================
------------------------------------------------------------------------------------------------------------------------------
Inner Join
------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------
Left Outer Join
------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------
Right Outer Join
------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------
Full Outer Join
------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------
Cartesian Cross Product Join (Less Used)
------------------------------------------------------------------------------------------------------------------------------
============================================================
############# Use AcadGild VM ########################################
----------------------------------------------------------------------
CREATING emp_details TABLE
----------------------------------------------------------------------
-- Source table for the partitioning demo, loaded from a comma-delimited file.
-- IF NOT EXISTS makes the script safe to re-run (original failed on re-run).
CREATE TABLE IF NOT EXISTS emp_details
(
    emp_name string,  -- employee name
    unit     string,  -- business unit
    exp      int,     -- experience (presumably in years -- not stated here)
    location string   -- work location; used as the partition key later
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
----------------------------------------------------------------------
LOADING emp_details TABLE
----------------------------------------------------------------------
----------------------------------------------------------------------
CREATING emp_details_partitioned TABLE
----------------------------------------------------------------------
----------------------------------------------------------------------
LOADING emp_details_partitioned TABLE with Static Partitions
----------------------------------------------------------------------
-- Static partition load: the partition value ('BBSR') is hard-coded in the
-- PARTITION clause, so the SELECT must filter to matching rows itself and
-- must NOT project the partition column.
INSERT OVERWRITE TABLE emp_details_partitioned
PARTITION (location = 'BBSR')
SELECT
    emp_name,
    unit,
    exp
FROM emp_details
WHERE location = 'BBSR';
----------------------------------------------------------------------
LOADING emp_details_partitioned TABLE with Dynamic Partitions
----------------------------------------------------------------------
set hive.exec.dynamic.partition.mode=nonstrict;
----------------------------------------------------------------------
DROPPING PARTITIONS FROM emp_details_partitioned TABLE
----------------------------------------------------------------------
===============================================
-- Bucketed sampling keyed on rand(): each execution hashes fresh random
-- values, so the two identical statements return different row samples.
-- NOTE(review): presumably duplicated on purpose to demonstrate exactly
-- that non-determinism -- confirm intent.
SELECT * from users TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) s;
SELECT * from users TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) s;
==================================================================
---------------------------
Creating regular text table
---------------------------
-- Baseline plain-text table that feeds the storage-format comparison
-- (SequenceFile / RC / Parquet / ORC) below.
-- IF NOT EXISTS makes the script safe to re-run (original failed on re-run).
CREATE TABLE IF NOT EXISTS text_table
(
    c1 int,
    c2 int,
    c3 int,
    c4 int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|';
---------------------------
Loading into text table
---------------------------
---------------------------
Creating SequenceFile table
---------------------------
---------------------------
Creating RC Format table
---------------------------
---------------------------
Creating Parquet File table
---------------------------
---------------------------
Creating ORC Format table
---------------------------
----------------------------------------
Loading All the tables in a single pass
----------------------------------------
-- Multi-table insert: text_table is scanned ONCE and its rows are written
-- into all four storage-format tables in the same job (a single pass over
-- the source data instead of four separate INSERT ... SELECT jobs).
FROM text_table
INSERT OVERWRITE TABLE seq_table SELECT *
INSERT OVERWRITE TABLE rc_table SELECT *
INSERT OVERWRITE TABLE prq_table SELECT *
INSERT OVERWRITE TABLE orc_table SELECT *;
----------------------------------------
Comparing sizes of loaded tables
----------------------------------------
----------------------------------------
Enabling Compression
----------------------------------------
Enabling compression provides performance gains in most cases and is supported for RCFile
and SequenceFile tables.
For example, to enable Snappy compression, you would specify the following additional settings
when loading data through the Hive shell.
-- Compress the final job output.
SET hive.exec.compress.output=true;
-- Cap input split size (~256 MB).
SET mapred.max.split.size=256000000;
SET mapred.output.compression.type=BLOCK; -- block compression for sequence file
-- Use Snappy as the compression codec.
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
-- Same single-pass multi-table insert as before, now writing the rows as
-- Snappy-compressed output so the on-disk sizes can be compared again.
FROM text_table
INSERT OVERWRITE TABLE seq_table SELECT *
INSERT OVERWRITE TABLE rc_table SELECT *
INSERT OVERWRITE TABLE prq_table SELECT *
INSERT OVERWRITE TABLE orc_table SELECT *;
-------------------------------------------------------------------------------------------------------
Comparing sizes of loaded tables after compression (RC Files and Sequence Files
benefit the most)
-------------------------------------------------------------------------------------------------------
===================================================================