Hive Practical 2

The document demonstrates operations on array columns in Hive, such as size, array_contains, sort_array, concat_ws, and explode, using an employee table with a skills array column. It also covers joining bucketed tables and the join types Hive supports (inner, outer, and cartesian), partitioning, table sampling, and file format comparisons.


vi employee.csv
101,Amit,HADOOP:HIVE:SPARK:BIG-DATA
102,Sumit,HIVE:OOZIE:HADOOP:SPARK:STORM
103,Rohit,KAFKA:CASSANDRA:HBASE

USE itunes_fuse_semantic_app;

CREATE TABLE employee
(
id INT,
name STRING,
skills ARRAY<STRING>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY ':';

LOAD DATA LOCAL INPATH 'employee.csv'
INTO TABLE employee;
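
As a quick sanity check (a sketch, not part of the original listing), indexing into the array confirms that the ':' collection delimiter was applied:

SELECT id, name, skills[0], skills FROM employee;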

------------------------------------------------------------------------------------------------------------------
Working with Array operators
------------------------------------------------------------------------------------------------------------------

SELECT
size(skills),
array_contains(skills, 'HADOOP'),
sort_array(skills),
concat_ws("|", skills)
FROM employee;

4 true ["BIG-DATA","HADOOP","HIVE","SPARK"] HADOOP|HIVE|SPARK|BIG-DATA
5 true ["HADOOP","HIVE","OOZIE","SPARK","STORM"] HIVE|OOZIE|HADOOP|SPARK|STORM
3 false ["CASSANDRA","HBASE","KAFKA"] KAFKA|CASSANDRA|HBASE

------------------------------------------------------------------------------------------------------------------
Exploding contents of an array
------------------------------------------------------------------------------------------------------------------

SELECT explode(skills) AS skills FROM employee;
--AS clause is required because explode() is a UDTF, i.e. it generates its output as a table.
HADOOP
HIVE
SPARK
BIG-DATA
HIVE
OOZIE
HADOOP
SPARK
STORM
KAFKA
CASSANDRA
HBASE
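
Note that a UDTF cannot be combined with other expressions in the same SELECT list; a query like the hypothetical one below fails with a SemanticException, which is what motivates LATERAL VIEW in the next section.

SELECT id, explode(skills) FROM employee; --fails: UDTFs cannot be mixed with other select expressions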

------------------------------------------------------------------------------------------------------------------
Expanding contents of an array with other columns
------------------------------------------------------------------------------------------------------------------

SELECT id, name, skill
FROM employee LATERAL VIEW explode(skills) skill_set AS skill;

101 Amit HADOOP


101 Amit HIVE
101 Amit SPARK
101 Amit BIG-DATA
102 Sumit HIVE
102 Sumit OOZIE
102 Sumit HADOOP
102 Sumit SPARK
102 Sumit STORM
103 Rohit KAFKA
103 Rohit CASSANDRA
103 Rohit HBASE

Here skill_set is the alias of the virtual table produced by the lateral view; it contains a single column aliased as skill.
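
Because the lateral view behaves like a joined table, it can feed ordinary SQL. For example (a sketch, not in the original listing), counting skills per employee:

SELECT id, name, count(skill) AS num_skills
FROM employee LATERAL VIEW explode(skills) skill_set AS skill
GROUP BY id, name;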

==========================================

SET hive.mapred.mode=nonstrict; --default is nonstrict


SELECT * FROM users ORDER BY name ASC;
SELECT * FROM users SORT BY name ASC;

The two queries look almost identical, but they behave differently: ORDER BY forces a single
reducer and produces totally ordered output, while SORT BY only sorts within each reducer. If
more than one reducer is invoked, the SORT BY output is only partially sorted.

set mapred.reduce.tasks=2;
SELECT * FROM users SORT BY name ASC;

========================================

SET mapred.reduce.tasks=2;
SELECT * FROM users DISTRIBUTE BY unit SORT BY name ASC;

SELECT * FROM users CLUSTER BY unit;
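
CLUSTER BY is shorthand for distributing and sorting on the same column; the query above is equivalent to:

SELECT * FROM users DISTRIBUTE BY unit SORT BY unit;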

==================================================

-bash-4.1$ vi users.txt
1 Amit 100 DNA
2 Sumit 200 DNA
3 Yadav 300 DNA
4 Sunil 500 FCS
5 Kranti 100 FCS
6 Mahoor 200 FCS
8 Chandra 500 DNA

-bash-4.1$ vi locations.txt
1 UP
2 BIHAR
3 MP
4 AP
5 MAHARASHTRA
6 GOA
7 JHARKHAND

USE default;

CREATE TABLE users
(
id INT,
name STRING,
salary INT,
unit STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';

CREATE TABLE locations
(
id INT,
location STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';

LOAD DATA LOCAL INPATH '/root/hive/users.txt'
INTO TABLE users;

LOAD DATA LOCAL INPATH '/root/hive/locations.txt'
INTO TABLE locations;

CREATE TABLE buck_users
(
id INT,
name STRING,
salary INT,
unit STRING
)
CLUSTERED BY (id)
SORTED BY (id)
INTO 2 BUCKETS;

CREATE TABLE buck_locations
(
id INT,
location STRING
)
CLUSTERED BY (id)
SORTED BY (id)
INTO 2 BUCKETS;

SET hive.enforce.bucketing=true; --without this, inserts ignore the declared bucket count

INSERT OVERWRITE TABLE buck_users
SELECT * FROM users;

INSERT OVERWRITE TABLE buck_locations
SELECT * FROM locations;
--View the number of files created at the table location.
--It should be two.
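
For example (assuming the default warehouse location; adjust the path for your cluster):

dfs -ls /user/hive/warehouse/buck_users;
--expect two bucket files, e.g. 000000_0 and 000001_0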

===============================================================

------------------------------------------------------------------------------------------------------------------------------
Inner Join
------------------------------------------------------------------------------------------------------------------------------

SELECT * FROM buck_users u INNER JOIN buck_locations l
ON u.id = l.id;

------------------------------------------------------------------------------------------------------------------------------
Left Outer Join
------------------------------------------------------------------------------------------------------------------------------

SELECT * FROM buck_users u LEFT OUTER JOIN buck_locations l
ON u.id = l.id;

------------------------------------------------------------------------------------------------------------------------------
Right Outer Join
------------------------------------------------------------------------------------------------------------------------------

SELECT * FROM buck_users u RIGHT OUTER JOIN buck_locations l
ON u.id = l.id;

------------------------------------------------------------------------------------------------------------------------------
Full Outer Join
------------------------------------------------------------------------------------------------------------------------------

SELECT * FROM buck_users u FULL OUTER JOIN buck_locations l
ON u.id = l.id;

------------------------------------------------------------------------------------------------------------------------------
Cartesian Cross Product Join (Less Used)
------------------------------------------------------------------------------------------------------------------------------

SELECT * FROM buck_users u JOIN buck_locations l;
--no ON clause: every user row is paired with every location row
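
Because both tables are bucketed and sorted on the join key id, the equi-joins above can use a bucket map join. A sketch (hive.optimize.bucketmapjoin is a standard setting, but whether the optimization fires depends on table metadata and Hive version):

SET hive.optimize.bucketmapjoin=true;
SELECT * FROM buck_users u INNER JOIN buck_locations l
ON u.id = l.id;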

============================================================
############# Use AcadGild VM ########################################

----------------------------------------------------------------------
CREATING emp_details TABLE
----------------------------------------------------------------------
create table emp_details
(
emp_name string,
unit string,
exp int,
location string
)
row format delimited
fields terminated by ',';

----------------------------------------------------------------------
LOADING emp_details TABLE
----------------------------------------------------------------------

load data local inpath '/home/acadgild/hive/emp_details.txt'
into table emp_details;

describe formatted emp_details;

dfs -ls hdfs://localhost:9000/user/hive/warehouse/emp_details;

----------------------------------------------------------------------
CREATING emp_details_partitioned TABLE
----------------------------------------------------------------------

create table emp_details_partitioned
(
emp_name string,
unit string,
exp int
)
partitioned by (location string);
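
The partition column location is not stored inside the data files; it becomes a directory-level pseudo-column that still appears in query results and in the table description:

describe emp_details_partitioned;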

----------------------------------------------------------------------
LOADING emp_details_partitioned TABLE with Static Partitions
----------------------------------------------------------------------
insert overwrite table emp_details_partitioned
partition(location = 'BBSR')
select emp_name, unit, exp from emp_details
where location = 'BBSR';

----------------------------------------------------------------------
LOADING emp_details_partitioned TABLE with Dynamic Partitions
----------------------------------------------------------------------
set hive.exec.dynamic.partition.mode=nonstrict; --on older Hive versions, hive.exec.dynamic.partition=true may also be required

insert overwrite table emp_details_partitioned
partition (location)
select * from emp_details;
--works because location is the last column of emp_details; dynamic partition columns must come last in the SELECT
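
To verify which partitions were created:

show partitions emp_details_partitioned;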

select count(*) from emp_details where location='BBSR';

select count(*) from emp_details where emp_name='Aditya';

----------------------------------------------------------------------
DROPPING PARTITIONS FROM emp_details_partitioned TABLE
----------------------------------------------------------------------

alter table emp_details_partitioned drop partition(location='BBSR');

===============================================
SELECT * from users TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) s;
SELECT * from users TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) s;
--sampling on rand() returns different rows each run

SELECT * from users TABLESAMPLE(BUCKET 2 OUT OF 4 ON name) s;
--sampling on a column is deterministic: the same rows come back every run

SELECT * FROM buck_users TABLESAMPLE(BUCKET 1 OUT OF 2 ON id) s LIMIT 1;
--sampling on the bucketing column of a bucketed table lets Hive read only the matching bucket file
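
Hive also supports block sampling by input size (available since Hive 0.11). A sketch; note that on tiny tables block sampling may return all rows or none, since it samples whole blocks:

SELECT * FROM text_table TABLESAMPLE(0.1 PERCENT) s;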

==================================================================
---------------------------
Creating regular text table
---------------------------
create table text_table
(
c1 int,
c2 int,
c3 int,
c4 int
)
row format delimited
fields terminated by '|';

---------------------------
Loading into text table
---------------------------

load data local inpath '/root/hive/datasets_for_fileformats/ratings.dat'
into table text_table;

---------------------------
Creating SequenceFile table
---------------------------

create table seq_table
(
c1 int,
c2 int,
c3 int,
c4 int
)
stored as SEQUENCEFILE;

---------------------------
Creating RC Format table
---------------------------

create table rc_table
(
c1 int,
c2 int,
c3 int,
c4 int
)
stored as RCFILE;

---------------------------
Creating Parquet File table
---------------------------

create table prq_table
(
c1 int,
c2 int,
c3 int,
c4 int
)
stored as PARQUET;

---------------------------
Creating ORC Format table
---------------------------

create table orc_table
(
c1 int,
c2 int,
c3 int,
c4 int
)
stored as ORC;

----------------------------------------
Loading All the tables in a single pass
----------------------------------------

FROM text_table
INSERT OVERWRITE TABLE seq_table SELECT *
INSERT OVERWRITE TABLE rc_table SELECT *
INSERT OVERWRITE TABLE prq_table SELECT *
INSERT OVERWRITE TABLE orc_table SELECT *;

----------------------------------------
Comparing sizes of loaded tables
----------------------------------------

describe formatted orc_table;

dfs -ls hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/text_table;
-rw-r--r-- 1 root hdfs 21593504 2016-08-25 11:12 hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/text_table/ratings.dat

dfs -ls hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/seq_table;
-rw-r--r-- 1 root hdfs 33928859 2016-08-25 11:13 hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/seq_table/000000_0

dfs -ls hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/rc_table;
-rw-r--r-- 1 root hdfs 11992620 2016-08-25 11:13 hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/rc_table/000000_0

dfs -ls hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/prq_table;
-rw-r--r-- 1 root hdfs 5941753 2016-08-25 11:13 hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/prq_table/000000_0

dfs -ls hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/orc_table;
-rw-r--r-- 1 root hdfs 4135847 2016-08-25 11:13 hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/orc_table/000000_0

----------------------------------------
Enabling Compression
----------------------------------------

Enabling compression provides performance gains in most cases and is supported for RCFile
and SequenceFile tables. For example, to enable Snappy compression, specify the following
additional settings before loading data through the Hive shell.

SET hive.exec.compress.output=true;
SET mapred.max.split.size=256000000;
SET mapred.output.compression.type=BLOCK; -- block compression for sequence file
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;

FROM text_table
INSERT OVERWRITE TABLE seq_table SELECT *
INSERT OVERWRITE TABLE rc_table SELECT *
INSERT OVERWRITE TABLE prq_table SELECT *
INSERT OVERWRITE TABLE orc_table SELECT *;

-------------------------------------------------------------------------------------------------------
Comparing sizes of loaded tables after compression (RC and Sequence files benefit the most)
-------------------------------------------------------------------------------------------------------

dfs -ls hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/text_table;
-rw-r--r-- 1 root hdfs 21593504 2016-08-25 11:12 hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/text_table/ratings.dat

dfs -ls hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/orc_table;
-rw-r--r-- 1 root hdfs 4135847 2016-08-25 11:22 hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/orc_table/000000_0

dfs -ls hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/seq_table;
-rw-r--r-- 1 root hdfs 10910048 2016-08-25 11:22 hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/seq_table/000000_0

dfs -ls hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/rc_table;
-rw-r--r-- 1 root hdfs 6352282 2016-08-25 11:22 hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/rc_table/000000_0

dfs -ls hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/prq_table;
-rw-r--r-- 1 root hdfs 5941753 2016-08-25 11:22 hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/prq_table/000000_0
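
The ORC and Parquet sizes are unchanged because both formats apply their own internal compression, independent of the MapReduce output settings above. To change their codec you set table properties instead; a sketch using the standard orc.compress property (ORC defaults to ZLIB):

create table orc_snappy_table
(
c1 int,
c2 int,
c3 int,
c4 int
)
stored as ORC tblproperties ('orc.compress'='SNAPPY');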

===================================================================
