Hive Practice - New
===================

To run Hive in debug mode:
--------------------------
hive -hiveconf hive.root.logger=DEBUG,console

A. Create Database
------------------
create database retail_hive;

B. Select Database
------------------
use retail_hive;

C. Create table for storing transactional records
-------------------------------------------------
create table retail_hive.sales_data_tbl (TransID INT, TransDate STRING, Product STRING, Price DOUBLE, PaymentType STRING, CustName STRING, City STRING, State STRING, Country STRING)
row format delimited FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'
LOCATION '/apps/hive/warehouse/retail_hive.db/sales_data_tbl';

In the Hive metastore, this DDL is recorded across several tables:

DBS:
Database Name : retail_hive

TBLS:
Table Name : sales_data_tbl
Table Type : MANAGED_TABLE (the default when none is specified)

COLUMNS_V2:
Column Names : TransID, TransDate, etc.
Column Data Types : INT, STRING, etc.

SDS:
Field Delimiter : ,
Line Delimiter : \n
Table Data Location : /apps/hive/warehouse/retail_hive.db/sales_data_tbl
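
To see these rows directly, you can query the metastore database itself. A minimal sketch, assuming a MySQL-backed metastore whose database is named hive (the name varies by install); the DBS, TBLS, and SDS tables and their columns follow the standard metastore schema:

use hive;  -- or whatever your metastore database is named

SELECT d.NAME     AS db_name,
       t.TBL_NAME AS table_name,
       t.TBL_TYPE AS table_type,
       s.LOCATION AS data_location
FROM   DBS  d
JOIN   TBLS t ON t.DB_ID = d.DB_ID
JOIN   SDS  s ON s.SD_ID = t.SD_ID
WHERE  d.NAME = 'retail_hive';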

create EXTERNAL table sales_data_tbl1 (TransID INT, TransDate STRING, Product STRING, Price DOUBLE, PaymentType STRING, CustName STRING, City STRING, State STRING, Country STRING)
row format delimited FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'
LOCATION '/project1/data/pig/pigoutput/sales_data';

Keep a copy of the Pig output first, since LOAD DATA INPATH moves (not copies) the files into the table's directory:

hdfs dfs -cp /project1/data/pig/pigoutput/sales_data/part* /project1/data/pig/;

LOAD DATA INPATH '/project1/data/pig/pigoutput/sales_data/part*' INTO TABLE sales_data_tbl;

D. Describing metadata or schema of the table
---------------------------------------------
describe sales_data_tbl;

The ALTER below marks the table as external, so a later DROP TABLE leaves the data files in place:

alter table sales_data_tbl SET TBLPROPERTIES('EXTERNAL'='TRUE');

E. Load the data into the table
-------------------------------
(See the LOAD DATA INPATH statement in section C above.)

F. Counting the number of records
---------------------------------
select count(*) from sales_data_tbl;

G. Counting total spending by year, month, country, and product
---------------------------------------------------------------
select year(transdate),sum(Price) from sales_data_tbl group by year(transdate);
select month(transdate),sum(Price) from sales_data_tbl group by month(transdate);
select year(transdate),month(transdate),sum(Price) from sales_data_tbl group by
year(transdate),month(transdate);
select country, sum(price) from sales_data_tbl group by country;
select product, sum(price) from sales_data_tbl group by product;

H. Summary Tables
-----------------

sales_summary_by_city:
----------------------
drop table sales_summary_by_city;

create table sales_summary_by_city (TransDate STRING, Amount DOUBLE, City STRING, State STRING, Country STRING)
row format delimited FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';

insert into sales_summary_by_city
select to_date(transdate), sum(price), city, state, country
from sales_data_tbl
group by to_date(transdate), city, state, country;

select * from sales_summary_by_city;

sales_summary_by_state:
-----------------------
drop table sales_summary_by_state;

create table sales_summary_by_state (TransDate STRING, Amount DOUBLE, State STRING, Country STRING)
row format delimited FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';

insert into sales_summary_by_state
select to_date(transdate), sum(price), state, country
from sales_data_tbl
group by to_date(transdate), state, country;

select * from sales_summary_by_state;

sales_summary_by_country:
-------------------------
drop table sales_summary_by_country;

create table sales_summary_by_country (TransDate STRING, Amount DOUBLE, Country STRING)
row format delimited FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';

insert into sales_summary_by_country
select to_date(transdate), sum(price), country
from sales_data_tbl
group by to_date(transdate), country;

select * from sales_summary_by_country;


sales_summary_by_month_by_city:
-------------------------------
drop table sales_summary_by_month_by_city;

create table sales_summary_by_month_by_city (TransMonth STRING, Amount DOUBLE, City STRING, State STRING, Country STRING)
row format delimited FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';

insert into sales_summary_by_month_by_city
select date_format(to_date(transdate),'MM-yyyy'), sum(price), city, state, country
from sales_data_tbl
group by date_format(to_date(transdate),'MM-yyyy'), city, state, country;

select * from sales_summary_by_month_by_city;

sales_summary_by_month_by_state:
--------------------------------
drop table sales_summary_by_month_by_state;

create table sales_summary_by_month_by_state (TransMonth STRING, Amount DOUBLE, State STRING, Country STRING)
row format delimited FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';

insert into sales_summary_by_month_by_state
select date_format(to_date(transdate),'MM-yyyy'), sum(price), state, country
from sales_data_tbl
group by date_format(to_date(transdate),'MM-yyyy'), state, country;

select * from sales_summary_by_month_by_state;

sales_summary_by_month_by_country:
----------------------------------
drop table sales_summary_by_month_by_country;

create table sales_summary_by_month_by_country (TransMonth STRING, Amount DOUBLE, Country STRING)
row format delimited FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';

insert into sales_summary_by_month_by_country
select date_format(to_date(transdate),'MM-yyyy'), sum(price), country
from sales_data_tbl
group by date_format(to_date(transdate),'MM-yyyy'), country;

select * from sales_summary_by_month_by_country;

sales_summary_by_month:
-----------------------
drop table sales_summary_by_month;

create table sales_summary_by_month (TransYear STRING, TransMonth STRING, Amount DOUBLE)
row format delimited FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';

insert into sales_summary_by_month
select year(transdate), month(transdate), sum(Price)
from sales_data_tbl
group by year(transdate), month(transdate);

select * from sales_summary_by_month;


I. Create partitioned table
---------------------------

create external table sales_by_country (TransID INT, TransDate STRING, Product STRING, Price DOUBLE, PaymentType STRING, CustName STRING, State STRING, City STRING)
partitioned by (Country STRING)
row format delimited fields terminated by ','
stored as textfile;

describe formatted sales_by_country;

set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.enforce.bucketing=true;
set hive.vectorized.execution.enabled = true;
set hive.vectorized.execution.reduce.enabled = true;

For a dynamic-partition insert, the partition column (Country) must come last in the select list:

INSERT OVERWRITE TABLE sales_by_country PARTITION(country)
select txn.TransID, txn.TransDate, txn.Product, txn.Price, txn.PaymentType, txn.CustName, txn.State, txn.City, txn.Country
from sales_data_tbl txn;
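
To confirm which country partitions the insert created (output depends on your data):

show partitions sales_by_country;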

create external table sales_summary (TransDate STRING, Amount DOUBLE, State STRING, City STRING)
row format delimited FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';

select to_date(transdate), sum(price), state, country
from sales_data_tbl
group by to_date(transdate), state, country;

create table txnrecords(txnno INT, txndate STRING, custno INT, amount DOUBLE, product STRING, category STRING, city STRING, state STRING, spendby STRING)
row format delimited fields terminated by ',';

create table txnrecsByCat(txnno INT, txndate STRING, custno INT, amount DOUBLE, product STRING, city STRING, state STRING, spendby STRING)
partitioned by (category STRING)
clustered by (state) INTO 10 buckets
row format delimited fields terminated by ','
stored as textfile;

describe formatted txnrecsByCat;

Beeline:
========
beeline

!connect jdbc:hive2://hn2.hadoop.com:10000/default
User Name: hdpuser
Password: welcome1

set -v;

J. Configure Hive to allow partitions
-------------------------------------

However, a query across all partitions could trigger an enormous MapReduce job if the table data and number of partitions are large. A highly suggested safety measure is putting Hive into strict mode (hive.mapred.mode=strict), which prohibits queries of partitioned tables without a WHERE clause that filters on partitions. Dynamic-partition inserts, on the other hand, require the dynamic partition mode to be nonstrict, as in the following session:

set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.enforce.bucketing=true;
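
Under strict mode (hive.mapred.mode=strict), a query on a partitioned table must filter on the partition column. A minimal sketch that passes the check, using the sales_by_country table from section I ('USA' is just an illustrative partition value):

select count(*) from sales_by_country where country = 'USA';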

K. Load data into partition table
---------------------------------
set hive.vectorized.execution.enabled = true;
set hive.vectorized.execution.reduce.enabled = true;

from txnrecords txn
INSERT OVERWRITE TABLE txnrecsByCat PARTITION(category)
select txn.txnno, txn.txndate, txn.custno, txn.amount, txn.product, txn.city, txn.state, txn.spendby, txn.category
DISTRIBUTE BY category;
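
Because txnrecsByCat is clustered by state into 10 buckets, TABLESAMPLE can read a single bucket instead of the whole table. A sketch (which rows land in bucket 1 depends on your data):

select * from txnrecsByCat TABLESAMPLE(BUCKET 1 OUT OF 10 ON state) t limit 10;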

=============================
find sales based on age group
=============================

create table customer(custno string, firstname string, lastname string, age int, profession string) row format delimited fields terminated by ',';

load data local inpath '/home/hdpuser/custs.txt' into table customer;

create table out1 (custno int, firstname string, age int, profession string, amount double, product string) row format delimited fields terminated by ',';

insert overwrite table out1
select a.custno, a.firstname, a.age, a.profession, b.amount, b.product
from customer a JOIN txnrecords b ON a.custno = b.custno;

select * from out1 limit 100;

create table out2 (custno int, firstname string, age int, profession string, amount double, product string, level string) row format delimited fields terminated by ',';

insert overwrite table out2
select *,
  case
    when age < 30 then 'young'
    when age >= 30 and age < 50 then 'middle'
    when age >= 50 then 'old'
    else 'others'
  end
from out1;

select * from out2 limit 100;

describe out2;

create table out3 (level string, amount double) row format delimited fields terminated by ',';

insert overwrite table out3 select level,sum(amount) from out2 group by level;

select * from out3 limit 100;

==============
simple join
==============

create table employee(name string, salary float,city string) row format delimited
fields terminated by ',';

load data local inpath '/home/hdpuser/emp.txt' into table employee;

select * from employee where name='tarun';

create table mailid (name string, email string) row format delimited fields
terminated by ',';

load data local inpath '/home/hdpuser/email.txt' into table mailid;

select a.name, a.city, a.salary, b.email from employee a join mailid b on a.name = b.name;

select a.name, a.city, a.salary, b.email from employee a left outer join mailid b on a.name = b.name;

select a.name, a.city, a.salary, b.email from employee a right outer join mailid b on a.name = b.name;

select a.name, a.city, a.salary, b.email from employee a full outer join mailid b on a.name = b.name;

===============================================
Custom Mapper Code to manipulate unix timestamp
===============================================

CREATE TABLE u_data ( userid INT, movieid INT, rating INT, unixtime STRING) ROW
FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE;

And load it into the table that was just created:

LOAD DATA LOCAL INPATH '/home/hdpuser/u.data.txt' OVERWRITE INTO TABLE u_data;

Count the number of rows in table u_data:

SELECT COUNT(*) FROM u_data;

****Create weekday_mapper.py:

import sys
import datetime

# For each tab-separated input record, replace the unix timestamp
# with the ISO weekday (1 = Monday ... 7 = Sunday).
for line in sys.stdin:
    line = line.strip()
    userid, movieid, rating, unixtime = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print('\t'.join([userid, movieid, rating, str(weekday)]))

CREATE TABLE u_data_new (userid INT, movieid INT, rating INT, weekday INT) ROW
FORMAT DELIMITED FIELDS TERMINATED BY '\t';

add FILE /home/hdpuser/weekday_mapper.py;

****Note that columns will be transformed to string and delimited
****by TAB before feeding to the user script, and the standard output
****of the user script will be treated as TAB-separated string columns.

****The following command uses the TRANSFORM clause to embed the mapper script.

INSERT OVERWRITE TABLE u_data_new
SELECT TRANSFORM (userid, movieid, rating, unixtime)
USING 'python weekday_mapper.py'
AS (userid, movieid, rating, weekday)
FROM u_data;

SELECT weekday, COUNT(*) FROM u_data_new GROUP BY weekday;

===========
UDF
===========

import java.util.Date;
import java.text.DateFormat;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class UnixtimeToDate extends UDF {
    // Called once per row: converts a unix timestamp (seconds, as text)
    // into a formatted date string. Returns null for null input.
    public Text evaluate(Text text) {
        if (text == null) return null;
        long timestamp = Long.parseLong(text.toString());
        return new Text(toDate(timestamp));
    }

    // java.util.Date expects milliseconds, hence the * 1000.
    private String toDate(long timestamp) {
        Date date = new Date(timestamp * 1000);
        return DateFormat.getInstance().format(date);
    }
}

/usr/bin/javac -classpath /usr/local/hadoop-2.6.4/share/hadoop/common/hadoop-common-2.6.4.jar:/etc/hadoop/apache-hive-0.13.0-bin/apache-hive-0.13.0-bin/lib/hive-exec-0.13.0.jar UnixtimeToDate.java

****Pack this class file into a jar:

$/usr/bin/jar -cvf convert.jar UnixtimeToDate.class

****Verify the jar using the command:

$/usr/bin/jar -tvf convert.jar

****Add this jar at the hive prompt:

ADD JAR /home/hdpuser/convert.jar;

****Then you create your custom function as follows:

create temporary function userdate as 'UnixtimeToDate';

****Sample data (loaded below from /data/counter):
****one,1386023259550
****two,1389523259550
****three,1389523259550
****four,1389523259550

create table testing(id string, id_time string) row format delimited fields terminated by ',';

load data inpath '/data/counter' into table testing;

hive> select * from testing;


****OK
****one 1386023259550
****two 1389523259550
****three 1389523259550
****four 1389523259550

****Then use the function 'userdate' in a SQL query:

select id,userdate(id_time) from testing;

****OK
****four 3/28/02 8:12 PM
****one 4/30/91 1:59 PM
****two 3/28/02 8:12 PM
****three 3/28/02 8:12 PM

Hive View:
----------
If you get an S020 data storage error, check the ambari-server log to find out which table is missing, then create it as shown below.

mysql -u root -pwelcome1

CREATE TABLE ambari.DS_JOBIMPL_15 (DS_id VARCHAR(255) NOT NULL, DS_applicationId TEXT,
DS_confFile TEXT, DS_dagId TEXT, DS_dagName TEXT, DS_dataBase TEXT,
DS_dateSubmitted BIGINT, DS_duration BIGINT, DS_forcedContent TEXT,
DS_globalSettings TEXT, DS_guid TEXT, DS_hiveQueryId TEXT, DS_logFile TEXT,
DS_owner TEXT, DS_queryFile TEXT, DS_queryId TEXT, DS_referrer TEXT, DS_sessionTag TEXT,
DS_sqlState TEXT, DS_status TEXT, DS_statusDir TEXT, DS_statusMessage TEXT,
DS_title TEXT, PRIMARY KEY (ds_id));
