Hive Practice - New
--------------------------
hive -hiveconf hive.root.logger=DEBUG,console
A. Create Database
------------------
create database retail_hive;
B. Select Database
------------------
use retail_hive;
Metastore tables to verify after the database and table are created:
DBS:
Database Name : retail_hive
TBLS:
Name of the table : sales_data_tbl
Table Type : default: MANAGED_TABLE [Optional]
COLUMNS_V2:
Column Names : TransID, TransDate, etc..
Column Data Type : INT, STRING, etc..
SDS:
Column Delimiter : ,
Row Delimiter : \n
Table Data Location :
/apps/hive/warehouse/retail_hive.db/sales_data_tbl [Optional]
/project1/data/pig/pigoutput/sales_data/
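A sketch of the DDL these metastore entries imply (only the two columns named in the notes are shown; the remaining columns are elided there):
create external table sales_data_tbl (TransID INT, TransDate STRING /* remaining columns elided in the notes */)
row format delimited fields terminated by ',' lines terminated by '\n'
location '/project1/data/pig/pigoutput/sales_data/';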
H. Summary Tables
-----------------
sales_summary_by_city:
----------------------
drop table sales_summary_by_city;
sales_summary_by_state:
----------------------
drop table sales_summary_by_state;
sales_summary_by_country:
-------------------------
drop table sales_summary_by_country;
sales_summary_by_month_by_state:
--------------------------------
drop table sales_summary_by_month_by_state;
sales_summary_by_month_by_country:
----------------------------------
drop table sales_summary_by_month_by_country;
sales_summary_by_month:
-----------------------
drop table sales_summary_by_month;
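The drops above reset earlier runs; a sketch of rebuilding one of the summaries (txnrecords is the source table defined further below):
create table sales_summary_by_city
row format delimited fields terminated by ','
as select city, sum(amount) as total_amount from txnrecords group by city;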
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.enforce.bucketing=true;
set hive.vectorized.execution.enabled = true;
set hive.vectorized.execution.reduce.enabled = true;
create external table sales_summary (TransDate STRING, Amount DOUBLE, State STRING, City STRING)
row format delimited FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';
create table txnrecords(txnno INT, txndate STRING, custno INT, amount DOUBLE, product STRING, category STRING, city STRING, state STRING, spendby STRING)
row format delimited fields terminated by ',' stored as textfile;
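Loading the staging table from a local file (the file path is hypothetical):
LOAD DATA LOCAL INPATH '/home/hdpuser/txns.csv' OVERWRITE INTO TABLE txnrecords;  -- hypothetical path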
create table txnrecsByCity(txnno INT, txndate STRING, custno INT, amount DOUBLE, product STRING, city STRING, state STRING, spendby STRING)
partitioned by (category STRING)
clustered by (state) into 10 buckets
row format delimited fields terminated by ','
stored as textfile;
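With the dynamic-partition and bucketing settings above, a single insert can fan the staging rows out by category; the partition column must come last in the select list:
insert overwrite table txnrecsByCity partition (category)
select txnno, txndate, custno, amount, product, city, state, spendby, category
from txnrecords;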
Beeline:
========
beeline
!connect jdbc:hive2://hn2.hadoop.com:10000/default
User Name: hdpuser
Password: welcome1
set -v;
However, a query across all partitions could trigger an enormous MapReduce job if
the table data and number of partitions are large. A strongly recommended safety
measure is to put Hive into strict mode, which prohibits queries against partitioned
tables that lack a WHERE clause filtering on partitions. You can set the mode to
nonstrict, as in the following session:
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.enforce.bucketing=true;
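Under strict mode, a query on the partitioned table must filter on the partition column; for example (the category value is illustrative):
select txnno, amount, city from txnrecsByCity where category = 'Gymnastics';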
==============
find sales based on age group
==============
describe out2;
create table out3 (level string, amount double) row format delimited fields
terminated by ',';
insert overwrite table out3 select level,sum(amount) from out2 group by level;
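A quick check of the roll-up (out2 is assumed to carry an age-group level per transaction):
select level, amount from out3 order by amount desc;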
==============
simple join
==============
create table employee(name string, salary float,city string) row format delimited
fields terminated by ',';
create table mailid (name string, email string) row format delimited fields
terminated by ',';
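The join query itself is not in the notes; a minimal sketch joining the two tables on name:
select e.name, e.salary, e.city, m.email
from employee e join mailid m on e.name = m.name;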
==============
Custom Mapper Code to manipulate unix timestamp
==============
CREATE TABLE u_data (userid INT, movieid INT, rating INT, unixtime STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
****Create weekday_mapper.py:
import sys
import datetime

# Replace the unix timestamp column with the ISO weekday number
for line in sys.stdin:
    line = line.strip()
    userid, movieid, rating, unixtime = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print('\t'.join([userid, movieid, rating, str(weekday)]))
CREATE TABLE u_data_new (userid INT, movieid INT, rating INT, weekday INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
****The following command uses the TRANSFORM clause to run the mapper script.
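The command itself is missing from the notes; a sketch following the standard Hive TRANSFORM pattern for this mapper:
add FILE weekday_mapper.py;

INSERT OVERWRITE TABLE u_data_new
SELECT TRANSFORM (userid, movieid, rating, unixtime)
USING 'python weekday_mapper.py'
AS (userid, movieid, rating, weekday)
FROM u_data;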
===========
UDF
===========
import java.util.Date;
import java.text.DateFormat;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class UnixtimeToDate extends UDF {

    // Convert a unix timestamp held as text into a formatted date string
    public Text evaluate(Text text) {
        if (text == null) return null;
        long timestamp = Long.parseLong(text.toString());
        return new Text(toDate(timestamp));
    }

    private String toDate(long timestamp) {
        // java.util.Date expects milliseconds, so scale the epoch seconds
        Date date = new Date(timestamp * 1000);
        return DateFormat.getInstance().format(date);
    }
}
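Registering and invoking the UDF; the jar name, table name, and function alias are assumptions, and the **** sample rows below match this two-column comma-delimited layout:
-- after packaging the class into a jar, e.g. unixtime-udf.jar
ADD JAR /home/hdpuser/unixtime-udf.jar;
CREATE TEMPORARY FUNCTION userdate AS 'UnixtimeToDate';

create table udf_testing (id string, unixtime string) row format delimited fields terminated by ',';
select id, userdate(unixtime) from udf_testing;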
****one,1386023259550
****two,1389523259550
****three,1389523259550
****four,1389523259550
****OK
****four 3/28/02 8:12 PM
****one 4/30/91 1:59 PM
****two 3/28/02 8:12 PM
****three 3/28/02 8:12 PM
Hive View:
----------
If you get an S020 data storage error, check the ambari-server log to find out
which table is missing, then create the table as below.