V2 SQL Final Document
csv")
df.show()
df1.show()
cust.show()
prod.show()
df.createOrReplaceTempView("df")
df1.createOrReplaceTempView("df1")
cust.createOrReplaceTempView("cust")
prod.createOrReplaceTempView("prod")
sc.setLogLevel("ERROR")
spark.sql("select * from df order by id").show()
spark.sql("select * from df1 order by id").show()
====================================
Validate data
====================================
====================================
Select two columns
====================================
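A minimal example, assuming the id and tdate columns used elsewhere in df:
spark.sql("select id,tdate from df").show()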
====================================
Multi Column filter
====================================
spark.sql("select id,tdate,category,spendby from df where category='Exercise'
and spendby='cash' ").show()
====================================
Multi Value Filter
====================================
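A likely example, reusing the category values from the other filters:
spark.sql("select * from df where category in ('Exercise','Gymnastics')").show()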
====================================
Like Filter
====================================
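A sketch of a like filter; the 'Gym%' pattern is an assumed illustration:
spark.sql("select * from df where category like 'Gym%'").show()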
====================================
Not Filters
====================================
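A sketch of a not-equal filter, reusing the Exercise category value:
spark.sql("select * from df where category != 'Exercise'").show()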
====================================
Not In Filters
====================================
spark.sql("select * from df where category not in
('Exercise','Gymnastics')").show()
====================================
Null Filters
====================================
spark.sql("select * from df where product is null").show()
====================================
Not Null Filters
====================================
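The complement of the null filter above:
spark.sql("select * from df where product is not null").show()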
====================================
Max Function
====================================
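The max counterpart of the min query below:
spark.sql("select max(id) from df").show()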
====================================
Min Function
====================================
spark.sql("select min(id) from df ").show()
====================================
Count
====================================
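A minimal row count over df:
spark.sql("select count(*) from df").show()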
====================================
Condition statement
====================================
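A sketch with case when; the cash/credit coding is an assumed illustration:
spark.sql("select category,spendby,case when spendby='cash' then 0 else 1 end as status from df").show()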
====================================
Concat data
====================================
spark.sql("select id,category,concat(id,'-',category) as condata from df").show()
====================================
Concat_ws data
====================================
spark.sql("select
id,category,product,concat_ws('-',id,cate
gory,product) as condata from df").show()
====================================
Lower Case data
====================================
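A minimal lower() example on the category column:
spark.sql("select category,lower(category) as lower_category from df").show()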
====================================
Ceil data
====================================
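A minimal ceil() example, assuming the numeric amount column:
spark.sql("select amount,ceil(amount) as ceil_amount from df").show()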
====================================
Replace Nulls
====================================
spark.sql("select
product,coalesce(product,'NA') as nullrep
from df").show()
====================================
Trim the space
====================================
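A minimal trim() example, assuming stray spaces around product values:
spark.sql("select trim(product) as trimmed from df").show()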
spark.sql("select substring(product,1,10)
as sub from df").show()
====================================
Substring/Split operation
====================================
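A split counterpart, assuming tdate's MM-dd-yyyy format so index 2 is the year:
spark.sql("select split(tdate,'-')[2] as year from df").show()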
====================================
Aggregate Sum
====================================
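A sketch of a grouped sum, assuming the amount column:
spark.sql("select category,sum(amount) as total from df group by category").show()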
====================================
Aggregate Count
====================================
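A grouped count per category:
spark.sql("select category,count(*) as cnt from df group by category").show()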
====================================
Aggregate with Order Descending
====================================
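The same grouped sum, ordered descending by the total:
spark.sql("select category,sum(amount) as total from df group by category order by total desc").show()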
====================================
Window Dense_rank Number
====================================
spark.sql("SELECT category,amount, dense_rank() OVER ( partition by category
order by amount desc ) AS dense_rank FROM df").show()
====================================
Window rank Number
====================================
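The rank() counterpart of the dense_rank query above:
spark.sql("SELECT category,amount, rank() OVER ( partition by category order by amount desc ) AS rank FROM df").show()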
====================================
Having function
====================================
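A sketch of having on a grouped count; the threshold of 1 is an assumption:
spark.sql("select category,count(*) as cnt from df group by category having count(*) > 1").show()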
====================================
Left Join
====================================
spark.sql("select a.id,a.name,b.product from cust a left join prod b on
a.id=b.id").show()
====================================
Right Join
====================================
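The right-join counterpart of the left join above:
spark.sql("select a.id,a.name,b.product from cust a right join prod b on a.id=b.id").show()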
====================================
Full Join
====================================
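The full-join counterpart, keeping unmatched rows from both views:
spark.sql("select a.id,a.name,b.product from cust a full join prod b on a.id=b.id").show()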
spark.sql("select id,tdate,from_unixtime(unix_timestamp(tdate,'MM-dd-
yyyy'),'yyyy-MM-dd') as con_date from df").show()
====================================
Sub query
====================================
spark.sql("""
group by con_date
""").show()
====================================
Total Eclipse Code
====================================
package pack

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

object obj {
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("Revision").setMaster("local[*]")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")

    val spark = SparkSession.builder().enableHiveSupport()
      .config("spark.sql.warehouse.dir", "file:///C:/hivewarehou/")
      .config("spark.sql.catalogImplementation", "hive")
      .getOrCreate()

    import spark.implicits._

    val df = spark.read.option("header", "true").csv("file:///C:/data/df.csv")
    val df1 = spark.read.option("header", "true").csv("file:///C:/data/df1.csv")
    val cust = spark.read.option("header", "true").csv("file:///C:/data/cust.csv")
    val prod = spark.read.option("header", "true").csv("file:///C:/data/prod.csv")

    df.show()
    df1.show()
    cust.show()
    prod.show()

    df.createOrReplaceTempView("df")
    df1.createOrReplaceTempView("df1")
    cust.createOrReplaceTempView("cust")
    prod.createOrReplaceTempView("prod")

    spark.sql("select id,tdate,from_unixtime(unix_timestamp(tdate,'MM-dd-yyyy'),'yyyy-MM-dd') as con_date from df").show()

    spark.sql("""
      select con_date, sum(amount) as total from (
        -- outer select assumed: daily total of amount per converted date
        select id, tdate, from_unixtime(unix_timestamp(tdate,'MM-dd-yyyy'),'yyyy-MM-dd') as con_date,
               amount, category, product, spendby from df
      )
      group by con_date
    """).show()
  }
}