Pyspark SQL Final Document
data2 = [
(4, "12-17-2011", 300.0, "Team Sports", "Field", "cash"),
(5, "02-14-2011", 200.0, "Gymnastics", None, "cash"),
(6, "02-14-2011", 200.0, "Winter", None, "cash"),
(7, "02-14-2011", 200.0, "Winter", None, "cash")
]
data4 = [
(1, "raj"),
(2, "ravi"),
(3, "sai"),
(5, "rani")
]
data3 = [
(1, "mouse"),
(3, "mobile"),
(7, "laptop")
]
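A minimal sketch of how these lists could be registered as the views used below; the column names are assumed from the queries that follow, and df1 would be built the same way from its own list:
df = spark.createDataFrame(data2, ["id", "tdate", "amount", "category", "product", "spendby"])
cust = spark.createDataFrame(data4, ["id", "name"])
prod = spark.createDataFrame(data3, ["id", "product"])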
df.createOrReplaceTempView("df")
df1.createOrReplaceTempView("df1")
cust.createOrReplaceTempView("cust")
prod.createOrReplaceTempView("prod")
sc.setLogLevel("ERROR")
spark.sql("select * from df order by id").show()
spark.sql("select * from df1 order by id").show()
====================================
Validate data
====================================
spark.sql("select * from df ").show()
====================================
Select two columns
====================================
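A possible example, selecting the id and tdate columns:
spark.sql("select id,tdate from df").show()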
====================================
Select column with category filter = Exercise
====================================
spark.sql("select id,tdate,category from df where category='Exercise'
order by id").show()
====================================
Multi Column filter
====================================
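A possible example combining two column filters (the filter values are illustrative):
spark.sql("select * from df where category='Gymnastics' and spendby='cash'").show()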
====================================
Multi Value Filter
====================================
spark.sql("select * from df where category in
('Exercise','Gymnastics')").show()
====================================
Like Filter
====================================
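A possible example (the pattern is illustrative):
spark.sql("select * from df where category like 'Gym%'").show()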
====================================
Not Filters
====================================
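A possible example negating the multi-value filter above:
spark.sql("select * from df where category not in ('Exercise','Gymnastics')").show()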
====================================
Null Filters
====================================
spark.sql("select * from df where product is null").show()
====================================
Not Null Filters
====================================
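The counterpart of the null filter above:
spark.sql("select * from df where product is not null").show()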
====================================
Max Function
====================================
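The counterpart of the min example below:
spark.sql("select max(id) from df").show()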
====================================
Min Function
====================================
spark.sql("select min(id) from df ").show()
====================================
Count
====================================
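A possible example counting all rows:
spark.sql("select count(*) as cnt from df").show()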
====================================
Condition statement
====================================
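A possible case-when example (the replacement label is illustrative):
spark.sql("select id,category,case when category='Gymnastics' then 'Gym' else category end as newcat from df").show()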
====================================
Concat_ws data
====================================
spark.sql("select
id,category,product,concat_ws('-',id,cate
gory,product) as condata from df").show()
====================================
Lower Case data
====================================
spark.sql("select category,lower(category) as lower from df ").show()
====================================
Ceil data
====================================
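A possible example on the amount column:
spark.sql("select amount,ceil(amount) as ceil_amount from df").show()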
====================================
Round the data
====================================
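Similarly, rounding the amount column:
spark.sql("select amount,round(amount) as round_amount from df").show()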
spark.sql("select
product,coalesce(product,'NA') as nullrep
from df").show()
====================================
Trim the space
====================================
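A possible example using trim on the category column:
spark.sql("select category,trim(category) as trimmed from df").show()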
====================================
Substring with Trim
====================================
spark.sql("select substring(product,1,10)
as sub from df").show()
====================================
Substring/Split operation
====================================
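A possible split example, taking the year portion of tdate (index 2 is assumed from the MM-dd-yyyy format):
spark.sql("select tdate,split(tdate,'-')[2] as year from df").show()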
====================================
Aggregate Sum
====================================
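A possible example, summing amount per category:
spark.sql("select category,sum(amount) as total from df group by category").show()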
====================================
Aggregate Count
====================================
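A possible example, counting rows per category:
spark.sql("select category,count(*) as cnt from df group by category").show()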
====================================
Aggregate with Order Descending
====================================
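A possible example, ordering the aggregated total in descending order:
spark.sql("select category,sum(amount) as total from df group by category order by total desc").show()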
====================================
Window Dense_rank Number
====================================
spark.sql("SELECT category,amount, dense_rank() OVER ( partition by category
order by amount desc ) AS dense_rank FROM df").show()
====================================
Window rank Number
====================================
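The rank() counterpart of the dense_rank example above:
spark.sql("SELECT category,amount, rank() OVER ( partition by category order by amount desc ) AS rank FROM df").show()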
====================================
Having function
====================================
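A possible example (the threshold is illustrative):
spark.sql("select category,sum(amount) as total from df group by category having sum(amount) > 200").show()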
====================================
Left Join
====================================
spark.sql("select a.id,a.name,b.product from cust a left join prod b on
a.id=b.id").show()
====================================
Right Join
====================================
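The right-join counterpart of the left join above:
spark.sql("select a.id,a.name,b.product from cust a right join prod b on a.id=b.id").show()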
====================================
Full Join
====================================
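A possible full join on the same tables:
spark.sql("select a.id,a.name,b.product from cust a full join prod b on a.id=b.id").show()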
spark.sql("select id,tdate,from_unixtime(unix_timestamp(tdate,'MM-dd-
yyyy'),'yyyy-MM-dd') as con_date from df").show()
====================================
Sub query
====================================
spark.sql("""
group by con_date
""").show()
====================================
collect_list
====================================
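A possible example, collecting spendby values per category:
spark.sql("select category,collect_list(spendby) as col_spend from df group by category").show()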
====================================
Explode
====================================
spark.sql("select category,explode(col_spend) as ex_spend from (select
category,collect_set(spendby) as col_spend from df group by
category)").show()
====================================
explode_outer
====================================
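A possible example; unlike explode, explode_outer keeps rows where the array is null (splitting product on space is illustrative):
spark.sql("select id,product,explode_outer(split(product,' ')) as ex_prod from df").show()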