PySpark SQL Final Document

The document outlines a series of operations performed on Spark DataFrames, including data creation, filtering, aggregation, and joining. It demonstrates various SQL queries to manipulate and analyze data, such as selecting columns, applying conditions, and performing aggregations. Additionally, it showcases window functions, subqueries, and collection functions like collect_list and explode.

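All of the examples below assume an active SparkSession (spark) and SparkContext (sc). A minimal setup sketch, assuming a local run (the application name here is arbitrary):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("pyspark-sql-practice").getOrCreate()
sc = spark.sparkContext  # used later for sc.setLogLevel("ERROR")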

data = [

(0, "06-26-2011", 300.4, "Exercise", "GymnasticsPro", "cash"),


(1, "05-26-2011", 200.0, "Exercise Band", "Weightlifting", "credit"),
(2, "06-01-2011", 300.4, "Exercise", "Gymnastics Pro", "cash"),
(3, "06-05-2011", 100.0, "Gymnastics", "Rings", "credit"),
(4, "12-17-2011", 300.0, "Team Sports", "Field", "cash"),
(5, "02-14-2011", 200.0, "Gymnastics", None, "cash"),
(6, "06-05-2011", 100.0, "Exercise", "Rings", "credit"),
(7, "12-17-2011", 300.0, "Team Sports", "Field", "cash"),
(8, "02-14-2011", 200.0, "Gymnastics", None, "cash")
]

df = spark.createDataFrame(data, ["id", "tdate", "amount", "category", "product", "spendby"])
df.show()

data2 = [
(4, "12-17-2011", 300.0, "Team Sports", "Field", "cash"),
(5, "02-14-2011", 200.0, "Gymnastics", None, "cash"),
(6, "02-14-2011", 200.0, "Winter", None, "cash"),
(7, "02-14-2011", 200.0, "Winter", None, "cash")
]

df1 = spark.createDataFrame(data2, ["id", "tdate", "amount", "category", "product", "spendby"])
df1.show()

data4 = [
(1, "raj"),
(2, "ravi"),
(3, "sai"),
(5, "rani")
]

cust = spark.createDataFrame(data4, ["id", "name"])
cust.show()

data3 = [
(1, "mouse"),
(3, "mobile"),
(7, "laptop")
]

prod = spark.createDataFrame(data3, ["id", "product"])
prod.show()

# Register DataFrames as temporary views


df.createOrReplaceTempView("df")
df1.createOrReplaceTempView("df1")
cust.createOrReplaceTempView("cust")
prod.createOrReplaceTempView("prod")


sc.setLogLevel("ERROR")
spark.sql("select * from df order by id").show()
spark.sql("select * from df1 order by id").show()

====================================
Validate data
====================================
spark.sql("select * from df ").show()

====================================
Select two columns
====================================

spark.sql("select id,tdate from df order by id").show()

====================================
Select column with category filter = Exercise
====================================
spark.sql("select id,tdate,category from df where category='Exercise'
order by id").show()

====================================
Multi Column filter
====================================

spark.sql("select id,tdate,category,spendby from df where category='Exercise'


and spendby='cash' ").show()
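
For comparison, the same multi-column filter written with the DataFrame API (a sketch, assuming the df created above):

df.filter((df.category == "Exercise") & (df.spendby == "cash")) \
  .select("id", "tdate", "category", "spendby").show()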

====================================
Multi Value Filter
====================================
spark.sql("select * from df where category in
('Exercise','Gymnastics')").show()

====================================
Like Filter
====================================

spark.sql("select * from df where product like ('%Gymnastics%')").show()

====================================
Not Filters
====================================

spark.sql("select * from df where category != 'Exercise'").show()


====================================
Not In Filters
====================================

spark.sql("select * from df where category not in


('Exercise','Gymnastics')").show()

====================================
Null Filters
====================================
spark.sql("select * from df where product is null").show()
====================================
Not Null Filters
====================================
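spark.sql("select * from df where product is not null").show()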

====================================
Max Function
====================================

spark.sql("select max(id) from df ").show()

====================================
Min Function
====================================
spark.sql("select min(id) from df ").show()

====================================
Count
====================================

spark.sql("select count(1) from df ").show()

====================================
Condition statement
====================================

spark.sql("select *,case when spendby='cash' then 1 else 0 end as status from


df ").show()
====================================
Concat data
====================================
spark.sql("select id,category,concat(id,'-',category) as condata from df").show()

====================================
Concat_ws data
====================================
spark.sql("select
id,category,product,concat_ws('-',id,cate
gory,product) as condata from df").show()

====================================
Lower Case data
====================================
spark.sql("select category,lower(category) as lower from df ").show()

====================================
Ceil data
====================================

spark.sql("select amount,ceil(amount) as ceil from df").show()

====================================
Round the data
====================================

spark.sql("select amount,round(amount) as round from df").show()


====================================
Replace Nulls
====================================

spark.sql("select
product,coalesce(product,'NA') as nullrep
from df").show()
====================================
Trim the space
====================================

spark.sql("select trim(product) from df").show()


====================================
Distinct the columns
====================================

spark.sql("select distinct category,spendby from df").show()

====================================
Substring
====================================

spark.sql("select substring(product,1,10)
as sub from df").show()
====================================
Substring/Split operation
====================================

spark.sql("select SUBSTRING_INDEX(category,' ',1) as spl from df").show()


====================================
Union all
====================================

spark.sql("select * from df union all select * from df1").show()


====================================
Union
====================================

spark.sql("select * from df union select * from df1 order by id").show()

====================================
Aggregate Sum
====================================

spark.sql("select category, sum(amount) as total from df group by


category").show()
====================================
Aggregate sum with two columns
====================================

spark.sql("select category,spendby,sum(amount) as total from df group by


category,spendby").show()

====================================
Aggregate Count
====================================

spark.sql("select category,spendby,sum(amount) As total,count(amount) as


cnt from df group by category,spendby").show()
====================================
Aggregate Max
====================================

spark.sql("select category, max(amount) as max from df group by


category").show()

====================================
Aggregate with Order Descending
====================================

spark.sql("select category, max(amount) as max from df group by category


order by category desc").show()
====================================
Window Row Number
====================================

spark.sql("SELECT category,amount, row_number() OVER ( partition by


category order by amount desc ) AS row_number FROM df").show()
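
The same row_number window expressed with the DataFrame API (a sketch, assuming the df above):

from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

win = Window.partitionBy("category").orderBy(col("amount").desc())
df.withColumn("row_number", row_number().over(win)).show()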

====================================
Window Dense_rank Number
====================================
spark.sql("SELECT category,amount, dense_rank() OVER ( partition by category
order by amount desc ) AS dense_rank FROM df").show()

====================================
Window rank Number
====================================

spark.sql("SELECT category,amount, rank() OVER ( partition by category order


by amount desc ) AS rank FROM df").show()
====================================
Window Lead function
====================================

spark.sql("SELECT category,amount, lead(amount) OVER ( partition by category


order by amount desc ) AS lead FROM df").show()
====================================
Window lag function
====================================

spark.sql("SELECT category,amount, lag(amount) OVER ( partition by category


order by amount desc ) AS lag FROM df").show()

====================================
Having function
====================================

spark.sql("select category,count(category) as cnt from df group by category


having count(category)>1").show()
====================================
Inner Join
====================================
spark.sql("select a.id,a.name,b.product from cust a join prod b on
a.id=b.id").show()
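
The same inner join written with the DataFrame API (a sketch):

cust.join(prod, cust.id == prod.id, "inner").select(cust.id, cust.name, prod.product).show()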

====================================
Left Join
====================================
spark.sql("select a.id,a.name,b.product from cust a left join prod b on
a.id=b.id").show()
====================================
Right Join
====================================

spark.sql("select a.id,a.name,b.product from cust a right join prod b on


a.id=b.id").show()

====================================
Full Join
====================================

spark.sql("select a.id,a.name,b.product from cust a full join prod b on


a.id=b.id").show()
====================================
left anti Join
====================================

spark.sql("select a.id,a.name from cust a LEFT ANTI JOIN prod b on


a.id=b.id").show()
====================================
Date format
====================================

spark.sql("select id,tdate,from_unixtime(unix_timestamp(tdate,'MM-dd-
yyyy'),'yyyy-MM-dd') as con_date from df").show()
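
On Spark 2.2 and later the same conversion can be written with to_date; a sketch, assuming that version or newer:

spark.sql("select id,tdate,date_format(to_date(tdate,'MM-dd-yyyy'),'yyyy-MM-dd') as con_date from df").show()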

====================================
Sub query
====================================

spark.sql("""

select sum(amount) as total , con_date from(


select id,tdate,from_unixtime(unix_timestamp(tdate,'MM-dd-yyyy'),'yyyy-MM-
dd') as con_date,amount,category,product,spendby from df)

group by con_date

""").show()

====================================
collect_list
====================================

spark.sql("select category,collect_list(spendby) as col_spend from df group by


category").show()
====================================
collect_set
====================================

spark.sql("select category,collect_set(spendby) as col_spend from df group by


category ").show()

====================================
Explode
====================================
spark.sql("select category,explode(col_spend) as ex_spend from (select
category,collect_set(spendby) as col_spend from df group by
category)").show()

====================================
explode_outer
====================================

spark.sql("select category,explode_outer(col_spend) as ex_spend from (select


category,collect_set(spendby) as col_spend from df group by
category)").show()
