PySpark SQL Final Document

The document outlines a series of operations performed on Spark DataFrames, including data creation, filtering, aggregation, and joining. It demonstrates various SQL queries to manipulate and analyze data, such as selecting columns, applying conditions, and performing aggregations. Additionally, it showcases window functions, subqueries, and collection functions like collect_list and explode.

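All of the examples below assume an active SparkSession (spark) and SparkContext (sc). A minimal setup sketch, assuming a local run (the application name here is arbitrary):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("pyspark-sql-practice").getOrCreate()
sc = spark.sparkContext  # used later for sc.setLogLevel("ERROR")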

data = [

(0, "06-26-2011", 300.4, "Exercise", "GymnasticsPro", "cash"),


(1, "05-26-2011", 200.0, "Exercise Band", "Weightlifting", "credit"),
(2, "06-01-2011", 300.4, "Exercise", "Gymnastics Pro", "cash"),
(3, "06-05-2011", 100.0, "Gymnastics", "Rings", "credit"),
(4, "12-17-2011", 300.0, "Team Sports", "Field", "cash"),
(5, "02-14-2011", 200.0, "Gymnastics", None, "cash"),
(6, "06-05-2011", 100.0, "Exercise", "Rings", "credit"),
(7, "12-17-2011", 300.0, "Team Sports", "Field", "cash"),
(8, "02-14-2011", 200.0, "Gymnastics", None, "cash")
]

df = spark.createDataFrame(data, ["id", "tdate", "amount", "category", "product", "spendby"])
df.show()

data2 = [
(4, "12-17-2011", 300.0, "Team Sports", "Field", "cash"),
(5, "02-14-2011", 200.0, "Gymnastics", None, "cash"),
(6, "02-14-2011", 200.0, "Winter", None, "cash"),
(7, "02-14-2011", 200.0, "Winter", None, "cash")
]

df1 = spark.createDataFrame(data2, ["id", "tdate", "amount", "category", "product", "spendby"])
df1.show()

data4 = [
(1, "raj"),
(2, "ravi"),
(3, "sai"),
(5, "rani")
]

cust = spark.createDataFrame(data4, ["id", "name"])
cust.show()

data3 = [
(1, "mouse"),
(3, "mobile"),
(7, "laptop")
]

prod = spark.createDataFrame(data3, ["id", "product"])
prod.show()

# Register DataFrames as temporary views


df.createOrReplaceTempView("df")
df1.createOrReplaceTempView("df1")
cust.createOrReplaceTempView("cust")
prod.createOrReplaceTempView("prod")


sc.setLogLevel("ERROR")
spark.sql("select * from df order by id").show()
spark.sql("select * from df1 order by id").show()

====================================
Validate data
====================================
spark.sql("select * from df ").show()

====================================
Select two columns
====================================

spark.sql("select id,tdate from df order by id").show()

====================================
Select column with category filter = Exercise
====================================
spark.sql("select id,tdate,category from df where category='Exercise'
order by id").show()

====================================
Multi Column filter
====================================

spark.sql("select id,tdate,category,spendby from df where category='Exercise'


and spendby='cash' ").show()
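
For comparison, the same multi-column filter written with the DataFrame API (a sketch, assuming the df created above):

df.filter((df.category == "Exercise") & (df.spendby == "cash")) \
  .select("id", "tdate", "category", "spendby").show()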

====================================
Multi Value Filter
====================================
spark.sql("select * from df where category in
('Exercise','Gymnastics')").show()

====================================
Like Filter
====================================

spark.sql("select * from df where product like ('%Gymnastics%')").show()

====================================
Not Filters
====================================

spark.sql("select * from df where category != 'Exercise'").show()


====================================
Not In Filters
====================================

spark.sql("select * from df where category not in


('Exercise','Gymnastics')").show()

====================================
Null Filters
====================================
spark.sql("select * from df where product is null").show()
====================================
Not Null Filters
====================================
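spark.sql("select * from df where product is not null").show()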

====================================
Max Function
====================================

spark.sql("select max(id) from df ").show()

====================================
Min Function
====================================
spark.sql("select min(id) from df ").show()

====================================
Count
====================================

spark.sql("select count(1) from df ").show()

====================================
Condition statement
====================================

spark.sql("select *,case when spendby='cash' then 1 else 0 end as status from


df ").show()
====================================
Concat data
====================================
spark.sql("select id,category,concat(id,'-',category) as condata from df").show()

====================================
Concat_ws data
====================================
spark.sql("select
id,category,product,concat_ws('-',id,cate
gory,product) as condata from df").show()

====================================
Lower Case data
====================================
spark.sql("select category,lower(category) as lower from df ").show()

====================================
Ceil data
====================================

spark.sql("select amount,ceil(amount) as ceil from df").show()

====================================
Round the data
====================================

spark.sql("select amount,round(amount) as round from df").show()


====================================
Replace Nulls
====================================

spark.sql("select
product,coalesce(product,'NA') as nullrep
from df").show()
====================================
Trim the space
====================================

spark.sql("select trim(product) from df").show()


====================================
Distinct the columns
====================================

spark.sql("select distinct category,spendby from df").show()

====================================
Substring
====================================

spark.sql("select substring(product,1,10)
as sub from df").show()
====================================
Substring/Split operation
====================================

spark.sql("select SUBSTRING_INDEX(category,' ',1) as spl from df").show()


====================================
Union all
====================================

spark.sql("select * from df union all select * from df1").show()


====================================
Union
====================================

spark.sql("select * from df union select * from df1 order by id").show()

====================================
Aggregate Sum
====================================

spark.sql("select category, sum(amount) as total from df group by


category").show()
====================================
Aggregate sum with two columns
====================================

spark.sql("select category,spendby,sum(amount) as total from df group by


category,spendby").show()

====================================
Aggregate Count
====================================

spark.sql("select category,spendby,sum(amount) As total,count(amount) as


cnt from df group by category,spendby").show()
====================================
Aggregate Max
====================================

spark.sql("select category, max(amount) as max from df group by


category").show()

====================================
Aggregate with Order Descending
====================================

spark.sql("select category, max(amount) as max from df group by category


order by category desc").show()
====================================
Window Row Number
====================================

spark.sql("SELECT category,amount, row_number() OVER ( partition by


category order by amount desc ) AS row_number FROM df").show()
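
The same row_number window expressed with the DataFrame API (a sketch, assuming the df above):

from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

win = Window.partitionBy("category").orderBy(col("amount").desc())
df.withColumn("row_number", row_number().over(win)).show()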

====================================
Window Dense_rank Number
====================================
spark.sql("SELECT category,amount, dense_rank() OVER ( partition by category
order by amount desc ) AS dense_rank FROM df").show()

====================================
Window rank Number
====================================

spark.sql("SELECT category,amount, rank() OVER ( partition by category order


by amount desc ) AS rank FROM df").show()
====================================
Window Lead function
====================================

spark.sql("SELECT category,amount, lead(amount) OVER ( partition by category


order by amount desc ) AS lead FROM df").show()
====================================
Window lag function
====================================

spark.sql("SELECT category,amount, lag(amount) OVER ( partition by category


order by amount desc ) AS lag FROM df").show()

====================================
Having function
====================================

spark.sql("select category,count(category) as cnt from df group by category


having count(category)>1").show()
====================================
Inner Join
====================================
spark.sql("select a.id,a.name,b.product from cust a join prod b on
a.id=b.id").show()
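
The same inner join written with the DataFrame API (a sketch):

cust.join(prod, cust.id == prod.id, "inner").select(cust.id, cust.name, prod.product).show()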

====================================
Left Join
====================================
spark.sql("select a.id,a.name,b.product from cust a left join prod b on
a.id=b.id").show()
====================================
Right Join
====================================

spark.sql("select a.id,a.name,b.product from cust a right join prod b on


a.id=b.id").show()

====================================
Full Join
====================================

spark.sql("select a.id,a.name,b.product from cust a full join prod b on


a.id=b.id").show()
====================================
left anti Join
====================================

spark.sql("select a.id,a.name from cust a LEFT ANTI JOIN prod b on


a.id=b.id").show()
====================================
Date format
====================================

spark.sql("select id,tdate,from_unixtime(unix_timestamp(tdate,'MM-dd-
yyyy'),'yyyy-MM-dd') as con_date from df").show()
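
On Spark 2.2 and later the same conversion can be written with to_date; a sketch, assuming that version or newer:

spark.sql("select id,tdate,date_format(to_date(tdate,'MM-dd-yyyy'),'yyyy-MM-dd') as con_date from df").show()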

====================================
Sub query
====================================

spark.sql("""

select sum(amount) as total , con_date from(


select id,tdate,from_unixtime(unix_timestamp(tdate,'MM-dd-yyyy'),'yyyy-MM-
dd') as con_date,amount,category,product,spendby from df)

group by con_date

""").show()

====================================
collect_list
====================================

spark.sql("select category,collect_list(spendby) as col_spend from df group by


category").show()
====================================
collect_set
====================================

spark.sql("select category,collect_set(spendby) as col_spend from df group by


category ").show()

====================================
Explode
====================================
spark.sql("select category,explode(col_spend) as ex_spend from (select
category,collect_set(spendby) as col_spend from df group by
category)").show()

====================================
explode_outer
====================================

spark.sql("select category,explode_outer(col_spend) as ex_spend from (select


category,collect_set(spendby) as col_spend from df group by
category)").show()
