SQL & PySpark Equivalence: A Comprehensive Guide
Structured Query Language (SQL) and PySpark are powerful tools for large-scale data processing. SQL is widely
used for querying and managing structured data in relational databases, while PySpark, built on Apache Spark,
excels in distributed computing and big data analytics.
This guide provides a side-by-side comparison of key SQL operations and their PySpark equivalents, covering
data types, database operations, table alterations, partitioning, views, schema management, file operations,
queries, aggregations, string and date functions, conditional logic, joins, grouping, set operations, window
functions, and CTEs. It aims to help data professionals transition seamlessly between SQL and PySpark in
hybrid environments.
1. Data Types
| SQL Data Type | PySpark Equivalent |
| --- | --- |
| INT | IntegerType() |
| BIGINT | LongType() |
| FLOAT | FloatType() |
| DOUBLE | DoubleType() |
| CHAR(n) / VARCHAR(n) | StringType() |
| DATE | DateType() |
| TIMESTAMP | TimestampType() |
2. Database & Table Operations
| Concept | SQL Query | PySpark Equivalent |
| --- | --- | --- |
| Create Database | CREATE DATABASE db_name; | spark.sql("CREATE DATABASE db_name") |
| Use Database | USE db_name; | spark.catalog.setCurrentDatabase("db_name") |
| Drop Database | DROP DATABASE db_name; | spark.sql("DROP DATABASE db_name") |
| Show Databases | SHOW DATABASES; | spark.sql("SHOW DATABASES").show() |
| Create Table | CREATE TABLE table_name (col1 INT, col2 STRING); | df.write.format("parquet").saveAsTable("table_name") |
| Drop Table | DROP TABLE table_name; | spark.sql("DROP TABLE IF EXISTS table_name") |
| Truncate Table | TRUNCATE TABLE table_name; | spark.sql("TRUNCATE TABLE table_name") |
| Describe Table | DESCRIBE TABLE table_name; | df.printSchema() |
| Show Tables | SHOW TABLES; | spark.sql("SHOW TABLES").show() |
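
A minimal end-to-end sketch of the calls above, assuming a local SparkSession; the database and table names (demo_db, people) are placeholders chosen for illustration:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Create and switch to a database (names are illustrative)
spark.sql("CREATE DATABASE IF NOT EXISTS demo_db")
spark.catalog.setCurrentDatabase("demo_db")

# Persist a small DataFrame as a managed table
df = spark.createDataFrame([(1, "alice"), (2, "bob")], ["id", "name"])
df.write.format("parquet").mode("overwrite").saveAsTable("people")

spark.sql("SHOW TABLES").show()              # lists tables in demo_db
spark.sql("DESCRIBE TABLE people").show()    # column names and types
spark.sql("DROP TABLE IF EXISTS people")
```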
3. Table Alterations
| Concept | SQL Query | PySpark Equivalent |
| --- | --- | --- |
| Add Column | ALTER TABLE table_name ADD COLUMN col3 STRING; | df.withColumn("col3", lit(None).cast("string")) |
| Rename Column | ALTER TABLE table_name RENAME COLUMN old_name TO new_name; | df.withColumnRenamed("old_name", "new_name") |
| Drop Column | ALTER TABLE table_name DROP COLUMN col3; | df.drop("col3") |
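
As a sketch, the DataFrame-side changes above chain together like this; column names are hypothetical, and note that each call returns a new DataFrame rather than altering a stored table in place:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "old_name"])

df2 = (df.withColumn("col3", lit(None).cast("string"))  # add a nullable string column
         .withColumnRenamed("old_name", "new_name")     # rename a column
         .drop("col3"))                                  # drop the added column again
df2.printSchema()
```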
4. Partitioning & Bucketing
| Concept | SQL Query | PySpark Equivalent |
| --- | --- | --- |
| Create Partitioned Table | CREATE TABLE table_name (col1 INT, col2 STRING) PARTITIONED BY (col3 STRING); | df.write.partitionBy("col3").format("parquet").saveAsTable("table_name") |
| Insert into Partitioned Table | INSERT INTO table_name PARTITION (col3='value') SELECT col1, col2 FROM source_table; | df.write.mode("append").partitionBy("col3").saveAsTable("table_name") |
| Create Bucketed Table | CREATE TABLE table_name (col1 INT, col2 STRING) CLUSTERED BY (col1) INTO 10 BUCKETS; | df.write.bucketBy(10, "col1").saveAsTable("table_name") |
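
A small sketch of partitioned and bucketed writes, assuming the column layout from the table above and hypothetical table names; bucketed writes must go through saveAsTable rather than save:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, "a", "US"), (2, "b", "DE")], ["col1", "col2", "col3"]
)

# Partitioned table: one directory per distinct col3 value
df.write.mode("overwrite").partitionBy("col3").format("parquet").saveAsTable("part_table")

# Append more rows using the same partition column
df.write.mode("append").partitionBy("col3").saveAsTable("part_table")

# Bucketed table: rows hashed into 10 buckets on col1
df.write.mode("overwrite").bucketBy(10, "col1").sortBy("col1").saveAsTable("bucket_table")
```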
5. Views (Temporary & Permanent)
| Concept | SQL Query | PySpark Equivalent |
| --- | --- | --- |
| Create View | CREATE VIEW view_name AS SELECT * FROM table_name; | df.createOrReplaceTempView("view_name") |
| Drop View | DROP VIEW view_name; | spark.sql("DROP VIEW IF EXISTS view_name") |
| Create Global View | CREATE GLOBAL TEMPORARY VIEW view_name AS SELECT * FROM table_name; | df.createGlobalTempView("view_name") |
| Show Views | SHOW VIEWS; | spark.sql("SHOW VIEWS").show() |
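
The sketch below registers and queries both kinds of temporary view; view names are illustrative, and note that global temporary views live in the reserved global_temp database:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "val"])

df.createOrReplaceTempView("view_name")            # scoped to this SparkSession
spark.sql("SELECT * FROM view_name").show()

df.createGlobalTempView("global_view_name")        # shared across sessions of this application
spark.sql("SELECT * FROM global_temp.global_view_name").show()

spark.sql("DROP VIEW IF EXISTS view_name")
```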
6. Schema Management
Concept SQL Query PySpark Equivalent
from pyspark.sql.types import StructType, StructField,
IntegerType, StringType, DateTypeschema =
Define Schema CREATE TABLE table_name (col1
StructType([StructField("col1", IntegerType(), True),
Manually INT, col2 STRING, col3 DATE);
StructField("col2", StringType(), True), StructField("col3",
DateType(), True)])
Check Schema DESCRIBE TABLE table_name; df.printSchema()
Change Column Data ALTER TABLE table_name ALTER
df.withColumn("col1", col("col1").cast("bigint"))
Type COLUMN col1 TYPE BIGINT;
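
A sketch of defining a schema up front, applying it when reading a file, and then widening a column; the file path and column names are assumptions for illustration:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

spark = SparkSession.builder.getOrCreate()

schema = StructType([
    StructField("col1", IntegerType(), True),
    StructField("col2", StringType(), True),
    StructField("col3", DateType(), True),
])

# Apply the schema explicitly instead of letting Spark infer it (path is hypothetical)
df = spark.read.schema(schema).option("header", "true").csv("path/to/input.csv")
df.printSchema()

# Change col1 from int to bigint
df = df.withColumn("col1", col("col1").cast("bigint"))
```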
7. File-Based Table Operations
| Concept | SQL Query | PySpark Equivalent |
| --- | --- | --- |
| Save as Parquet | N/A (implicit in Hive) | df.write.format("parquet").save("path/to/parquet") |
| Save as Delta Table | CREATE TABLE table_name USING DELTA LOCATION 'path'; | df.write.format("delta").save("path/to/delta") |
| Save as CSV | N/A | df.write.format("csv").option("header", "true").save("path/to/csv") |
| Save as JSON | N/A | df.write.format("json").save("path/to/json") |
| Save as ORC | N/A | df.write.format("orc").save("path/to/orc") |
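
A sketch of writing one DataFrame in the file formats above; the paths are placeholders, each call produces a directory of files, and the Delta write is commented out because it additionally requires the Delta Lake package to be configured:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "val"])

df.write.mode("overwrite").format("parquet").save("path/to/parquet")
df.write.mode("overwrite").format("csv").option("header", "true").save("path/to/csv")
df.write.mode("overwrite").format("json").save("path/to/json")
df.write.mode("overwrite").format("orc").save("path/to/orc")
# df.write.format("delta").save("path/to/delta")   # needs Delta Lake on the classpath
```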
8. Basic SELECT Queries
Concept SQL Query PySpark Equivalent
Select Specific Columns SELECT column1, column2 FROM table; df.select("column1", "column2")
Select All Columns SELECT * FROM table; df.select("*")
Distinct Values SELECT DISTINCT column FROM table; df.select("column").distinct()
WHERE Condition SELECT * FROM table WHERE column = 'value'; df.filter(col("column") == 'value')
ORDER BY SELECT * FROM table ORDER BY column; df.sort("column")
LIMIT Rows SELECT * FROM table LIMIT n; df.limit(n)
COUNT Rows SELECT COUNT(*) FROM table; df.count()
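
Chained together, the building blocks above look like the sketch below; the data and column names are made up for illustration:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", 1), ("b", 2), ("a", 3)], ["column1", "column2"]
)

result = (df.select("column1", "column2")
            .filter(col("column1") == "a")   # WHERE column1 = 'a'
            .sort("column2")                 # ORDER BY column2
            .limit(10))                      # LIMIT 10
result.show()

print(df.count())                                 # COUNT(*)
print(df.select("column1").distinct().count())    # COUNT(DISTINCT column1)
```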
9. Aggregate Functions
| Concept | SQL Query | PySpark Equivalent |
| --- | --- | --- |
| SUM | SELECT SUM(column) FROM table; | df.agg({"column": "sum"}) |
| AVG | SELECT AVG(column) FROM table; | df.agg({"column": "avg"}) |
| MAX | SELECT MAX(column) FROM table; | df.agg({"column": "max"}) |
| MIN | SELECT MIN(column) FROM table; | df.agg({"column": "min"}) |
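
The dictionary form shown above works; the functions API additionally lets you name the output columns. A sketch on hypothetical data:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (2.0,), (3.0,)], ["column"])

# Equivalent to SELECT SUM(column), AVG(column), MAX(column), MIN(column) FROM table
df.agg(
    F.sum("column").alias("sum"),
    F.avg("column").alias("avg"),
    F.max("column").alias("max"),
    F.min("column").alias("min"),
).show()
```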
10. String Functions
Concept SQL Query PySpark Equivalent
String Length SELECT LEN(column) FROM table; df.select(length(col("column")))
Convert to Uppercase SELECT UPPER(column) FROM table; df.select(upper(col("column")))
Convert to Lowercase SELECT LOWER(column) FROM table; df.select(lower(col("column")))
Concatenate Strings SELECT CONCAT(string1, string2) FROM table; df.select(concat(col("string1"), col("string2")))
Trim String SELECT TRIM(column) FROM table; df.select(trim(col("column")))
SELECT SUBSTRING(column, start, length) FROM df.select(substring(col("column"), start,
Substring
table; length))
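
A sketch applying the string functions above to a throwaway DataFrame; note that substring uses a 1-based start position:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, length, upper, lower, concat, trim, substring

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("  Hello ", "World")], ["string1", "string2"])

df.select(
    length(col("string1")).alias("len"),
    upper(col("string1")).alias("upper"),
    lower(col("string2")).alias("lower"),
    concat(trim(col("string1")), col("string2")).alias("concat_trimmed"),
    substring(col("string2"), 1, 3).alias("first_three"),  # 1-based start position
).show()
```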
11. Date & Time Functions
| Concept | SQL Query | PySpark Equivalent |
| --- | --- | --- |
| Current Date | SELECT CURDATE(); | df.select(current_date()) |
| Current Timestamp | SELECT NOW(); | df.select(current_timestamp()) |
| CAST / CONVERT | SELECT CAST(column AS datatype) FROM table; | df.select(col("column").cast("datatype")) |
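
A short sketch of the date helpers and casting; the string-to-date cast assumes an ISO yyyy-MM-dd value:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date, current_timestamp, col

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("2024-01-31",)], ["column"])

df.select(
    current_date().alias("today"),
    current_timestamp().alias("now"),
    col("column").cast("date").alias("as_date"),   # CAST(column AS DATE)
).show(truncate=False)
```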
12. Conditional Logic
Concept SQL Query PySpark Equivalent
SELECT IF(condition, value1, value2) FROM df.select(when(condition,
IF (Conditional Logic)
table; value1).otherwise(value2))
SELECT COALESCE(column1, column2, df.select(coalesce(col("column1"), col("column2"),
COALESCE
column3) FROM table; col("column3")))
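
A sketch of when/otherwise and coalesce on sample data; the column names and threshold are illustrative:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, coalesce, lit

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(10, None, "x"), (3, "fallback", None)], ["amount", "column1", "column2"]
)

df.select(
    # IF(amount > 5, 'high', 'low')
    when(col("amount") > 5, "high").otherwise("low").alias("bucket"),
    # COALESCE(column1, column2, 'default')
    coalesce(col("column1"), col("column2"), lit("default")).alias("first_non_null"),
).show()
```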
13. Join, Grouping & Pivoting
| Concept | SQL Query | PySpark Equivalent |
| --- | --- | --- |
| JOIN | SELECT * FROM table1 JOIN table2 ON table1.column = table2.column; | df1.join(df2, "column") |
| GROUP BY | SELECT column, agg_function(column) FROM table GROUP BY column; | df.groupBy("column").agg({"column": "agg_function"}) |
| PIVOT | PIVOT (agg_function(column) FOR pivot_column IN (values)); | df.groupBy("group_column").pivot("pivot_column").agg({"column": "agg_function"}) |
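
The sketch below joins two small DataFrames, aggregates, and pivots; the table names, columns, and values are made up for illustration:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
orders = spark.createDataFrame(
    [(1, "2024", 100.0), (1, "2025", 150.0), (2, "2024", 80.0)],
    ["customer_id", "year", "amount"],
)
customers = spark.createDataFrame([(1, "alice"), (2, "bob")], ["customer_id", "name"])

# JOIN ... ON orders.customer_id = customers.customer_id
joined = orders.join(customers, "customer_id")

# GROUP BY name with an aggregate
joined.groupBy("name").agg(F.sum("amount").alias("total")).show()

# PIVOT: one output column per year, summing amount
joined.groupBy("name").pivot("year").agg(F.sum("amount")).show()
```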
14. Logical Operators
| Concept | SQL Query | PySpark Equivalent |
| --- | --- | --- |
| AND / OR | SELECT * FROM table WHERE column1 = value AND column2 > value; | df.filter((col("column1") == value) & (col("column2") > value)) |
| IS NULL / IS NOT NULL | SELECT * FROM table WHERE column IS NULL; | df.filter(col("column").isNull()) / df.filter(col("column").isNotNull()) |
| LIKE | SELECT * FROM table WHERE column LIKE 'value%'; | df.filter(col("column").like("value%")) |
| BETWEEN | SELECT * FROM table WHERE column BETWEEN value1 AND value2; | df.filter((col("column") >= value1) & (col("column") <= value2)) |
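
A sketch of the filter expressions above; note that & and | need parentheses around each comparison because of Python operator precedence:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("abc", 5), ("abd", 15), (None, 7)], ["column", "value_col"]
)

df.filter((col("value_col") > 1) & (col("value_col") < 10)).show()     # AND
df.filter(col("column").isNull() | col("column").like("ab%")).show()   # OR, IS NULL, LIKE
df.filter(col("value_col").between(5, 15)).show()                      # BETWEEN is inclusive
```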
15. Set Operations
| Concept | SQL Query | PySpark Equivalent |
| --- | --- | --- |
| UNION | SELECT column FROM table1 UNION SELECT column FROM table2; | df1.select("column").union(df2.select("column")).distinct() |
| UNION ALL | SELECT column FROM table1 UNION ALL SELECT column FROM table2; | df1.select("column").union(df2.select("column")) |
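
A sketch that makes the deduplication difference explicit: union() keeps duplicates (SQL UNION ALL), and adding distinct() matches SQL UNION; the data is illustrative:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df1 = spark.createDataFrame([(1,), (2,)], ["column"])
df2 = spark.createDataFrame([(2,), (3,)], ["column"])

df1.union(df2).show()              # UNION ALL: 1, 2, 2, 3
df1.union(df2).distinct().show()   # UNION: 1, 2, 3
```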
16. Window Functions
| Concept | SQL Query | PySpark Equivalent |
| --- | --- | --- |
| RANK() | SELECT column, RANK() OVER (PARTITION BY col2 ORDER BY column) FROM table; | df.withColumn("rank", rank().over(Window.partitionBy("col2").orderBy("column"))) |
| DENSE_RANK() | SELECT column, DENSE_RANK() OVER (PARTITION BY col2 ORDER BY column) FROM table; | df.withColumn("dense_rank", dense_rank().over(Window.partitionBy("col2").orderBy("column"))) |
| ROW_NUMBER() | SELECT column, ROW_NUMBER() OVER (PARTITION BY col2 ORDER BY column) FROM table; | df.withColumn("row_number", row_number().over(Window.partitionBy("col2").orderBy("column"))) |
| LEAD() | SELECT column, LEAD(column, 1) OVER (PARTITION BY col2 ORDER BY column) FROM table; | df.withColumn("lead_value", lead("column", 1).over(Window.partitionBy("col2").orderBy("column"))) |
| LAG() | SELECT column, LAG(column, 1) OVER (PARTITION BY col2 ORDER BY column) FROM table; | df.withColumn("lag_value", lag("column", 1).over(Window.partitionBy("col2").orderBy("column"))) |
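
A sketch of the ranking and offset functions above over one shared window specification; the partitioning and ordering columns are hypothetical:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import rank, dense_rank, row_number, lead, lag
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", 1), ("a", 2), ("a", 2), ("b", 5)], ["col2", "column"]
)

w = Window.partitionBy("col2").orderBy("column")

(df.withColumn("rank", rank().over(w))
   .withColumn("dense_rank", dense_rank().over(w))
   .withColumn("row_number", row_number().over(w))
   .withColumn("lead_value", lead("column", 1).over(w))
   .withColumn("lag_value", lag("column", 1).over(w))
   .show())
```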
17. Common Table Expressions (CTEs)
| Concept | SQL Query | PySpark Equivalent |
| --- | --- | --- |
| CTE | WITH cte1 AS (SELECT * FROM table1) SELECT * FROM cte1 WHERE condition; | df.createOrReplaceTempView("cte1"); df_cte1 = spark.sql("SELECT * FROM cte1 WHERE condition") |
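
Two equivalent sketches: registering a temp view that stands in for the CTE, or passing the WITH clause directly to spark.sql; the view/table names and the condition are illustrative:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (20, "b")], ["id", "val"])

# Option 1: temp view standing in for the CTE
df.createOrReplaceTempView("cte1")
spark.sql("SELECT * FROM cte1 WHERE id > 10").show()

# Option 2: Spark SQL also accepts WITH syntax directly
df.createOrReplaceTempView("table1")
spark.sql("WITH cte1 AS (SELECT * FROM table1) SELECT * FROM cte1 WHERE id > 10").show()
```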