Databricks vs SQL Cheat Sheet
Azure Databricks Spark vs. SQL Functions Cheat Sheet
1. Syntax Basics
Operation | Spark Example (PySpark) | SQL Equivalent
Select Columns
  Spark (PySpark): df.select("column1", "column2")
  SQL Equivalent: SELECT column1, column2 FROM table
Filter Rows
  Spark (PySpark): df.filter(col("column") > 10)
  SQL Equivalent: SELECT * FROM table WHERE column > 10
Alias Columns
  Spark (PySpark): df.select(col("column").alias("alias"))
  SQL Equivalent: SELECT column AS alias FROM table
3. Aggregations
Operation | Spark Example (PySpark) | SQL Equivalent
Group By and Aggregate
  Spark (PySpark): df.groupBy("column").agg(avg("value"))
  SQL Equivalent: SELECT column, AVG(value) FROM table GROUP BY column
Count Rows
  Spark (PySpark): df.count()
  SQL Equivalent: SELECT COUNT(*) FROM table
Aggregate Functions
  Spark (PySpark): df.agg(sum("value"), max("value"))
  SQL Equivalent: SELECT SUM(value), MAX(value) FROM table
4. Joins
Operation | Spark Example (PySpark) | SQL Equivalent
Inner Join
  Spark (PySpark): df1.join(df2, "key", "inner")
  SQL Equivalent: SELECT * FROM df1 INNER JOIN df2 ON df1.key = df2.key
Left Join
  Spark (PySpark): df1.join(df2, "key", "left")
  SQL Equivalent: SELECT * FROM df1 LEFT JOIN df2 ON df1.key = df2.key
Cross Join
  Spark (PySpark): df1.crossJoin(df2)
  SQL Equivalent: SELECT * FROM df1 CROSS JOIN df2
6. String Functions
Operation | Spark Example (PySpark) | SQL Equivalent
Substring
  Spark (PySpark): df.select(substring("column", 1, 3))
  SQL Equivalent: SELECT SUBSTRING(column, 1, 3) FROM table
String Contains
  Spark (PySpark): df.filter(col("column").contains("value"))
  SQL Equivalent: SELECT * FROM table WHERE column LIKE '%value%'
String Replace
  Spark (PySpark): df.select(regexp_replace("column", "x", "y"))
  SQL Equivalent: SELECT REPLACE(column, 'x', 'y') FROM table
8. Window Functions
Operation | Spark Example (PySpark) | SQL Equivalent
Row Number
  Spark (PySpark): df.withColumn("row_num", row_number().over(Window.partitionBy("column")))
  SQL Equivalent: SELECT column, ROW_NUMBER() OVER (PARTITION BY column) AS row_num FROM table
Rank
  Spark (PySpark): df.withColumn("rank", rank().over(Window.partitionBy("column").orderBy("value")))
  SQL Equivalent: SELECT column, RANK() OVER (PARTITION BY column ORDER BY value) AS rank FROM table
9. Null Handling
Operation | Spark Example (PySpark) | SQL Equivalent
Drop Null Rows
  Spark (PySpark): df.na.drop()
  SQL Equivalent: DELETE FROM table WHERE column IS NULL
Replace Null Values
  Spark (PySpark): df.na.fill("value", ["column"])
  SQL Equivalent: SELECT COALESCE(column, 'value') FROM table
10. Miscellaneous
Operation | Spark Example (PySpark) | SQL Equivalent
Distinct Values
  Spark (PySpark): df.select("column").distinct()
  SQL Equivalent: SELECT DISTINCT column FROM table
Sample Rows
  Spark (PySpark): df.sample(fraction=0.1)
  SQL Equivalent: SELECT * FROM table TABLESAMPLE (10 PERCENT)
Create Temp View
  Spark (PySpark): df.createOrReplaceTempView("view_name")
  SQL Equivalent: Not applicable in SQL directly