SQL and PySpark
Shwetank Singh, GritSetGrow - GSGLearn.com

Select Columns
SQL:     SELECT column1, column2 FROM table;
PySpark: df.select("column1", "column2")

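The PySpark one-liners throughout this deck assume a running SparkSession plus the usual functions and Window imports, which the slides never show. A minimal setup sketch; the DataFrame and its columns (id, name, dept, salary) are illustrative, not from the slides:

    from pyspark.sql import SparkSession, functions as F
    from pyspark.sql.window import Window

    spark = SparkSession.builder.appName("sql-vs-pyspark").getOrCreate()

    # Small illustrative DataFrame reused in the sketches that follow
    df = spark.createDataFrame(
        [(1, "alice", "eng", 100.0), (2, "bob", "eng", 80.0), (3, "cara", "hr", 90.0)],
        ["id", "name", "dept", "salary"],
    )

    # Select Columns: SELECT name, salary FROM employees;
    df.select("name", "salary").show()
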
Filter Rows
SQL:     SELECT * FROM table WHERE condition;
PySpark: df.filter("condition")

Aggregate Functions
SQL:     SELECT AVG(column) FROM table;
PySpark: df.select(F.avg("column"))

Group By
SQL:     SELECT column, COUNT(*) FROM table GROUP BY column;
PySpark: df.groupBy("column").count()

Order By
SQL:     SELECT * FROM table ORDER BY column ASC;
PySpark: df.orderBy("column", ascending=True)

Join
SQL:     SELECT * FROM table1 JOIN table2 ON table1.id = table2.id;
PySpark: df1.join(df2, df1.id == df2.id)

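A rough sketch of the join mapping above, using a hypothetical departments lookup alongside the sample df from the setup block:

    # Hypothetical lookup table for the join sketches
    depts = spark.createDataFrame([("eng", "Engineering"), ("hr", "HR")], ["dept", "dept_name"])

    # SELECT * FROM employees e JOIN departments d ON e.dept = d.dept;
    joined = df.join(depts, on="dept", how="inner")
    joined.show()

    # Joining on an expression keeps both key columns; joining on a column name ("dept") keeps one.
    joined_expr = df.join(depts, df.dept == depts.dept)
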
Union
SQL:     SELECT * FROM table1 UNION SELECT * FROM table2;
PySpark: df1.union(df2).distinct()   (df1.union(df2) alone keeps duplicates, like UNION ALL)

Limit
SQL:     SELECT * FROM table LIMIT 100;
PySpark: df.limit(100)

Distinct Values
SQL:     SELECT DISTINCT column FROM table;
PySpark: df.select("column").distinct()

Adding a New Column
SQL:     SELECT *, (column1 + column2) AS new_column FROM table;
PySpark: df.withColumn("new_column", F.col("column1") + F.col("column2"))

Column Alias
SQL:     SELECT column AS alias_name FROM table;
PySpark: df.select(F.col("column").alias("alias_name"))

Filtering on Multiple Conditions
SQL:     SELECT * FROM table WHERE condition1 AND condition2;
PySpark: df.filter((F.col("condition1")) & (F.col("condition2")))

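One detail worth calling out for the multi-condition filter above: PySpark uses &, | and ~ instead of AND, OR and NOT, and each comparison must be parenthesized because Python's bitwise operators bind tighter than comparisons. A small sketch on the assumed sample columns:

    # WHERE salary > 85 AND dept = 'eng'
    df.filter((F.col("salary") > 85) & (F.col("dept") == "eng"))

    # WHERE dept = 'hr' OR salary >= 100
    df.filter((F.col("dept") == "hr") | (F.col("salary") >= 100))

    # WHERE NOT (dept = 'eng')
    df.filter(~(F.col("dept") == "eng"))
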
Subquery
SQL:     SELECT * FROM (SELECT * FROM table WHERE condition) AS subquery;
PySpark: df.filter("condition").alias("subquery")

Between
SQL:     SELECT * FROM table WHERE column BETWEEN val1 AND val2;
PySpark: df.filter(F.col("column").between("val1", "val2"))

Like
SQL:     SELECT * FROM table WHERE column LIKE pattern;
PySpark: df.filter(F.col("column").like("pattern"))

Case When
SQL:     SELECT CASE WHEN condition THEN result1 ELSE result2 END FROM table;
PySpark: df.select(F.when(F.col("condition"), "result1").otherwise("result2"))

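A short sketch of CASE WHEN against the assumed salary column; a chain of F.when calls reads top to bottom like the SQL form:

    # CASE WHEN salary >= 90 THEN 'high' ELSE 'low' END AS band
    df.select(
        F.when(F.col("salary") >= 90, "high").otherwise("low").alias("band")
    )
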
Cast Data Type
SQL:     SELECT CAST(column AS datatype) FROM table;
PySpark: df.select(F.col("column").cast("datatype"))

Count Distinct
SQL:     SELECT COUNT(DISTINCT column) FROM table;
PySpark: df.select(F.countDistinct("column"))

Substring
SQL:     SELECT SUBSTRING(column, start, length) FROM table;
PySpark: df.select(F.substring("column", start, length))

Concatenate Columns
SQL:     SELECT CONCAT(column1, column2) AS new_column FROM table;
PySpark: df.withColumn("new_column", F.concat(F.col("column1"), F.col("column2")))

Average Over Partition
SQL:     SELECT AVG(column) OVER (PARTITION BY column2) FROM table;
PySpark: df.withColumn("avg", F.avg("column").over(Window.partitionBy("column2")))

Sum Over Partition
SQL:     SELECT SUM(column) OVER (PARTITION BY column2) FROM table;
PySpark: df.withColumn("sum", F.sum("column").over(Window.partitionBy("column2")))

Lead Function
SQL:     SELECT LEAD(column, 1) OVER (ORDER BY column2) FROM table;
PySpark: df.withColumn("lead", F.lead("column", 1).over(Window.orderBy("column2")))

Lag Function
SQL:     SELECT LAG(column, 1) OVER (ORDER BY column2) FROM table;
PySpark: df.withColumn("lag", F.lag("column", 1).over(Window.orderBy("column2")))

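The four window entries above share one pattern: build a Window spec, then apply an aggregate or analytic function with .over(). A combined sketch on the assumed dept and salary columns:

    w_dept = Window.partitionBy("dept")
    w_ordered = Window.partitionBy("dept").orderBy("salary")

    df.withColumn("dept_avg", F.avg("salary").over(w_dept)) \
      .withColumn("dept_sum", F.sum("salary").over(w_dept)) \
      .withColumn("next_salary", F.lead("salary", 1).over(w_ordered)) \
      .withColumn("prev_salary", F.lag("salary", 1).over(w_ordered)) \
      .show()
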
Row Count
SQL:     SELECT COUNT(*) FROM table;
PySpark: df.count()

Drop Column
SQL:     ALTER TABLE table DROP COLUMN column;   (not a SELECT operation)
PySpark: df.drop("column")

Rename Column
SQL:     ALTER TABLE table RENAME COLUMN column1 TO column2;   (not a SELECT operation)
PySpark: df.withColumnRenamed("column1", "column2")

Change Column Type
SQL:     ALTER TABLE table ALTER COLUMN column TYPE new_type;   (not a SELECT operation)
PySpark: df.withColumn("column", df["column"].cast("new_type"))

Creating a Table from Select
SQL:     CREATE TABLE new_table AS SELECT * FROM table;
PySpark: df.write.format("parquet").saveAsTable("new_table")

Inserting Selected Data into Table
SQL:     INSERT INTO table2 SELECT * FROM table1;
PySpark: df1.write.insertInto("table2")

Creating a Table with Specific Columns
SQL:     CREATE TABLE new_table AS SELECT column1, column2 FROM table;
PySpark: df.select("column1", "column2").write.format("parquet").saveAsTable("new_table")

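Both saveAsTable and insertInto go through the session catalog, so they assume a metastore-backed (for example Hive-enabled) session, and insertInto additionally assumes the target table already exists with a compatible schema. A hedged sketch with hypothetical table names:

    # CREATE TABLE new_table AS SELECT name, salary FROM employees;
    df.select("name", "salary").write.format("parquet").saveAsTable("new_table")

    # INSERT INTO existing_table SELECT * FROM employees;
    # insertInto matches columns by position, not by name.
    df.write.insertInto("existing_table")
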
Aggregate with Alias
SQL:     SELECT column, COUNT(*) AS count FROM table GROUP BY column;
PySpark: df.groupBy("column").agg(F.count("*").alias("count"))

Nested Subquery
SQL:     SELECT * FROM (SELECT * FROM table WHERE condition) sub WHERE sub.condition2;
PySpark: df.filter("condition").alias("sub").filter("sub.condition2")

Multiple Joins
SQL:     SELECT * FROM table1 JOIN table2 ON table1.id = table2.id JOIN table3 ON table1.id = table3.id;
PySpark: df1.join(df2, "id").join(df3, "id")

Cross Join
SQL:     SELECT * FROM table1 CROSS JOIN table2;
PySpark: df1.crossJoin(df2)

Group By Having Count Greater Than
SQL:     SELECT column, COUNT(*) FROM table GROUP BY column HAVING COUNT(*) > 1;
PySpark: df.groupBy("column").count().filter(F.col("count") > 1)

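DataFrames have no HAVING clause, so the pattern above simply filters after the aggregation. A sketch on the assumed dept column:

    # SELECT dept, COUNT(*) AS cnt FROM employees GROUP BY dept HAVING COUNT(*) > 1;
    (df.groupBy("dept")
       .agg(F.count("*").alias("cnt"))
       .filter(F.col("cnt") > 1)
       .show())
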
Alias for Table in Join
SQL:     SELECT t1.* FROM table1 t1 JOIN table2 t2 ON t1.id = t2.id;
PySpark: df1.alias("t1").join(df2.alias("t2"), F.col("t1.id") == F.col("t2.id"))

Selecting from Multiple Tables
SQL:     SELECT t1.column, t2.column FROM table1 t1, table2 t2 WHERE t1.id = t2.id;
PySpark: df1.join(df2, df1.id == df2.id).select(df1.column, df2.column)

Case When with Multiple Conditions
SQL:     SELECT CASE WHEN condition THEN 'value1' WHEN condition2 THEN 'value2' ELSE 'value3' END FROM table;
PySpark: df.select(F.when(F.col("condition"), "value1").when(F.col("condition2"), "value2").otherwise("value3"))

Extracting Date Parts
SQL:     SELECT EXTRACT(YEAR FROM date_column) FROM table;
PySpark: df.select(F.year(F.col("date_column")))

Inequality Filtering
SQL:     SELECT * FROM table WHERE column != 'value';
PySpark: df.filter(df.column != 'value')

In List
SQL:     SELECT * FROM table WHERE column IN ('value1', 'value2');
PySpark: df.filter(df.column.isin('value1', 'value2'))

Not In List
SQL:     SELECT * FROM table WHERE column NOT IN ('value1', 'value2');
PySpark: df.filter(~df.column.isin('value1', 'value2'))

Null Values
SQL:     SELECT * FROM table WHERE column IS NULL;
PySpark: df.filter(df.column.isNull())

Not Null Values
SQL:     SELECT * FROM table WHERE column IS NOT NULL;
PySpark: df.filter(df.column.isNotNull())

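A quick sketch of the membership and null filters on the assumed dept column; note that ~isin() drops rows where the column is NULL, matching SQL's NOT IN behaviour, so null handling usually needs an explicit isNull/isNotNull check:

    # WHERE dept IN ('eng', 'hr')
    df.filter(df.dept.isin("eng", "hr"))

    # WHERE dept NOT IN ('eng', 'hr')   (rows where dept IS NULL are dropped by both forms)
    df.filter(~df.dept.isin("eng", "hr"))

    # WHERE dept IS NULL / IS NOT NULL
    df.filter(df.dept.isNull())
    df.filter(df.dept.isNotNull())
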
String Upper Case
SQL:     SELECT UPPER(column) FROM table;
PySpark: df.select(F.upper(df.column))

String Lower Case
SQL:     SELECT LOWER(column) FROM table;
PySpark: df.select(F.lower(df.column))

String Length
SQL:     SELECT LENGTH(column) FROM table;
PySpark: df.select(F.length(df.column))

Trim String
SQL:     SELECT TRIM(column) FROM table;
PySpark: df.select(F.trim(df.column))

Left Trim String
SQL:     SELECT LTRIM(column) FROM table;
PySpark: df.select(F.ltrim(df.column))

Right Trim String
SQL:     SELECT RTRIM(column) FROM table;
PySpark: df.select(F.rtrim(df.column))

String Replace
SQL:     SELECT REPLACE(column, 'find', 'replace') FROM table;
PySpark: df.select(F.regexp_replace(df.column, 'find', 'replace'))

Substring Index
SQL:     SELECT SUBSTRING_INDEX(column, 'delim', count) FROM table;
PySpark: df.select(F.substring_index(df.column, 'delim', count))

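A combined sketch of the string helpers above on the assumed name column; the pattern and replacement values are placeholders:

    df.select(
        F.upper(df.name).alias("upper_name"),
        F.length(df.name).alias("name_len"),
        F.trim(df.name).alias("trimmed"),
        F.regexp_replace(df.name, "a", "o").alias("replaced"),
        # SUBSTRING_INDEX('a.b.c', '.', 2) -> 'a.b'
        F.substring_index(df.name, ".", 2).alias("prefix"),
    )
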
Date Difference
SQL:     SELECT DATEDIFF('date1', 'date2') FROM table;
PySpark: df.select(F.datediff(F.col('date1'), F.col('date2')))

Add Months to Date
SQL:     SELECT ADD_MONTHS(date_column, num_months) FROM table;
PySpark: df.select(F.add_months(df.date_column, num_months))

First Value in Group
SQL:     SELECT FIRST_VALUE(column) OVER (PARTITION BY column2) FROM table;
PySpark: df.withColumn("first_val", F.first("column").over(Window.partitionBy("column2")))

Last Value in Group
SQL:     SELECT LAST_VALUE(column) OVER (PARTITION BY column2) FROM table;
PySpark: df.withColumn("last_val", F.last("column").over(Window.partitionBy("column2")))

Row Number Over Partition
SQL:     SELECT ROW_NUMBER() OVER (PARTITION BY column ORDER BY column) FROM table;
PySpark: df.withColumn("row_num", F.row_number().over(Window.partitionBy("column").orderBy("column")))

Rank Over Partition
SQL:     SELECT RANK() OVER (PARTITION BY column ORDER BY column) FROM table;
PySpark: df.withColumn("rank", F.rank().over(Window.partitionBy("column").orderBy("column")))

Dense Rank Over Partition
SQL:     SELECT DENSE_RANK() OVER (PARTITION BY column ORDER BY column) FROM table;
PySpark: df.withColumn("dense_rank", F.dense_rank().over(Window.partitionBy("column").orderBy("column")))

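A common use of row_number over a partition is keeping one row per key, for example the highest-paid row per dept in the assumed sample. A sketch:

    w = Window.partitionBy("dept").orderBy(F.col("salary").desc())

    top_per_dept = (df.withColumn("rn", F.row_number().over(w))
                      .filter(F.col("rn") == 1)
                      .drop("rn"))
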
Count Rows
SQL:     SELECT COUNT(*) FROM table;
PySpark: df.count()

Mathematical Operations
SQL:     SELECT column1 + column2 FROM table;
PySpark: df.select(F.col("column1") + F.col("column2"))

String Concatenation
SQL:     SELECT column1 || column2 AS new_column FROM table;
PySpark: df.withColumn("new_column", F.concat(F.col("column1"), F.col("column2")))

Find Minimum Value
SQL:     SELECT MIN(column) FROM table;
PySpark: df.select(F.min("column"))

Find Maximum Value
SQL:     SELECT MAX(column) FROM table;
PySpark: df.select(F.max("column"))

Removing Duplicates
SQL:     SELECT DISTINCT * FROM table;
PySpark: df.distinct()

Left Join
SQL:     SELECT * FROM table1 LEFT JOIN table2 ON table1.id = table2.id;
PySpark: df1.join(df2, df1.id == df2.id, "left")

Right Join
SQL:     SELECT * FROM table1 RIGHT JOIN table2 ON table1.id = table2.id;
PySpark: df1.join(df2, df1.id == df2.id, "right")

Full Outer Join
SQL:     SELECT * FROM table1 FULL OUTER JOIN table2 ON table1.id = table2.id;
PySpark: df1.join(df2, df1.id == df2.id, "outer")

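The three outer-join entries above differ only in the how argument; a sketch reusing the hypothetical depts DataFrame from earlier, where unmatched rows come back with NULLs on the missing side:

    df.join(depts, on="dept", how="left").show()    # keep all rows from df
    df.join(depts, on="dept", how="right").show()   # keep all rows from depts
    df.join(depts, on="dept", how="outer").show()   # keep both sides
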
Group By with Having
SQL:     SELECT column, COUNT(*) FROM table GROUP BY column HAVING COUNT(*) > 10;
PySpark: df.groupBy("column").count().filter(F.col("count") > 10)

Round Decimal Values
SQL:     SELECT ROUND(column, 2) FROM table;
PySpark: df.select(F.round("column", 2))

Get Current Date
SQL:     SELECT CURRENT_DATE();
PySpark: df.select(F.current_date())

Date Addition
SQL:     SELECT DATE_ADD(date_column, 10) FROM table;
PySpark: df.select(F.date_add(F.col("date_column"), 10))

Date Subtraction
SQL:     SELECT DATE_SUB(date_column, 10) FROM table;
PySpark: df.select(F.date_sub(F.col("date_column"), 10))

Extract Year from Date
SQL:     SELECT YEAR(date_column) FROM table;
PySpark: df.select(F.year(F.col("date_column")))

Extract Month from Date
SQL:     SELECT MONTH(date_column) FROM table;
PySpark: df.select(F.month(F.col("date_column")))

Extract Day from Date
SQL:     SELECT DAY(date_column) FROM table;
PySpark: df.select(F.dayofmonth(F.col("date_column")))

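A combined sketch of the date helpers above, assuming a hypothetical DataFrame with a DateType column d:

    hired = spark.createDataFrame([("2024-01-15",), ("2023-06-30",)], ["d_str"]) \
                 .select(F.to_date("d_str").alias("d"))

    hired.select(
        F.current_date().alias("today"),
        F.date_add("d", 10).alias("plus_10_days"),
        F.date_sub("d", 10).alias("minus_10_days"),
        F.year("d").alias("y"),
        F.month("d").alias("m"),
        F.dayofmonth("d").alias("day"),
        F.datediff(F.current_date(), "d").alias("days_since"),
    )
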
Sorting Descending
SQL:     SELECT * FROM table ORDER BY column DESC;
PySpark: df.orderBy(F.col("column").desc())

Group By Multiple Columns
SQL:     SELECT col1, col2, COUNT(*) FROM table GROUP BY col1, col2;
PySpark: df.groupBy("col1", "col2").count()

Conditional Column Update
SQL:     UPDATE table SET column1 = CASE WHEN condition THEN 'value1' ELSE 'value2' END;
PySpark: df.withColumn("column1", F.when(F.col("condition"), "value1").otherwise("value2"))
