SQL and PySpark

Shwetank Singh
GritSetGrow - GSGLearn.com

Select Columns
SQL:      SELECT column1, column2 FROM table;
PySpark:  df.select("column1", "column2")
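
The PySpark snippets below assume the usual imports, an active SparkSession, and an existing DataFrame named df. A minimal setup sketch (the table and column names are illustrative, not part of the original deck):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window   # used by the window-function sketches further down

spark = SparkSession.builder.appName("sql-vs-pyspark").getOrCreate()

# Small sample DataFrame used in the runnable sketches that follow
df = spark.createDataFrame(
    [(1, "Alice", "HR", 5000), (2, "Bob", "IT", 6000), (3, "Cara", "IT", 7000)],
    ["id", "name", "dept", "salary"],
)
df.select("id", "name").show()   # SELECT id, name FROM table;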

Filter Rows
SQL:      SELECT * FROM table WHERE condition;
PySpark:  df.filter("condition")

Aggregate Functions
SQL:      SELECT AVG(column) FROM table;
PySpark:  df.select(F.avg("column"))

Group By
SQL:      SELECT column, COUNT(*) FROM table GROUP BY column;
PySpark:  df.groupBy("column").count()

Order By
SQL:      SELECT * FROM table ORDER BY column ASC;
PySpark:  df.orderBy("column", ascending=True)
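
A short runnable sketch of the four basics above, using the sample df from the setup block (filter values are illustrative):

# Filter, aggregate, group and sort the sample DataFrame
df.filter(F.col("salary") > 5500).show()          # WHERE salary > 5500
df.select(F.avg("salary")).show()                 # AVG(salary)
df.groupBy("dept").count().show()                 # GROUP BY dept with COUNT(*)
df.orderBy("salary", ascending=True).show()       # ORDER BY salary ASC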

Join
SQL:      SELECT * FROM table1 JOIN table2 ON table1.id = table2.id;
PySpark:  df1.join(df2, df1.id == df2.id)
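
A minimal join sketch with two toy DataFrames (the names and data are made up for illustration):

# Inner join two small DataFrames on their id column
emp = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
sal = spark.createDataFrame([(1, 5000), (3, 7000)], ["id", "salary"])

emp.join(sal, emp.id == sal.id).show()            # JOIN ... ON emp.id = sal.id
emp.join(sal, on="id", how="left").show()         # LEFT JOIN using the shared key column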

Union
SQL:      SELECT * FROM table1 UNION SELECT * FROM table2;
PySpark:  df1.union(df2).distinct()
          (DataFrame.union keeps duplicates, like UNION ALL; add .distinct() to match SQL's UNION.)

Limit
SQL:      SELECT * FROM table LIMIT 100;
PySpark:  df.limit(100)

Distinct Values
SQL:      SELECT DISTINCT column FROM table;
PySpark:  df.select("column").distinct()

Adding a New Column
SQL:      SELECT *, (column1 + column2) AS new_column FROM table;
PySpark:  df.withColumn("new_column", F.col("column1") + F.col("column2"))

Column Alias
SQL:      SELECT column AS alias_name FROM table;
PySpark:  df.select(F.col("column").alias("alias_name"))

Filtering on Multiple Conditions
SQL:      SELECT * FROM table WHERE condition1 AND condition2;
PySpark:  df.filter((F.col("condition1")) & (F.col("condition2")))

Subquery
SQL:      SELECT * FROM (SELECT * FROM table WHERE condition) AS subquery;
PySpark:  df.filter("condition").alias("subquery")

Between
SQL:      SELECT * FROM table WHERE column BETWEEN val1 AND val2;
PySpark:  df.filter(F.col("column").between("val1", "val2"))

Like
SQL:      SELECT * FROM table WHERE column LIKE pattern;
PySpark:  df.filter(F.col("column").like("pattern"))
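
A runnable sketch of combined and range filters on the sample df (the values and pattern are illustrative):

# AND of two conditions, a BETWEEN range, and a LIKE pattern
df.filter((F.col("dept") == "IT") & (F.col("salary") > 6000)).show()
df.filter(F.col("salary").between(5000, 6000)).show()
df.filter(F.col("name").like("A%")).show()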

Case When
SQL:      SELECT CASE WHEN condition THEN result1 ELSE result2 END FROM table;
PySpark:  df.select(F.when(F.col("condition"), "result1").otherwise("result2"))

Cast Data Type
SQL:      SELECT CAST(column AS datatype) FROM table;
PySpark:  df.select(F.col("column").cast("datatype"))
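
A short sketch of when/otherwise and cast on the sample df (the "band" label and the string target type are illustrative):

# CASE WHEN salary > 6000 THEN 'high' ELSE 'normal' END, plus a cast
df.select(
    F.when(F.col("salary") > 6000, "high").otherwise("normal").alias("band"),
    F.col("salary").cast("string").alias("salary_str"),
).show()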

Count Distinct
SQL:      SELECT COUNT(DISTINCT column) FROM table;
PySpark:  df.select(F.countDistinct("column"))

Substring
SQL:      SELECT SUBSTRING(column, start, length) FROM table;
PySpark:  df.select(F.substring("column", start, length))

Concatenate Columns
SQL:      SELECT CONCAT(column1, column2) AS new_column FROM table;
PySpark:  df.withColumn("new_column", F.concat(F.col("column1"), F.col("column2")))

Average Over Partition
SQL:      SELECT AVG(column) OVER (PARTITION BY column2) FROM table;
PySpark:  df.withColumn("avg", F.avg("column").over(Window.partitionBy("column2")))

Sum Over Partition
SQL:      SELECT SUM(column) OVER (PARTITION BY column2) FROM table;
PySpark:  df.withColumn("sum", F.sum("column").over(Window.partitionBy("column2")))

Lead Function
SQL:      SELECT LEAD(column, 1) OVER (ORDER BY column2) FROM table;
PySpark:  df.withColumn("lead", F.lead("column", 1).over(Window.orderBy("column2")))

Lag Function
SQL:      SELECT LAG(column, 1) OVER (ORDER BY column2) FROM table;
PySpark:  df.withColumn("lag", F.lag("column", 1).over(Window.orderBy("column2")))
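
A runnable window-function sketch on the sample df, using the Window import from the setup block (the window spec is illustrative):

# Average salary per dept, and the previous salary within each dept ordered by id
w = Window.partitionBy("dept").orderBy("id")
result = (
    df.withColumn("dept_avg", F.avg("salary").over(Window.partitionBy("dept")))
      .withColumn("prev_salary", F.lag("salary", 1).over(w))
)
result.show()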

Row Count
SQL:      SELECT COUNT(*) FROM table;
PySpark:  df.count()

Drop Column
SQL:      ALTER TABLE table DROP COLUMN column;  (not expressible in a plain SELECT)
PySpark:  df.drop("column")

Rename Column
SQL:      ALTER TABLE table RENAME COLUMN column1 TO column2;  (not expressible in a plain SELECT)
PySpark:  df.withColumnRenamed("column1", "column2")

Change Column Type
SQL:      ALTER TABLE table ALTER COLUMN column TYPE new_type;  (not expressible in a plain SELECT)
PySpark:  df.withColumn("column", df["column"].cast("new_type"))
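
A quick sketch chaining the three column operations above on the sample df (the renamed and dropped columns are illustrative):

# Drop, rename and retype columns; each call returns a new DataFrame
df2 = (
    df.drop("dept")
      .withColumnRenamed("name", "employee_name")
      .withColumn("salary", F.col("salary").cast("double"))
)
df2.printSchema()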

Creating a Table from Select
SQL:      CREATE TABLE new_table AS SELECT * FROM table;
PySpark:  df.write.format("parquet").saveAsTable("new_table")

Inserting Selected Data into Table
SQL:      INSERT INTO table2 SELECT * FROM table1;
PySpark:  df1.write.insertInto("table2")

Creating a Table with Specific Columns
SQL:      CREATE TABLE new_table AS SELECT column1, column2 FROM table;
PySpark:  df.select("column1", "column2").write.format("parquet").saveAsTable("new_table")
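
saveAsTable and insertInto write to tables registered in the session catalog, so they assume a session with a usable catalog and warehouse location (for example Hive support or a configured spark-warehouse directory), and insertInto assumes table2 already exists. A sketch that sidesteps the catalog entirely, using an illustrative local path:

# Write the selected columns to a plain parquet path instead of a managed table
df.select("id", "name").write.mode("overwrite").parquet("/tmp/new_table_parquet")

# Read it back to verify
spark.read.parquet("/tmp/new_table_parquet").show()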

Aggregate with Alias
SQL:      SELECT column, COUNT(*) AS count FROM table GROUP BY column;
PySpark:  df.groupBy("column").agg(F.count("*").alias("count"))

Nested Subquery
SQL:      SELECT * FROM (SELECT * FROM table WHERE condition) sub WHERE sub.condition2;
PySpark:  df.filter("condition").alias("sub").filter("sub.condition2")

Multiple Joins
SQL:      SELECT * FROM table1 JOIN table2 ON table1.id = table2.id JOIN table3 ON table1.id = table3.id;
PySpark:  df1.join(df2, "id").join(df3, "id")

Cross Join
SQL:      SELECT * FROM table1 CROSS JOIN table2;
PySpark:  df1.crossJoin(df2)

Group By Having Count Greater Than
SQL:      SELECT column, COUNT(*) FROM table GROUP BY column HAVING COUNT(*) > 1;
PySpark:  df.groupBy("column").count().filter(F.col("count") > 1)

Alias for Table in Join
SQL:      SELECT t1.* FROM table1 t1 JOIN table2 t2 ON t1.id = t2.id;
PySpark:  df1.alias("t1").join(df2.alias("t2"), F.col("t1.id") == F.col("t2.id")).select("t1.*")

Selecting from Multiple Tables
SQL:      SELECT t1.column, t2.column FROM table1 t1, table2 t2 WHERE t1.id = t2.id;
PySpark:  df1.join(df2, df1.id == df2.id).select(df1.column, df2.column)

Case When with Multiple Conditions
SQL:      SELECT CASE WHEN condition THEN 'value1' WHEN condition2 THEN 'value2' ELSE 'value3' END FROM table;
PySpark:  df.select(F.when(F.col("condition"), "value1").when(F.col("condition2"), "value2").otherwise("value3"))

Extracting Date Parts
SQL:      SELECT EXTRACT(YEAR FROM date_column) FROM table;
PySpark:  df.select(F.year(F.col("date_column")))

Inequality Filtering
SQL:      SELECT * FROM table WHERE column != 'value';
PySpark:  df.filter(df.column != 'value')

In List
SQL:      SELECT * FROM table WHERE column IN ('value1', 'value2');
PySpark:  df.filter(df.column.isin('value1', 'value2'))

Not In List
SQL:      SELECT * FROM table WHERE column NOT IN ('value1', 'value2');
PySpark:  df.filter(~df.column.isin('value1', 'value2'))

Null Values
SQL:      SELECT * FROM table WHERE column IS NULL;
PySpark:  df.filter(df.column.isNull())

Not Null Values
SQL:      SELECT * FROM table WHERE column IS NOT NULL;
PySpark:  df.filter(df.column.isNotNull())
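
A runnable sketch of membership and null filters, using a toy DataFrame that actually contains a null (the values are illustrative):

# isin, negated isin, and null checks
people = spark.createDataFrame([("Alice", "HR"), ("Bob", None)], ["name", "dept"])
people.filter(people.dept.isin("HR", "IT")).show()      # dept IN ('HR', 'IT')
people.filter(~people.name.isin("Alice")).show()        # name NOT IN ('Alice')
people.filter(people.dept.isNull()).show()              # dept IS NULL
people.filter(people.dept.isNotNull()).show()           # dept IS NOT NULL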

String Upper Case
SQL:      SELECT UPPER(column) FROM table;
PySpark:  df.select(F.upper(df.column))

String Lower Case
SQL:      SELECT LOWER(column) FROM table;
PySpark:  df.select(F.lower(df.column))

String Length
SQL:      SELECT LENGTH(column) FROM table;
PySpark:  df.select(F.length(df.column))

Trim String
SQL:      SELECT TRIM(column) FROM table;
PySpark:  df.select(F.trim(df.column))

Left Trim String
SQL:      SELECT LTRIM(column) FROM table;
PySpark:  df.select(F.ltrim(df.column))

Right Trim String
SQL:      SELECT RTRIM(column) FROM table;
PySpark:  df.select(F.rtrim(df.column))

String Replace
SQL:      SELECT REPLACE(column, 'find', 'replace') FROM table;
PySpark:  df.select(F.regexp_replace(df.column, 'find', 'replace'))
          (regexp_replace treats 'find' as a regular expression, so escape it if you need a literal match.)

Substring Index
SQL:      SELECT SUBSTRING_INDEX(column, 'delim', count) FROM table;
PySpark:  df.select(F.substring_index(df.column, 'delim', count))
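
A runnable string-function sketch on a toy DataFrame (the email value, pattern, and delimiter are illustrative):

# Case, trim, regex replace and substring_index in one pass
s = spark.createDataFrame([("  Alice@Example.com  ",)], ["email"])
s.select(
    F.upper(F.trim("email")).alias("upper"),
    F.regexp_replace(F.trim("email"), "@.*$", "").alias("local_part"),
    F.substring_index(F.trim("email"), "@", 1).alias("before_at"),
).show(truncate=False)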

Date Difference
SQL:      SELECT DATEDIFF(date1, date2) FROM table;
PySpark:  df.select(F.datediff(F.col("date1"), F.col("date2")))

Add Months to Date
SQL:      SELECT ADD_MONTHS(date_column, num_months) FROM table;
PySpark:  df.select(F.add_months(df.date_column, num_months))

First Value in Group
SQL:      SELECT FIRST_VALUE(column) OVER (PARTITION BY column2) FROM table;
PySpark:  df.withColumn("first_val", F.first("column").over(Window.partitionBy("column2")))

Last Value in Group
SQL:      SELECT LAST_VALUE(column) OVER (PARTITION BY column2) FROM table;
PySpark:  df.withColumn("last_val", F.last("column").over(Window.partitionBy("column2")))

Row Number Over Partition
SQL:      SELECT ROW_NUMBER() OVER (PARTITION BY column ORDER BY column) FROM table;
PySpark:  df.withColumn("row_num", F.row_number().over(Window.partitionBy("column").orderBy("column")))

Rank Over Partition
SQL:      SELECT RANK() OVER (PARTITION BY column ORDER BY column) FROM table;
PySpark:  df.withColumn("rank", F.rank().over(Window.partitionBy("column").orderBy("column")))

Dense Rank Over Partition
SQL:      SELECT DENSE_RANK() OVER (PARTITION BY column ORDER BY column) FROM table;
PySpark:  df.withColumn("dense_rank", F.dense_rank().over(Window.partitionBy("column").orderBy("column")))
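
A runnable ranking sketch on the sample df, partitioning by dept and ordering by salary (an illustrative window; the ranking functions require an orderBy on the window):

# row_number, rank and dense_rank over the same window
w_rank = Window.partitionBy("dept").orderBy(F.col("salary").desc())
ranked = (
    df.withColumn("row_num", F.row_number().over(w_rank))
      .withColumn("rank", F.rank().over(w_rank))
      .withColumn("dense_rank", F.dense_rank().over(w_rank))
)
ranked.show()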

Mathematical Operations
SQL:      SELECT column1 + column2 FROM table;
PySpark:  df.select(F.col("column1") + F.col("column2"))

String Concatenation
SQL:      SELECT column1 || column2 AS new_column FROM table;
PySpark:  df.withColumn("new_column", F.concat(F.col("column1"), F.col("column2")))
          (Use F.concat_ws("-", col1, col2) when you want a separator between the values.)

Find Minimum Value
SQL:      SELECT MIN(column) FROM table;
PySpark:  df.select(F.min("column"))

Find Maximum Value
SQL:      SELECT MAX(column) FROM table;
PySpark:  df.select(F.max("column"))

Removing Duplicates
SQL:      SELECT DISTINCT * FROM table;
PySpark:  df.distinct()

Left Join
SQL:      SELECT * FROM table1 LEFT JOIN table2 ON table1.id = table2.id;
PySpark:  df1.join(df2, df1.id == df2.id, "left")

Right Join
SQL:      SELECT * FROM table1 RIGHT JOIN table2 ON table1.id = table2.id;
PySpark:  df1.join(df2, df1.id == df2.id, "right")

Full Outer Join
SQL:      SELECT * FROM table1 FULL OUTER JOIN table2 ON table1.id = table2.id;
PySpark:  df1.join(df2, df1.id == df2.id, "outer")

Group By with Having
SQL:      SELECT column, COUNT(*) FROM table GROUP BY column HAVING COUNT(*) > 10;
PySpark:  df.groupBy("column").count().filter(F.col("count") > 10)

Round Decimal Values
SQL:      SELECT ROUND(column, 2) FROM table;
PySpark:  df.select(F.round("column", 2))

Get Current Date
SQL:      SELECT CURRENT_DATE();
PySpark:  df.select(F.current_date())

Date Addition
SQL:      SELECT DATE_ADD(date_column, 10) FROM table;
PySpark:  df.select(F.date_add(F.col("date_column"), 10))

Date Subtraction
SQL:      SELECT DATE_SUB(date_column, 10) FROM table;
PySpark:  df.select(F.date_sub(F.col("date_column"), 10))

Extract Year from Date
SQL:      SELECT YEAR(date_column) FROM table;
PySpark:  df.select(F.year(F.col("date_column")))

Extract Month from Date
SQL:      SELECT MONTH(date_column) FROM table;
PySpark:  df.select(F.month(F.col("date_column")))

Extract Day from Date
SQL:      SELECT DAY(date_column) FROM table;
PySpark:  df.select(F.dayofmonth(F.col("date_column")))
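
A runnable date sketch with a toy DataFrame of string dates (the dates themselves are illustrative):

# Parse strings to dates, then shift and decompose them
d = spark.createDataFrame([("2024-01-15",), ("2024-03-01",)], ["day_str"])
d = d.withColumn("day", F.to_date("day_str"))
d.select(
    F.current_date().alias("today"),
    F.date_add("day", 10).alias("plus_10"),
    F.date_sub("day", 10).alias("minus_10"),
    F.datediff(F.current_date(), "day").alias("age_days"),
    F.year("day").alias("y"),
    F.month("day").alias("m"),
    F.dayofmonth("day").alias("d"),
).show()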

Sorting Descending
SQL:      SELECT * FROM table ORDER BY column DESC;
PySpark:  df.orderBy(F.col("column").desc())

Group By Multiple Columns
SQL:      SELECT col1, col2, COUNT(*) FROM table GROUP BY col1, col2;
PySpark:  df.groupBy("col1", "col2").count()

Conditional Column Update
SQL:      UPDATE table SET column1 = CASE WHEN condition THEN 'value1' ELSE 'value2' END;
PySpark:  df.withColumn("column1", F.when(F.col("condition"), "value1").otherwise("value2"))
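
DataFrames are immutable, so the PySpark version does not modify df in place the way UPDATE modifies a table; it returns a new DataFrame that you assign back. A final sketch on the sample df (the column and labels are illustrative):

# "Update" by overwriting the column in a new DataFrame and reassigning
df = df.withColumn("dept", F.when(F.col("dept") == "HR", "People").otherwise(F.col("dept")))
df.show()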
