SQL - PySpark
Question: Filter rows where age is greater than 30.
PySpark: df.filter(df.age > 30).show()
SQL: SELECT * FROM table_name WHERE age > 30;
Question: Group by a column (e.g., department) and count the number of rows.
PySpark: df.groupBy("department").count().show()
SQL: SELECT department, COUNT(*) FROM table_name GROUP BY department;
Question: Perform a left join on two tables.
PySpark: df1.join(df2, "id", "left").show()
SQL: SELECT * FROM table1 LEFT JOIN table2 ON table1.id = table2.id;
Question: Rank rows based on a column (e.g., salary) using window functions.
PySpark:
    from pyspark.sql.window import Window
    from pyspark.sql.functions import rank
    window = Window.orderBy("salary")
    df.withColumn("rank", rank().over(window)).show()
SQL: SELECT *, RANK() OVER (ORDER BY salary) AS rank FROM table_name;
df.withColumn("cumulative_sum",
sum("sales").over(window)).show()
Question: Find the third highest value in a column (e.g., salary).
PySpark:
    from pyspark.sql.window import Window
    from pyspark.sql.functions import row_number, desc, col
    window = Window.orderBy(desc("salary"))
    df.withColumn("row_num", row_number().over(window)).filter(col("row_num") == 3).show()
SQL:
    WITH RankedSalaries AS (
        SELECT salary,
               DENSE_RANK() OVER (ORDER BY salary DESC) AS dense_rank
        FROM employees
    )
    SELECT salary
    FROM RankedSalaries
    WHERE dense_rank = 3;
Note: the SQL side uses DENSE_RANK(), which handles tied salaries, while the PySpark side uses row_number(); with duplicates the two can return different rows.
Question: Find the top N rows based on a column (e.g., salary).
PySpark:
    from pyspark.sql.functions import desc
    df.orderBy(desc("salary")).limit(5).show()
SQL: SELECT * FROM table_name ORDER BY salary DESC LIMIT 5;
Question: Calculate the percentage of total for each row (e.g., sales).
PySpark:
    from pyspark.sql.window import Window
    from pyspark.sql.functions import sum
    window = Window.partitionBy()
    df.withColumn("percentage", (df.sales / sum("sales").over(window)) * 100).show()
SQL: SELECT sales, (sales / SUM(sales) OVER ()) * 100 AS percentage FROM table_name;
Question: Find the nth highest salary department-wise.
PySpark:
    from pyspark.sql.window import Window
    from pyspark.sql.functions import row_number, desc, col
    window = Window.partitionBy("department").orderBy(desc("salary"))
    df.withColumn("row_num", row_number().over(window)).filter(col("row_num") == n).show()
SQL:
    WITH RankedSalaries AS (
        SELECT department, salary,
               ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS row_num
        FROM table_name
    )
    SELECT * FROM RankedSalaries WHERE row_num = n;
Question: Find duplicates using a subquery.
PySpark:
    df.createOrReplaceTempView("temp_table")
    spark.sql("SELECT * FROM temp_table WHERE (name, age) IN (SELECT name, age FROM temp_table GROUP BY name, age HAVING COUNT(*) > 1)").show()
SQL: SELECT * FROM table_name WHERE (name, age) IN (SELECT name, age FROM table_name GROUP BY name, age HAVING COUNT(*) > 1);
Question: Write a query to obtain the third transaction of every user.
SQL:
    WITH cte AS (
        SELECT *,
               ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY transaction_date) AS row_num
        FROM transactions
    )
    SELECT user_id, spend, transaction_date
    FROM cte
    WHERE row_num = 3;
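The PySpark column for this entry is missing; a minimal sketch, assuming the transactions table is loaded as a DataFrame named transactions with user_id, spend, and transaction_date columns:
PySpark (sketch):
    from pyspark.sql.window import Window
    from pyspark.sql.functions import row_number, col
    # number each user's transactions chronologically, then keep the third
    window = Window.partitionBy("user_id").orderBy("transaction_date")
    (transactions.withColumn("row_num", row_number().over(window))
        .filter(col("row_num") == 3)
        .select("user_id", "spend", "transaction_date")
        .show())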
Question: Find the second highest salary among all employees.
SQL:
    WITH cte AS (
        SELECT salary,
               ROW_NUMBER() OVER (ORDER BY salary DESC) AS row_num
        FROM employee
    )
    SELECT salary AS second_highest_salary
    FROM cte
    WHERE row_num = 2;
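The PySpark column is missing here as well; a minimal sketch, assuming the employee table is loaded as a DataFrame named employee:
PySpark (sketch):
    from pyspark.sql.window import Window
    from pyspark.sql.functions import row_number, desc, col
    # rank salaries from highest to lowest, then keep the second row
    window = Window.orderBy(desc("salary"))
    (employee.withColumn("row_num", row_number().over(window))
        .filter(col("row_num") == 2)
        .select(col("salary").alias("second_highest_salary"))
        .show())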
Question: Tweets' rolling averages: compute the 3-day rolling average of tweet counts for each user.
SQL:
    SELECT user_id, tweet_date,
           ROUND(AVG(tweet_count) OVER (PARTITION BY user_id
                                        ORDER BY tweet_date
                                        ROWS BETWEEN 2 PRECEDING AND CURRENT ROW), 2)
               AS rolling_avg_3d
    FROM tweets;
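The PySpark column is missing for this entry too; a minimal sketch, assuming a DataFrame named tweets with user_id, tweet_date, and tweet_count columns:
PySpark (sketch):
    from pyspark.sql.window import Window
    from pyspark.sql.functions import avg, round
    # 3-row window: the current row plus the two preceding rows, per user
    window = (Window.partitionBy("user_id")
                    .orderBy("tweet_date")
                    .rowsBetween(-2, Window.currentRow))
    (tweets.withColumn("rolling_avg_3d", round(avg("tweet_count").over(window), 2))
        .select("user_id", "tweet_date", "rolling_avg_3d")
        .show())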