
INTERVIEW QUESTIONS & ANSWERS

Most Commonly Asked SQL & PySpark

***These questions cover real-world scenarios and key concepts to help you ace your next interview in 2025!***

SQL SERVER Basics
--Question 1.
--Write a query to find the top 5 products with the highest revenue in each category.
CREATE TABLE ##PRODUCTS(PROID INT, PNAME VARCHAR(50), CATEGORYID INT)
INSERT INTO ##PRODUCTS VALUES
(1,'CPU',1),
(2,'RAM',2),
(3,'MONITOR',3),
(4,'KEYBOARD',4),
(5,'MOUSE',5)
SELECT * FROM ##PRODUCTS
CREATE TABLE ##SALES(PROID INT, SALEID INT, AMOUNT NUMERIC(18,2))
INSERT INTO ##SALES VALUES
(1,1,100),
(1,2,200),
(1,3,300),
(1,4,400),
(1,5,500),
(2,1,100),
(2,2,200),
(2,3,300),
(2,4,400),
(2,5,500),
(3,1,100),
(3,2,200),
(3,3,300),
(3,4,400),
(3,5,500),
(4,1,100),
(4,2,200),
(4,3,300),
(4,4,400),
(4,5,500),
(5,1,100),
(5,2,200),
(5,3,300),
(5,4,400),
(5,5,500)
SELECT * FROM ##PRODUCTS
SELECT * FROM ##SALES
-- Intermediate check: revenue per product ranked within each sale
WITH CTE AS
(
SELECT P.PROID, P.PNAME, S.SALEID, SUM(S.AMOUNT) AS AMOUNT,
       RANK() OVER(PARTITION BY S.SALEID ORDER BY SUM(S.AMOUNT) DESC) AS RN
FROM ##PRODUCTS P JOIN ##SALES S ON P.PROID = S.PROID
GROUP BY P.PROID, S.SALEID, P.PNAME
)
SELECT * FROM CTE

-- Answer: rank products by total revenue within each category and keep the top 5
WITH REVENUE AS
(
SELECT P.PROID, P.PNAME, P.CATEGORYID, SUM(S.AMOUNT) AS AMOUNT,
       RANK() OVER(PARTITION BY P.CATEGORYID ORDER BY SUM(S.AMOUNT) DESC) AS RN
FROM ##PRODUCTS P JOIN ##SALES S ON P.PROID = S.PROID
GROUP BY P.PROID, P.PNAME, P.CATEGORYID
)
SELECT * FROM REVENUE WHERE RN <= 5
--Question 2.
--Find the third lowest price for each product category.
--DROP TABLE ##PRODUCTS
CREATE TABLE ##PRODUCTS(PROID INT, PNAME VARCHAR(50), CATEGORYID INT, AMOUNT INT)
INSERT INTO ##PRODUCTS VALUES
(1,'CPU',1,100),
(2,'RAM',2,100),
(3,'MONITOR',3,200),
(4,'KEYBOARD',4,300),
(5,'MOUSE',5,300)
-- ORDER BY AMOUNT ASC so that RN = 3 is the third LOWEST price
-- (with this sample data each category holds a single product, so the rank is taken across all products)
WITH LOWESTPRICE AS
(
SELECT *,
       DENSE_RANK() OVER(ORDER BY AMOUNT ASC) AS RN
FROM ##PRODUCTS
)
SELECT * FROM LOWESTPRICE WHERE RN = 3
--Question 3.
--Write a query to find the customer with the highest total purchase amount in each region.
DROP TABLE ##CUSTOMERS
CREATE TABLE ##CUSTOMERS(CUSTID INT, CUSTNAME VARCHAR(50), REGION VARCHAR(50))
INSERT INTO ##CUSTOMERS VALUES
(1,'CPU','DELHI'),
(2,'RAM','MUMBAI'),
(3,'MONITOR','PATNA'),
(4,'KEYBOARD','NOIDA'),
(5,'MOUSE','UP'),
(1,'CPU','ADELHI'),
(2,'RAM','MUMBAI'),
(3,'MONITOR','APATNA'),
(4,'KEYBOARD','ANOIDA'),
(5,'MOUSE','UP')
SELECT * FROM ##CUSTOMERS
CREATE TABLE ##ORDERS(ORDID INT, CUSTID INT, AMOUNT NUMERIC(18,2))
INSERT INTO ##ORDERS VALUES
(1,1,100),
(1,2,200),
(1,3,300),
(1,4,400),
(1,5,500),
(2,1,100),
(2,2,200),
(2,3,300),
(2,4,400),
(2,5,500),
(3,1,100),
(3,2,200),
(3,3,300),
(3,4,400),
(3,5,500),
(4,1,100),
(4,2,200),
(4,3,300),
(4,4,400),
(4,5,500),
(5,1,100),
(5,2,200),
(5,3,300),
(5,4,400),
(5,5,500)
WITH CUSTOMERSTOTAL AS
(
SELECT C.CUSTID, C.CUSTNAME, C.REGION, SUM(O.AMOUNT) AS AMT
FROM ##CUSTOMERS C JOIN ##ORDERS O ON C.CUSTID = O.CUSTID
GROUP BY C.CUSTID, C.CUSTNAME, C.REGION
),
RANKED AS
(
SELECT *,
       RANK() OVER(PARTITION BY REGION ORDER BY AMT DESC) AS RN
FROM CUSTOMERSTOTAL
)
SELECT * FROM RANKED WHERE RN = 1

--SQL Interview Questions and Answers

--Write a query to find all employees who are not assigned to any department.

SELECT *
FROM employees
WHERE department_id IS NULL;

--Write a query to retrieve the N-th highest salary from the employee table.

SELECT *
FROM (
    SELECT *, DENSE_RANK() OVER (ORDER BY SALARY DESC) AS salary_rank
    FROM EMPLOYEE
) AS ranked_salaries
WHERE salary_rank = N; -- Replace N with the desired rank

--How would you find employees who have the same salary?

SELECT e1.employee_id, e1.salary
FROM employees e1
JOIN employees e2 ON e1.salary = e2.salary
WHERE e1.employee_id != e2.employee_id;

--Write a query to calculate the total salary for each department.

SELECT department_id, SUM(salary) AS total_salary
FROM employees
GROUP BY department_id;

--How would you retrieve the second lowest salary in a table?

SELECT MIN(salary) AS SecondLowestSalary
FROM employees
WHERE salary > (SELECT MIN(salary) FROM employees);

--Write a query to find all employees who joined before 2010 and have a salary greater than 50,000.

SELECT *
FROM employees
WHERE join_date < '2010-01-01' AND salary > 50000;

--How do you fetch the top 3 highest salaries for each department?

SELECT department_id, salary
FROM (
    SELECT department_id, salary,
           DENSE_RANK() OVER (PARTITION BY department_id ORDER BY salary DESC) AS salary_rank
    FROM employees
) ranked
WHERE salary_rank <= 3;
--Write a query to find all employees who earn more than the average salary of their department.

SELECT e1.employee_id, e1.salary, e1.department_id
FROM employees e1
JOIN (
    SELECT department_id, AVG(salary) AS avg_salary
    FROM employees
    GROUP BY department_id
) e2 ON e1.department_id = e2.department_id
WHERE e1.salary > e2.avg_salary;

--To find duplicates based on NAME and SALARY:

SELECT NAME, SALARY, COUNT(*)
FROM EMPLOYEE
GROUP BY NAME, SALARY
HAVING COUNT(*) > 1;

--Delete duplicate rows (keep only one)

WITH CTE AS (
    SELECT *, ROW_NUMBER() OVER (PARTITION BY NAME, SALARY ORDER BY ID) AS rn
    FROM EMPLOYEE
)
DELETE FROM EMPLOYEE
WHERE ID IN (
    SELECT ID FROM CTE WHERE rn > 1
);

--Write a query to calculate the total number of employees in each department and the average salary.

SELECT department_id, COUNT(*) AS total_employees, AVG(salary) AS avg_salary
FROM employees
GROUP BY department_id;

--Write a query to get the department with the highest total salary.

SELECT department_id, SUM(salary) AS total_salary
FROM employees
GROUP BY department_id
ORDER BY total_salary DESC
LIMIT 1;

PySpark Basics and RDDs

Q1. What is the difference between RDD, DataFrame, and Dataset?
RDDs:
A distributed collection of data elements without a schema. RDDs are slower
than DataFrames and Datasets for simple operations.
DataFrames:
A distributed collection organized into named columns. DataFrames are similar
to relational database tables or Python pandas DataFrames, and they are faster
than RDDs for exploratory analysis and aggregated statistics.
Datasets:
An extension of DataFrames with additional features such as type safety and an
object-oriented interface (available in Scala and Java, not in PySpark). Datasets
are faster than RDDs, combining the performance optimizations of DataFrames
with the convenience of typed objects.
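A minimal sketch (assuming a local SparkSession; the column names are illustrative) contrasting the RDD and DataFrame APIs on the same data:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("rdd_vs_df").getOrCreate()

# RDD: schema-less collection of Python objects, manipulated with lambdas
rdd = spark.sparkContext.parallelize([("Alice", 30), ("Bob", 25)])
print(rdd.map(lambda row: row[1]).sum())        # 55

# DataFrame: the same data with named columns, so Catalyst can optimize the query
df = spark.createDataFrame(rdd, ["name", "age"])
df.groupBy().sum("age").show()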

Q2. How does PySpark achieve parallel processing?

PySpark achieves parallel processing by leveraging Apache Spark's distributed
computing architecture:
1. RDD/DataFrame abstractions
2. Driver and executors
3. Task parallelism
4. Cluster manager integration
5. Lazy evaluation & DAG
6. In-memory computation

In short, PySpark parallelizes work by:
- Distributing data across partitions
- Executing tasks concurrently on worker nodes
- Managing resources via cluster managers
- Optimizing execution through the DAG and lazy evaluation
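A small illustrative sketch (the partition count is arbitrary, and `spark` is the SparkSession from the previous example): the data is split into partitions and each partition is processed by its own task.

rdd = spark.sparkContext.parallelize(range(1_000_000), numSlices=8)
print(rdd.getNumPartitions())        # 8 partitions -> up to 8 concurrent tasks

squares = rdd.map(lambda x: x * x)   # transformation, planned lazily per partition
print(squares.take(5))               # action: tasks run in parallel on the executors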

Q3. Explain lazy evaluation in PySpark with a real-world analogy.

Lazy evaluation in PySpark means that transformations are not executed
immediately when you define them. Instead, Spark waits until an action (like
collect() or count()) is called to actually execute the transformations. This
allows Spark to optimize the whole execution plan for better performance.
Analogy: it is like writing a shopping list. You keep adding items (transformations)
without going to the store; only when you finally go shopping (the action) do you
plan the most efficient route and buy everything in one trip.
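A hedged sketch of the idea (the file and column names are made up): nothing is read or computed until the final action.

df = spark.read.csv("sales.csv", header=True, inferSchema=True)   # no data read yet
filtered = df.filter(df["amount"] > 100)                          # transformation: only recorded in the plan
selected = filtered.select("product", "amount")                   # still nothing executed

selected.count()   # action: Spark builds one optimized plan and runs the whole chain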
Q4. What is SparkContext, and why is it important?

SparkContext is the entry point to Spark functionality in PySpark (or Scala
Spark). It represents the connection between your application and the Spark
cluster.
Role description:
1. Initializes the Spark application: sets up the environment and lets your app
use Spark's capabilities.
2. Connects to the cluster: manages communication with the cluster manager
(e.g., YARN, Standalone).
3. Resource allocation: requests resources (executors, cores, memory) for your
Spark jobs.
4. Job submission: submits jobs and coordinates RDD or DataFrame
transformations/actions.
5. Fault tolerance & lineage: keeps track of RDD lineage for fault recovery.
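A minimal sketch: in modern PySpark the SparkContext is usually obtained through the SparkSession rather than constructed directly.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("demo").getOrCreate()
sc = spark.sparkContext              # the underlying SparkContext

print(sc.applicationId)              # id of the running Spark application
rdd = sc.parallelize([1, 2, 3, 4])   # the low-level RDD API goes through sc
print(rdd.count())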

Q5. How do you handle large file processing in PySpark?

Technique: Purpose
1. Use Parquet/ORC: faster, more efficient reads
2. Partitioning: process only the necessary data
3. Repartition/Coalesce: control parallelism and file count
4. Caching: save repeated computations
5. Filter early: reduce input size
6. Avoid .collect(): prevent memory issues on the driver
7. Broadcast small datasets: optimize joins
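A hedged sketch combining several of these techniques; the path, column names, and partition count are assumptions.

df = (spark.read.parquet("s3://bucket/events/")      # columnar format
        .select("user_id", "event_type", "amount")   # read only the needed columns
        .filter("event_type = 'purchase'"))          # filter early to shrink the input

df = df.repartition(200, "user_id")                  # control parallelism
df.cache()                                           # reuse without recomputation
df.write.mode("overwrite").parquet("s3://bucket/purchases/")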

Q6. What is the difference between actions and transformations in PySpark?

Feature     | Transformations            | Actions
Execution   | Lazy                       | Trigger execution
Return type | New RDD/DataFrame          | Result to driver or storage
Examples    | map(), filter(), select()  | count(), collect(), show()
Purpose     | Define a computation plan  | Execute and get results
Q7. How does Spark handle data partitioning in distributed environments?
Apache Spark uses data partitioning to divide large datasets into smaller
chunks (called partitions) that can be processed in parallel across multiple
nodes in a cluster.
A partition is a logical chunk of data stored in memory or on disk. Each partition
is processed by a single task in a single executor thread.
When you create an RDD from a file or collection, Spark partitions it
automatically. Spark also partitions DataFrames internally (default number of
shuffle partitions: spark.sql.shuffle.partitions = 200).

Hash partitioning: Spark uses a hash function on a key to distribute rows
evenly. Common in joins and aggregations.
Range partitioning: data is divided into ordered ranges. Useful for ordered
or skewed data.
Custom partitioning: you can define your own partition logic using a custom
Partitioner (RDD API only).

Operations that move data across partitions (shuffles):
reduceByKey(): keys are grouped across partitions
join(): matching keys may be on different partitions
coalesce(): (sometimes) reduces the number of partitions
repartition(): redistributes data evenly
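A small sketch of explicit partitioning on a pair RDD (the keys and the custom routing function are illustrative):

pairs = spark.sparkContext.parallelize([("US", 1), ("IN", 2), ("US", 3), ("DE", 4)])

# Hash partitioning (the default partitionFunc is a portable hash of the key)
hashed = pairs.partitionBy(4)

# Custom partitioning: route keys with a user-defined function (RDD API only)
custom = pairs.partitionBy(2, lambda key: 0 if key == "US" else 1)

print(hashed.getNumPartitions())          # 4
print(custom.glom().map(len).collect())   # rows per partition, e.g. [2, 2]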
Q8. Explain the concept of fault tolerance in PySpark.

Mechanism        | Description
Lineage (DAG)    | Rebuilds lost data by reapplying transformations
Task retries     | Failed tasks are retried automatically
Data replication | Relies on the storage layer (e.g., HDFS) for fault-tolerant reads
Checkpointing    | Persists intermediate RDDs to reduce recomputation cost

Q9. How do you broadcast variables in Spark, and when should you use them?
In Spark, broadcast variables are used to efficiently share small, read-only
data (like lookup tables or configuration settings) with all worker nodes,
without sending a copy with every task.
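A minimal sketch of both uses (assuming an existing SparkSession `spark`; the lookup data is made up): an explicit broadcast variable for RDD-style lookups, and broadcast() as a join hint.

from pyspark.sql.functions import broadcast

# 1. Broadcast variable: a read-only dict shipped once to each executor
country_names = {"US": "United States", "IN": "India"}
bc = spark.sparkContext.broadcast(country_names)

codes = spark.sparkContext.parallelize(["US", "IN", "US"])
print(codes.map(lambda c: bc.value.get(c, "unknown")).collect())

# 2. Broadcast join hint: ship the small DataFrame to every executor to avoid a shuffle
small_df = spark.createDataFrame([("US", "United States"), ("IN", "India")], ["code", "name"])
large_df = spark.createDataFrame([(1, "US"), (2, "IN"), (3, "US")], ["order_id", "code"])
large_df.join(broadcast(small_df), "code").show()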
Q10. What are accumulators in PySpark, and how do they differ from
broadcast variables?

Feature          | Accumulators                         | Broadcast variables
Purpose          | Aggregation (e.g., counters, sums)   | Share read-only data with executors
Mutable?         | Tasks can only add values            | Completely read-only
Access           | Only the driver can read the value   | All tasks can read it
Usage in tasks   | Write-only in workers                | Read-only in workers
Common use cases | Metrics, debugging, counting conditions | Lookup tables, configs, small datasets
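A minimal sketch (assuming an existing SparkSession `spark`): tasks can only add to the accumulator, and only the driver reads its value after an action has run.

bad_rows = spark.sparkContext.accumulator(0)   # numeric accumulator, starts at 0

def parse(line):
    try:
        return int(line)
    except ValueError:
        bad_rows.add(1)        # workers may only add
        return None

rdd = spark.sparkContext.parallelize(["1", "2", "oops", "4"])
rdd.map(parse).count()         # an action must run before the value is populated
print(bad_rows.value)          # driver-side read, e.g. 1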

DataFrame and Dataset Operations

Q11. How do you perform data filtering using PySpark DataFrames?
SQL string:       df.filter("age > 25")
Column functions: df.filter(col("age") > 25)
Complex logic:    df.filter((col("age") > 30) & (col("country") == "US"))
Pattern match:    df.filter(col("name").like("A%"))

Q12. What is the difference between repartition() and coalesce(), and when
would you use each?

Feature           | repartition()                            | coalesce()
Operation         | Full shuffle                             | Narrow dependency (no shuffle)
Change partitions | Increase or decrease                     | Only decrease
Cost              | Expensive due to shuffle                 | Cheap, avoids shuffle
Use case          | Improve parallelism, repartition by key  | Reduce partitions, optimize output files

Q13. How do you handle missing or null values in PySpark?

Task                   | Function / Method     | Description
Detect nulls           | .filter(col.isNull()) | Find rows with nulls
Drop rows with nulls   | .dropna()             | Remove rows with nulls
Fill nulls             | .fillna()             | Replace nulls with specified values
Impute values          | Imputer (MLlib)       | Replace nulls with mean/median
Replace in expressions | coalesce()            | Use the first non-null value
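A small sketch putting a few of these together (the column names and fill values are illustrative):

from pyspark.sql.functions import col, coalesce, lit

df = spark.createDataFrame([(1, None, 25), (2, "Bob", None)], ["id", "name", "age"])

df.filter(col("name").isNull()).show()             # detect nulls
df.dropna(subset=["name"]).show()                  # drop rows where name is null
df.fillna({"name": "unknown", "age": 0}).show()    # fill with per-column defaults
df.withColumn("age_filled", coalesce(col("age"), lit(-1))).show()  # first non-null value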

Q14. How can you add a new column to a DataFrame using withColumn()?
from pyspark.sql.functions import lit
df_new = df.withColumn("country", lit("USA"))

Q15. How do you perform a left join between two DataFrames in PySpark?
spark.sql("select a.id, a.name, b.product from cust a left join prod b on a.id = b.id").show()

Q16. What are temporary views in PySpark, and how do they differ from
global temporary views?

A temporary view is scoped to the SparkSession that created it and disappears
when that session ends. A global temporary view is registered in the reserved
global_temp database, is visible to all sessions of the same application, and is
dropped when the application stops.

df.createOrReplaceTempView("temp_view")
spark.sql("SELECT * FROM temp_view WHERE age > 30").show()

Q17. How do you use window functions in PySpark for advanced analytics?
from pyspark.sql.window import Window
from pyspark.sql.functions import col

windowSpec = Window.partitionBy("department").orderBy(col("salary").desc())

Q18. How can you register a UDF (User-Defined Function) in PySpark?

from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

def to_uppercase(s):
    return s.upper() if s is not None else None

# Register the UDF with a return type (for the DataFrame API)
to_upper_udf = udf(to_uppercase, StringType())

# Register the UDF for use in SQL
spark.udf.register("to_uppercase_sql", to_uppercase, StringType())

# DataFrame API
df.withColumn("name_upper", to_upper_udf(col("name"))).show()

# SQL
df.createOrReplaceTempView("people")
spark.sql("SELECT name, to_uppercase_sql(name) AS name_upper FROM people").show()

Q19. What is the difference between persist() and cache()?

Feature         | cache()                                 | persist()
Definition      | Shortcut for .persist(MEMORY_AND_DISK)  | Not a shortcut; you specify the StorageLevel
Storage control | No                                      | Yes
Custom levels   | No                                      | Yes (e.g., MEMORY_ONLY, DISK_ONLY)
Use case        | Default caching needs                   | Advanced control over storage behavior

Q20. How do you read and write data in Parquet, CSV, and JSON formats in PySpark?
Read:
df_parquet = spark.read.parquet("path/to/file.parquet")
df_csv = spark.read.option("header", "true").csv("path/to/file.csv")
df_json = spark.read.json("path/to/file.json")
Write:
df.write.mode("overwrite").parquet("path/to/output_parquet")
df.write.option("header", "true").mode("overwrite").csv("path/to/output_csv")
df.write.mode("overwrite").json("path/to/output_json")

Spark SQL and Query Optimization

Q21. How do you run SQL queries on a DataFrame in PySpark?
df.createOrReplaceTempView("temp_view")
spark.sql("SELECT * FROM temp_view WHERE age > 30").show()

Register temp view:   createOrReplaceTempView("name")
Register global view: createGlobalTempView("name")
Run SQL query:        spark.sql("SQL QUERY")

Q22. What is the purpose of the Catalyst Optimizer in Spark SQL?

The Catalyst Optimizer is the query optimization engine used by Spark SQL.
Its main goal is to automatically optimize queries to improve performance and efficiency.

1. Logical optimization
2. Physical plan optimization
3. Rule-based and cost-based optimization: uses rule-based techniques (static
transformations) and optionally cost-based optimization (CBO) to make smarter choices.
4. Extensibility

Benefits: automatic optimization (you don't need to tune manually), improved
performance for complex SQL/DataFrame queries, and extensibility for custom
logic in enterprise environments.
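You can inspect what Catalyst produced with explain(); a hedged example (the DataFrame is illustrative):

df = spark.range(1_000).withColumnRenamed("id", "amount")
q = df.filter("amount > 10").select("amount")

q.explain(True)   # prints the parsed, analyzed, and optimized logical plans plus the physical plan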

Q23. How do you handle schema inference when reading data from external sources?

PySpark tries to infer the schema by scanning the data when you use
.option("inferSchema", "true") (mainly for CSV and JSON).

df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("path/to/file.csv")

df = spark.read \
    .option("inferSchema", "true") \
    .json("path/to/file.json")

Manual schema definition (recommended for large data):
You can define a StructType schema to explicitly specify data types and improve performance.

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True)
])

Parquet & ORC: schema is embedded
For Parquet and ORC files, the schema is already embedded in the file format:
df = spark.read.parquet("path/to/file.parquet")

Q24. What are the different join types in Spark SQL, and when would you use each?

# Inner join: only rows with matching keys in both tables
spark.sql("select a.id, a.name, b.product from cust a join prod b on a.id = b.id").show()
# Left join: all rows from the left table, matched rows from the right
spark.sql("select a.id, a.name, b.product from cust a left join prod b on a.id = b.id").show()
# Right join: all rows from the right table, matched rows from the left
spark.sql("select a.id, a.name, b.product from cust a right join prod b on a.id = b.id").show()
# Full join: all rows from both tables
spark.sql("select a.id, a.name, b.product from cust a full join prod b on a.id = b.id").show()
# Left anti join: rows from the left table with NO match on the right
# e.g., Tbl1 = (1,2,3), Tbl2 = (1,3) -> LEFT ANTI JOIN returns only (2)
spark.sql("select a.id, a.name from cust a LEFT ANTI JOIN prod b on a.id = b.id").show()
# Left semi join: rows from the left table that DO have a match on the right
# e.g., Tbl1 = (1,2,3), Tbl2 = (1,3) -> LEFT SEMI JOIN returns (1,3)
spark.sql("select a.id, a.name from cust a LEFT SEMI JOIN prod b on a.id = b.id").show()
# Cross join: Cartesian product (Tbl1 x Tbl2)
spark.sql("select a.id, a.name from cust a CROSS JOIN prod b").show()

Q25. How do you create a persistent table in Spark SQL?

This stores both the data and the schema persistently in the metastore:
df.write.mode("overwrite").saveAsTable("employee")

Q26. How does dynamic partition pruning improve query performance?

Static partition pruning: prunes partitions before the query starts (e.g.,
WHERE region = 'US').
Dynamic partition pruning (DPP): prunes partitions during execution, based on
values coming from another table (typically the dimension side of a join).

spark.conf.set("spark.sql.optimizer.dynamicPartitionPruning.enabled", "true")

Q27. Explain how to use broadcast joins to optimize query performance.

from pyspark.sql.functions import broadcast

# 'small_df' is the smaller table, 'large_df' is the big one
joined_df = large_df.join(broadcast(small_df), "join_key")

Q28. What is data skew, and how do you handle it in Spark SQL?

Data skew occurs when a few key values hold a disproportionate share of the rows,
so some partitions (and their tasks) are much larger and slower than the others.
1. Salting keys
Add a random prefix/suffix ("salt") to skewed keys to spread the data across
multiple partitions, then join on the salted key. The skewed side gets a random
salt, and the other side is replicated once per salt value so every match is preserved.

from pyspark.sql.functions import array, col, concat, explode, floor, lit, rand

# Salt the large (skewed) side with a random value 0-9
df1_salted = (df1
    .withColumn("salt", floor(rand() * 10).cast("string"))
    .withColumn("salted_key", concat(col("join_key"), lit("_"), col("salt"))))

# Replicate the small side once per salt value so every salted key can find its match
df2_salted = (df2
    .withColumn("salt", explode(array(*[lit(str(i)) for i in range(10)])))
    .withColumn("salted_key", concat(col("join_key"), lit("_"), col("salt"))))

# Join on salted_key instead of join_key
result = df1_salted.join(df2_salted, "salted_key")

2. Broadcast Join
If one table is small, broadcast it to avoid shuffling and reduce skew impact.
from pyspark.sql.functions import broadcast
result = large_df.join(broadcast(small_df), "join_key")

3. Increase Shuffle Partitions


Increase spark.sql.shuffle.partitions to spread skewed keys over more
partitions.
spark.conf.set("spark.sql.shuffle.partitions", 500)

4. Skew Join Optimization (Spark 3.0+) Adaptive Query Execution (AQE)


Spark 3+ supports adaptive query execution (AQE) with built-in skew join
optimization that detects skew and splits large partitions automatically.
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")

5. Filter or aggregate early
Reduce skew by filtering or aggregating data before the join to minimize skewed keys.

6. spark_partition_id()
In Apache Spark, spark_partition_id() is a built-in function that returns the
partition ID (an integer) of the row it is associated with. It is particularly
useful for debugging, understanding data distribution, and optimizing
performance.

from pyspark.sql.functions import spark_partition_id
df = spark.range(0, 10).repartition(3)
df.withColumn("partition_id", spark_partition_id()).show()

Technique                   | When to use
Salting                     | Manual control when you know the skewed keys
Broadcast join              | When one table is small
Increase shuffle partitions | To increase parallelism
Adaptive Query Execution    | Spark 3+ automatic skew handling
Early filtering/aggregation | Reduce skewed data volume before the join
spark_partition_id()        | After a join, to check for skewed partitions

Q29. How can you perform aggregations using SQL queries on large datasets?

When working with large datasets, aggregations are common operations such as
SUM, COUNT, AVG, MIN, MAX, and GROUP BY. Spark SQL is designed to handle these
efficiently even at scale, by computing partial aggregates on each partition and
then shuffling only those partial results.
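A minimal sketch (the path, table, and column names are assumptions): register a view and let Spark SQL run the grouped aggregation in a distributed way.

df = spark.read.parquet("s3://bucket/sales/")
df.createOrReplaceTempView("sales")

spark.sql("""
    SELECT region,
           COUNT(*)    AS orders,
           SUM(amount) AS total_amount,
           AVG(amount) AS avg_amount
    FROM sales
    GROUP BY region
    HAVING SUM(amount) > 10000
    ORDER BY total_amount DESC
""").show()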

Q30. How do you enable query caching in Spark SQL?

Method                    | How to enable                           | When to use
SQL CACHE TABLE           | CACHE TABLE tableName;                  | Cache tables for repeated SQL queries
PySpark .cache()          | df.cache()                              | Cache DataFrames in Spark applications
Persist with StorageLevel | df.persist(StorageLevel)                | Customize cache storage behavior
AQE cache                 | Set configs for adaptive query caching  | Automatic optimization in Spark 3.2+

Data Pipeline Scenarios and Real-World Use Cases
Q31. How would you build an ETL pipeline using PySpark?
Key ETL Steps with PySpark

1. Extract – Read data from external sources


2. Transform – Clean, filter, join, aggregate, enrich data
3. Load – Write the final dataset to storage (Parquet, S3, Hive, etc.)

from pyspark.sql import SparkSession


from pyspark.sql.functions import col, to_date

# Step 1: Initialize Spark Session


spark = SparkSession.builder \
.appName("ETL Pipeline Example") \
.getOrCreate()

# Step 2: Extract - Load raw data from CSV


raw_df = spark.read.option("header", True).csv("s3://your-bucket/raw/sales.csv")

# Step 3: Transform - Clean and prepare the data


transformed_df = raw_df \
.withColumn("sales_amount", col("sales_amount").cast("double")) \
.withColumn("date", to_date(col("date"), "yyyy-MM-dd")) \
.filter(col("sales_amount") > 0)

# Optional: Join with product/dimension data


products_df = spark.read.parquet("s3://your-bucket/dim/products/")
final_df = transformed_df.join(products_df, on="product_id", how="left")

# Step 4: Load - Write cleaned data to S3 in Parquet format
final_df.write.mode("overwrite").partitionBy("date").parquet("s3://your-bucket/processed/sales/")

# Stop Spark session


spark.stop()

Q32. How do you handle real-time data processing with


Structured Streaming in PySpark?

from pyspark.sql import SparkSession


from pyspark.sql.functions import expr

# 1. Create Spark session


spark = SparkSession.builder \
.appName("RealTimeETL") \
.getOrCreate()

# 2. Read streaming data from Kafka


df = spark.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "sales_topic") \
.load()

# 3. Transform - Parse Kafka value and extract fields


sales_df = df.selectExpr("CAST(value AS STRING) as json_data") \
.selectExpr("from_json(json_data, 'product_id INT, sales_amount DOUBLE,
ts STRING') as data") \
.select("data.*")

# 4. Optional aggregation
agg_df = sales_df.groupBy("product_id").sum("sales_amount")
# 5. Write to output sink (console or storage)
query = agg_df.writeStream \
.outputMode("complete") \
.format("console") \
.option("truncate", "false") \
.trigger(processingTime="10 seconds") \
.start()

query.awaitTermination()

Triggers:
.trigger(processingTime="10 seconds")  # every 10 seconds
.trigger(once=True)                    # one-time batch for debugging

Fault Tolerance
.writeStream.option("checkpointLocation", "/tmp/checkpoints/")

Q33. What are the best practices for partitioning data in large
datasets?

1. Partition by Frequently Queried Columns


df.write.partitionBy("country", "year").parquet("s3://your-bucket/sales/")

2. Avoid Over-Partitioning and Small Files


df.coalesce(10).write.parquet("path/")

3. Use Repartitioning for Data Shuffle Efficiency


df = df.repartition(100, "customer_id")

4. Use .coalesce() to Reduce Partition Count Before Writing


df.coalesce(1).write.parquet("path/")

5. Choose Cardinality Wisely


Avoid partitioning by high-cardinality columns like user_id or transaction_id;
this leads to too many tiny partitions.

Prefer low to medium cardinality columns like:


country,year,region,event_type

6. Use Bucketing for Efficient Joins


CREATE TABLE sales_bucketed
USING parquet
CLUSTERED BY (customer_id) INTO 100 BUCKETS;

7. Leverage Partition Pruning


SELECT * FROM sales WHERE year = 2024 AND country = 'US';

8. Monitor and Tune with Spark UI


Tune spark.sql.shuffle.partitions (default: 200)

Technique         | When to use                           | Benefit
partitionBy()     | Writing data to storage               | Enables pruning, efficient reads
repartition()     | Before joins, to increase parallelism | Improves shuffle-based operations
coalesce()        | Before writing output                 | Combines partitions, fewer files
Bucketing         | For repeated joins on a key           | Faster joins without reshuffling
Partition pruning | Filtering on partition columns        | Reads only the required data

Q34. How would you debug and optimize a slow-running


Spark job?
1. Check Spark UI
URL: Usually at http://<driver-node>:4040

2. Identify Expensive Operations
Common causes of slowness:
Wide transformations (e.g., join, groupBy, distinct)
Large shuffles (data moved between nodes)
Skewed data (some tasks take much longer)
df.explain(True)
# (Scala only) df.queryExecution.debug.codegen() shows the generated code

3. Check for Data Skew


Use .groupBy("key").count().orderBy("count", ascending=False) to detect
skewed

4. Broadcast Joins for Small Tables


df1.join(broadcast(df2), "id")

5. Optimize Shuffles
spark.conf.set("spark.sql.shuffle.partitions", 200) # default, increase or
decrease based on data size

6. Cache/Persist Intermediate Results


df.cache() # or df.persist(StorageLevel.MEMORY_AND_DISK)
df.count() # trigger caching

7. Avoid Unnecessary Collect/Show


Use .limit(n).show() instead for sampling.

8. Tune Resource Allocation


--executor-memory 4G
--executor-cores 4
--num-executors 50

spark.conf.set("spark.dynamicAllocation.enabled", "true")

9. Enable Adaptive Query Execution (AQE)


spark.conf.set("spark.sql.adaptive.enabled", "true")

10. Profile with Spark History Server


http://<your-cluster>/history

Q35. How do you handle schema evolution in PySpark


pipelines?
1. Enable Schema Merging (for Parquet/ORC)
df = spark.read.option("mergeSchema",
"true").parquet("s3://path/to/parquet/")

2. Use Delta Lake for Robust Schema Evolution


Delta Lake (on Databricks or open source) supports automatic schema
evolution.
new_data.write \
.format("delta") \
.option("mergeSchema", "true") \
.mode("append") \
.save("/mnt/delta/sales/")
3. Infer Schema Dynamically (for Semi-Structured Data)
df = spark.read \
.option("inferSchema", "true") \
.json("s3://path/json/")

4. Define and Update Explicit Schemas


from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema_v2 = StructType([
StructField("id", IntegerType(), True),
StructField("name", StringType(), True),
StructField("email", StringType(), True) # new column
])
df = spark.read.schema(schema_v2).json("path")

5. Handle Nulls and Defaults for Missing Fields


df = df.withColumn("email", coalesce(col("email"),
lit("[email protected]")))

6. Monitor and Validate Schema Changes


.schema.json() to save schema versions.

7. Backfill Historical Data (Optional)


If schema changes are breaking (e.g., renaming a column):
Consider backfilling historical data to the new schema.
Or maintain versioned data models (v1, v2 folders or tables).

Q36. What is the role of checkpointing in Spark Streaming?

Checkpointing in Spark Streaming (including Structured Streaming) is a


critical mechanism that enables fault tolerance, state recovery, and state
management during stream processing.

Types of Checkpointing in Spark


Metadata Checkpointing:-
Saves streaming job progress (e.g., offsets, batch IDs).Required for Structured
Streaming.

Data Checkpointing:-
Saves the RDD lineage and data to avoid recomputation.
Mainly used in DStream-based Spark Streaming (less common now).

How to Enable Checkpointing in Structured Streaming

query = df.writeStream \
    .format("parquet") \
    .outputMode("append") \
    .option("checkpointLocation", "s3://my-bucket/checkpoints/job1/") \
    .start("s3://my-bucket/output/")

The checkpoint directory must be reliable and durable (e.g., HDFS, S3).

Purpose             | Description
Fault recovery      | Recovers the stream from failures by storing metadata and data state
Stateful operations | Required for operations like updateStateByKey, mapGroupsWithState
Progress tracking   | Tracks offsets (Kafka, file source), watermarks, and batch info

Q37. How can you implement incremental data processing in


PySpark?

Common Strategies for Incremental Processing

1. Using Timestamps or Date Columns


Assumption: Your source data has a column like
last_updated, created_at, or ingestion_date.

# Last processed timestamp, from metadata store


last_timestamp = "2025-05-20 00:00:00"

# Filter new/updated records


new_data = df.filter(df["last_updated"] > lit(last_timestamp))

2. Using Watermarking in Structured Streaming

df = spark.readStream \
.format("kafka") \
.option("subscribe", "orders") \
.load()

from pyspark.sql.functions import window

parsed = df \
.withWatermark("event_time", "10 minutes") \
.groupBy(window("event_time", "5 minutes")) \
.count()

3. Delta Lake’s Change Data Feed (CDF) 🔁


If using Delta Lake, enable CDF to get only updated/new/deleted rows:
df = spark.read.format("delta") \
.option("readChangeData", "true") \
.option("startingVersion", 23) \
.load("/delta/orders/")
4. Using Surrogate Keys or Auto-Increment IDs
last_processed_id = 10250
new_data = df.filter(df["id"] > last_processed_id)

Store the last processed ID externally.


Useful when data is strictly append-only.

5. Compare Against Existing Target Table (Merge)


Use merge (upsert) to load only new/changed rows into a target:
from delta.tables import DeltaTable
target = DeltaTable.forPath(spark, "/delta/customers/")

target.alias("t").merge(
source=new_data.alias("s"),
condition="t.id = s.id"
).whenMatchedUpdateAll() \
.whenNotMatchedInsertAll() \
.execute()

Real-World Use Case Example
Scenario: You want to process only new orders added daily to a Parquet file.
last_processed_date = "2024-05-19"

df = spark.read.parquet("s3://bucket/orders/")
incremental_df = df.filter(df["order_date"] > last_processed_date)

# Process and write


incremental_df.write.parquet("s3://bucket/processed_orders/",
mode="append")

# Update last_processed_date in metadata store

Q38. How do you handle large joins between multiple


DataFrames?

1. Broadcast Joins (for Small Tables)


from pyspark.sql.functions import broadcast
result = large_df.join(broadcast(small_df), "join_key")

2. Repartition Before Join


Ensure both DataFrames are partitioned on the join key to reduce shuffle
skew.
df1 = df1.repartition("join_key")
df2 = df2.repartition("join_key")
joined_df = df1.join(df2, "join_key")

3. Use Bucketing (For Hive Tables)


CREATE TABLE t1 (...) CLUSTERED BY (key) INTO 50 BUCKETS;

4. Avoid Cross Joins Unless Necessary


df1.crossJoin(df2).filter("df1.col = df2.col")
5. Skew Join Handling (When Keys Are Uneven)

Add a salt key (e.g., key + rand()) to spread out skewed data.
Use salting or enable AQE skew join handling (in Spark 3.0+):

spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")

6. Join in Stages (Multi-Table Join Strategy)


temp = df1.join(df2, "key1")
result = temp.join(df3, "key2")

7. Use SQL for Complex Joins


df1.createOrReplaceTempView("orders")
df2.createOrReplaceTempView("customers")
df3.createOrReplaceTempView("products")

spark.sql("""
SELECT o.*, c.name, p.name
FROM orders o
JOIN customers c ON o.cust_id = c.id
JOIN products p ON o.prod_id = p.id
""")

8. Tune Configurations

Setting                              | Purpose
spark.sql.shuffle.partitions         | Controls the number of shuffle partitions
spark.sql.autoBroadcastJoinThreshold | Max size (bytes) for broadcast
spark.sql.adaptive.enabled           | Enables Adaptive Query Execution
spark.sql.adaptive.skewJoin.enabled  | Handles skewed joins automatically

Q39. What is the difference between batch processing and stream processing in Spark?

Feature        | Batch processing                            | Stream processing
Nature of data | Processes finite/static data                | Processes continuous/infinite data
Input source   | Files (Parquet, CSV, etc.), databases       | Kafka, sockets, files, etc.
Execution mode | Runs as a job, ends when data is processed  | Runs continuously, processing data in real time

Q40. How would you secure sensitive data in a PySpark


pipeline?

1. Data Encryption
At Rest
Enable encryption on storage systems like:
Amazon S3 (SSE-S3, SSE-KMS)
HDFS transparent encryption
Azure Data Lake encryption
Use columnar formats like Parquet with compression (GZIP/Snappy); the encryption itself comes from the storage layer.
In Transit

Enable SSL/TLS when transferring data:


Between Spark and Kafka, S3, JDBC, etc.
Use spark.ssl.enabled for Spark encryption.
2. Masking and Tokenization
Use data masking to obscure sensitive fields (e.g., SSNs, emails):
from pyspark.sql.functions import sha2, col
df = df.withColumn("email_hash", sha2(col("email"), 256))
3. Column-Level Encryption (Custom)
Encrypt sensitive columns before writing:
from cryptography.fernet import Fernet

key = Fernet.generate_key()
cipher = Fernet(key)
@udf("string")
def encrypt(value):
return cipher.encrypt(value.encode()).decode()
df = df.withColumn("ssn_encrypted", encrypt(col("ssn")))

Store encryption keys in a secure vault (e.g., AWS KMS, Azure Key Vault,
HashiCorp Vault).
4. Access Control
Use Role-Based Access Control (RBAC):
On data storage (S3, ADLS, Hive, etc.)
On Databricks / Spark clusters
Apply fine-grained access control via:
Apache Ranger (for HDFS, Hive, etc.)
Unity Catalog (Databricks)
5. Auditing and Logging
Log:

Who accessed data
What data was read or written
When and where the access occurred
Use audit logs from:

Spark history server


Cloud providers (AWS CloudTrail, Azure Monitor)
Access gateways (e.g., Lake Formation)
6. Data Governance and Classification
Tag sensitive columns in metadata catalogs like:
AWS Glue Data Catalog
Apache Atlas
Unity Catalog (Databricks)
Define policies based on sensitivity level (e.g., PII, HIPAA).

7. DevSecOps Practices
Don't hardcode credentials in scripts.
Use secrets managers:
spark.conf.set("spark.hadoop.fs.s3a.access.key", ...) via environment vars or
secret scopes.
Encrypt logs and control log verbosity.
Security measure        | Technique / Tool
Encryption (at rest)    | S3/KMS, HDFS encryption
Encryption (in transit) | TLS/SSL in Spark, Kafka, JDBC
Data masking/tokenizing | sha2(), custom UDFs
Access control          | RBAC, Apache Ranger, Unity Catalog
Auditing                | Cloud logs, Spark audit logs
Secrets management      | AWS Secrets Manager, Databricks secrets

Advanced PySpark Features


Q41. How do you handle large datasets in PySpark to
optimize performance and reduce memory usage?

1. Use Efficient Data Formats

Parquet or ORC are columnar storage formats optimized for Spark.


They provide better compression and faster I/O compared to formats like
CSV or JSON.
df.write.parquet("path/to/output.parquet")

2. Partitioning
Use repartition(n) to increase partitions (e.g., after a wide transformation).
Use coalesce(n) to reduce the number of partitions (e.g., before writing).

df = df.repartition(100, "col1") # Better parallelism


df = df.coalesce(10) # Reduce shuffles before write

3. Cache and Persist


Cache intermediate DataFrames if reused multiple times to avoid
recomputation.
Use .cache() or .persist(storage_level) only when needed.
df.persist(StorageLevel.MEMORY_AND_DISK)
4. Avoid Wide Transformations
Wide transformations (like groupByKey, join, distinct, repartition) trigger
shuffling.
Prefer reduceByKey or aggregateByKey instead of groupByKey.
rdd.reduceByKey(lambda x, y: x + y) # More efficient than groupByKey

5. Use Broadcast Join


If one dataset is small, broadcast it to all nodes to avoid shuffle-heavy joins.
from pyspark.sql.functions import broadcast
df = large_df.join(broadcast(small_df), "key")

# Read the small and large datasets, then broadcast the small one in the join
small_df = spark.read.csv("small_dataset.csv", header=True, inferSchema=True)
large_df = spark.read.csv("large_dataset.csv", header=True, inferSchema=True)
joined_df = large_df.join(broadcast(small_df), "key_column")

6. Column Pruning & Filter Pushdown


Read only required columns and apply filters early using predicate
pushdown.
spark.read.parquet("path").select("col1", "col2").filter("col1 > 100")

7. Avoid Collecting Large Data to Driver


Avoid using .collect() or .toPandas() on large datasets as it can crash the
driver.
Use .show(), .take(n) or .limit(n) for previewing.
df.limit(10).toPandas()

8. Optimize Joins
Ensure the join keys are distributed and avoid skewed joins.
Use salting or skew join hints when facing data skew.
df1.join(df2.hint("skew"), "key")

9. Use UDFs Wisely


Avoid Python UDFs due to serialization and performance overhead.
Prefer Spark built-in functions (pyspark.sql.functions) or Pandas UDFs.
from pyspark.sql.functions import col, upper
df = df.withColumn("name_upper", upper(col("name")))

10. Resource Tuning


Tune Spark configuration:

--executor-memory 4G
--executor-cores 4
--num-executors 10

# Example of configuring Spark settings in the SparkSession


spark = SparkSession.builder \
.appName("OptimizationExample") \
.config("spark.executor.memory", "4g") \
.config("spark.executor.cores", "4") \
.config("spark.driver.memory", "4g") \
.getOrCreate()

# Example of caching and partitioning


df = spark.read.csv("data.csv", header=True, inferSchema=True) # Read data
df.cache() # Cache the DataFrame
df_partitioned = df.repartition(100, "key_column")  # repartition by key


Q42. What is the purpose of Delta Lake, and how does it


improve reliability?

Delta Lake is an open-source storage layer that brings ACID transactions,


schema enforcement, and time travel to big data workloads on Apache Spark
and data lakes (like S3, ADLS, etc.).

Purpose of Delta Lake


Reliable, scalable big data pipelines
Transactional consistency on top of distributed storage
Unified batch and streaming data processing
1. ACID Transactions
Ensures atomicity, consistency, isolation, and durability even across multiple
writers
df.write.format("delta").mode("append").save("/path/to/delta-table")

2. Schema Enforcement & Evolution


Prevents bad data from corrupting tables with strict schema checks.
Supports schema evolution (e.g., adding new columns).
spark.read.format("delta").load("/path").printSchema()

3. Time Travel
Access previous versions of data using versioning or timestamps.
Useful for debugging, rollback, and reproducibility.
delta_table = DeltaTable.forPath(spark, "/path")
delta_table.history() # Show all versions
spark.read.format("delta").option("versionAsOf", 3).load("/path")

4. Unified Batch + Streaming


Enables a single table to support both streaming reads and batch writes,
improving consistency across pipelines.
spark.readStream.format("delta").load("/path")
5. Data Quality with Constraints
You can define constraints like NOT NULL, CHECK, etc.
Ensures data correctness at the write level.

6. Efficient Upserts and Deletes (MERGE)


Simplifies slow-changing dimension updates and deduplication.
from delta.tables import DeltaTable

deltaTable = DeltaTable.forPath(spark, "/path")


deltaTable.alias("target").merge(
source_df.alias("source"),
"target.id = source.id"
).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

7. Scalable Metadata Handling


Delta Lake uses transaction logs (stored as _delta_log) rather than relying on
file listings, making it scalable for tables with millions of files

Q43. How do you enable time travel queries using Delta


Lake?

Delta Lake allows you to query past versions of a table using:

versionAsOf — specify a version number


timestampAsOf — specify a timestamp

1. Using versionAsOf
df = spark.read.format("delta") \
.option("versionAsOf", 5) \
.load("/path/to/delta-table")

2. Using timestampAsOf
df = spark.read.format("delta") \
.option("timestampAsOf", "2024-05-20T10:00:00") \
.load("/path/to/delta-table")

3. View Table History


from delta.tables import DeltaTable

delta_table = DeltaTable.forPath(spark, "/path/to/delta-table")


delta_table.history().show(truncate=False)

4. Notes
Delta Lake stores all changes as incremental commits in the _delta_log/
directory.
Older data is retained by default for 30 days, but this is configurable with the
data retention period
spark.databricks.delta.retentionDurationCheck.enabled = false

5. Optional: Clean Up Old Versions


# Remove files no longer needed for time travel (older than 7 days)
spark.sql("VACUUM delta.`/path/to/delta-table` RETAIN 168 HOURS")

Q44. How do you handle complex aggregations using window


functions?

1. Running Totals / Cumulative Sum


from pyspark.sql.window import Window
from pyspark.sql.functions import sum

window_spec = Window.partitionBy("customer_id").orderBy("transaction_date") \
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
df = df.withColumn("running_total", sum("amount").over(window_spec))

2. Moving Average
from pyspark.sql.functions import avg

window_spec = Window.partitionBy("customer_id").orderBy("transaction_date") \
    .rowsBetween(-2, 0)  # 3-row moving average
df = df.withColumn("moving_avg", avg("amount").over(window_spec))

3. Row Number / Ranking / Dense Ranking


from pyspark.sql.functions import row_number, rank, dense_rank

window_spec = Window.partitionBy("category").orderBy("sales")

df = df.withColumn("row_num", row_number().over(window_spec)) \
.withColumn("rank", rank().over(window_spec)) \
.withColumn("dense_rank", dense_rank().over(window_spec))

4. Lag/Lead for Value Comparison


from pyspark.sql.functions import lag, lead

window_spec = Window.partitionBy("user_id").orderBy("event_time")

df = df.withColumn("prev_val", lag("score", 1).over(window_spec)) \


.withColumn("next_val", lead("score", 1).over(window_spec))

5. Detecting Change Points or Gaps


from pyspark.sql.functions import col, lag, when

window_spec = Window.partitionBy("user_id").orderBy("event_time")
df = df.withColumn("prev_status", lag("status").over(window_spec)) \
    .withColumn("status_changed", when(col("status") != col("prev_status"), 1).otherwise(0))

6. First and Last Value


from pyspark.sql.functions import first, last

window_spec = Window.partitionBy("department").orderBy("date")

df = df.withColumn("first_sale", first("sale").over(window_spec)) \
.withColumn("last_sale", last("sale").over(window_spec))

Function Description
`row_number()` Unique row number per partition
`rank()` Ranking with gaps
`dense_rank()` Ranking without gaps
`lag()` Value from a previous row
`lead()` Value from a following row
`sum()` Cumulative or windowed sum
`avg()` Moving or group average
`first()` First value in the window
`last()` Last value in the window

Q45. What are stateful operations in Spark Structured


Streaming?

Key Characteristics of Stateful Operations


State is maintained in memory and periodically checkpointed to ensure fault
tolerance.
Requires watermarks and timeout configurations to prevent unbounded state
growth.
Involves grouping, windowing, or matching events over time.

Examples of Stateful Operations
1. Group-based Aggregations (with Time Window)
from pyspark.sql.functions import window, sum

df.groupBy(
window("event_time", "10 minutes"),
"user_id"
).agg(sum("amount"))

2. Streaming Joins (between two streams)


stream1.join(stream2, "id") # Requires watermarking
3. FlatMapGroupsWithState (arbitrary stateful processing)
# Conceptual sketch only: flatMapGroupsWithState is the Scala/Java API;
# PySpark exposes the equivalent applyInPandasWithState (Spark 3.4+).
def update_state(user_id, rows, state):
    # custom logic here: read/update the per-key state, emit output rows
    ...

4. Deduplication
df.dropDuplicates(["user_id", "event_time"])

5. Role of Watermarking
Watermarking helps limit state size by specifying the maximum expected
lateness of data.
df.withWatermark("event_time", "15 minutes")


Q46. How do you implement error handling and retries in


PySpark jobs?

Implementing robust error handling and retry logic in PySpark jobs is


essential for production-grade data pipelines. Here’s how you can structure it
across different components of a PySpark job:
1. Use Try-Except Blocks in Driver Code
try:
    df = spark.read.parquet("/input/path")
    result = df.groupBy("category").count()
    result.write.mode("overwrite").parquet("/output/path")
except Exception as e:
    print(f"Job failed: {e}")
    # Optionally send an alert or write the error to a log

2. Implement Retries with Exponential Backoff


import time
import random

def retry_operation(func, retries=3):
    for i in range(retries):
        try:
            return func()
        except Exception as e:
            print(f"Retry {i + 1} failed: {e}")
            time.sleep(2 ** i + random.random())  # exponential backoff
    raise Exception("All retries failed.")

3. Handle Errors in UDFs Carefully


from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def safe_parse(value):
    try:
        return complex_parsing_logic(value)
    except Exception:
        return None  # or "error"

safe_parse_udf = udf(safe_parse, StringType())
df = df.withColumn("parsed", safe_parse_udf(df["raw_col"]))

4. Use Accumulators or Logs for Error Tracking


from pyspark.accumulators import AccumulatorParam

error_count = spark.sparkContext.accumulator(0)

def parse_and_count(value):
    try:
        return int(value)
    except ValueError:
        error_count.add(1)
        return None

udf_parse = udf(parse_and_count)
df = df.withColumn("parsed", udf_parse(df["col"]))

5. Validate Data Early


expected_schema = StructType([...])
df = spark.read.schema(expected_schema).json("/data/path")

if df.filter("col IS NULL").count() > 0:


raise ValueError("Null values found in critical column.")

6. Checkpoints and Recovery for Streaming


query.writeStream \
    .format("delta") \
    .option("checkpointLocation", "/checkpoints/stream1") \
    .start("/output/path")

7. Leverage workflow orchestration for retries (e.g., an Airflow PythonOperator)


PythonOperator(
task_id='spark_job',
python_callable=run_spark_job,
retries=3,
retry_delay=timedelta(minutes=2),
)

Area Strategy
Driver code Try-except with logging
External systems Retry with exponential backoff
UDFs Safe exception handling inside logic
Streaming Use check pointing and watermarking
Data quality Validate schema and critical fields early
Workflow orchestration Handle retries and notifications externally

Q47. How do you monitor and manage Spark clusters using


Spark UI?
The Spark UI is a web-based tool that provides detailed insights into:

Job and stage execution


Task-level metrics
Memory usage
Storage and caching
Executors
SQL query plans

1. Local Mode or Standalone Cluster


Default URL: http://localhost:4040
2. YARN (Yet Another Resource Negotiator)
In YARN mode, the Spark UI is linked from the YARN Resource Manager UI
under the "Tracking URL".

3. Databricks
Available as part of the "Spark UI" tab inside each job/run.

Key Spark UI Tabs


1. Jobs
2. Stages
3. Tasks
4. Storage
5. Environment
6. Executors
7. SQL (if using Spark SQL)

Q48. What is the difference between SparkSession and SparkContext?

Use SparkSession in modern Spark applications (especially with DataFrames, SQL, Delta Lake, etc.).
Use SparkContext only when working directly with RDDs or for low-level operations.

Feature             | SparkContext                       | SparkSession
Introduced in       | Spark 1.x                          | Spark 2.0
Purpose             | Entry point for low-level RDD APIs | Unified entry point for all Spark APIs
Supports RDDs       | Yes                                | Yes (via spark.sparkContext)
Supports DataFrames | No                                 | Yes
Supports SQL        | No                                 | Yes
Encapsulates        | N/A                                | SparkContext, SQLContext, HiveContext
Recommended in      | Legacy RDD-based code              | Modern Spark apps (especially DataFrame-based)
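A short sketch of how the two relate in practice (the file name is a placeholder):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("modern_app").getOrCreate()

df = spark.read.json("events.json")    # DataFrame / SQL APIs live on the SparkSession
sc = spark.sparkContext                # the wrapped SparkContext is still available
rdd = sc.textFile("events.json")       # the low-level RDD API goes through the SparkContext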


Q49. How do you handle late-arriving data in Spark Structured Streaming?

Handling late data (event-time processing)

Causes of late-arriving data in Kafka:
1. Network and producer delays: high network latency, or reconnections and
retries in the producer, can delay message delivery.
2. Broker overload: overloaded Kafka brokers or slow replication can introduce
processing delays.
3. Upstream delays: latency in upstream systems or IoT devices can make events
arrive late in Kafka.

Event time vs processing time in Structured Streaming:
Event time is the timestamp carried inside the record itself (when the event
happened); processing time is the wall-clock time at which Spark processes the
record. Late data is data whose event time is much earlier than its processing time.

Example Kafka message:
{
  "timestamp": "2025-11-22T08:52:10.000+00:00",
  "userid": "123",
  "item": "headphones",
  "quantity": 1
}

Understanding the State Store in Spark Structured Streaming

1. What is a State Store?
The State Store is a key-value store used by Spark to persist and manage the
state for each micro-batch in a streaming query. This state is updated with
each batch and saved for future use.

Example use case:
In a windowed aggregation, the state store keeps track of partial results for
each window until the window closes.

2. How it works
Each streaming query operates in micro-batches, following these steps:
1) Input data: data is read and processed.
2) Query existing state: the state store is queried for existing state.
3) State update: the state is updated based on the new data.
4) Output results: results are written to the output sink.
5) Persist state: the updated state is saved for use in subsequent micro-batches.

Automatic state cleanup:
Spark automatically removes old state based on the watermark, which defines
when data is considered late and no longer affects the state.

provider_class = spark.conf.get("spark.sql.streaming.stateStore.providerClass")

Background: what is RocksDB in Spark Structured Streaming?
RocksDB is an embedded key-value store designed for high-performance reads and writes.
In the context of Spark Structured Streaming, it serves as a powerful
alternative to the default file-based state store. By leveraging RocksDB, Spark
can significantly boost the performance of stateful computations like
aggregations and joins, particularly under heavy workloads.
This is achieved by minimizing disk I/O overhead, making RocksDB an excellent
choice for handling large states or high-throughput streaming queries.
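A hedged sketch of handling late events with a watermark (the topic, field names, and thresholds are assumptions): events that arrive more than 10 minutes after the watermark are dropped, and the corresponding window state is cleaned up.

from pyspark.sql.functions import col, from_json, window, sum as sum_
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

schema = StructType([
    StructField("timestamp", TimestampType()),
    StructField("userid", StringType()),
    StructField("item", StringType()),
    StructField("quantity", IntegerType()),
])

orders = (spark.readStream.format("kafka")
          .option("kafka.bootstrap.servers", "localhost:9092")
          .option("subscribe", "orders")
          .load()
          .select(from_json(col("value").cast("string"), schema).alias("o"))
          .select("o.*"))

late_tolerant = (orders
                 .withWatermark("timestamp", "10 minutes")           # accept up to 10 min lateness
                 .groupBy(window("timestamp", "5 minutes"), "item")
                 .agg(sum_("quantity").alias("qty")))

query = (late_tolerant.writeStream
         .outputMode("update")
         .format("console")
         .option("checkpointLocation", "/tmp/checkpoints/orders")
         .start())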

Q50. What is the difference between Spark's Catalyst Optimizer and Tungsten Execution Engine?

Catalyst Optimizer: logical & query optimization layer
- Purpose: optimizes the logical and physical execution plans of Spark SQL queries.
- Layer: query optimization (part of the planning phase).
- Written in: Scala, using functional programming concepts and pattern matching.

Key features:
- Rule-based and cost-based optimization: applies transformations like predicate
pushdown, constant folding, projection pruning, etc.
- Logical Plan → Optimized Logical Plan → Physical Plan → Executable Plan
- Supports user-defined optimizations and extensibility via rules.
- Abstracts the SQL, DataFrame, and Dataset APIs into a unified optimization flow.

Tungsten Execution Engine: physical execution layer
- Purpose: provides low-level, memory-efficient execution of the query plan.
- Layer: execution engine (part of the runtime phase).
- Introduced in: Spark 1.4+ for improved performance.

Key features:
- Whole-stage code generation (WSCG): compiles parts of the query into Java
bytecode to avoid virtual function calls and for-loop overhead.
- Off-heap memory management: reduces garbage collection overhead.
- Cache-friendly and CPU-efficient algorithms.
- Improves performance by using binary processing and vectorization.

Feature         | Catalyst Optimizer                     | Tungsten Execution Engine
Function        | Query planning and optimization        | Efficient physical query execution
Phase           | Compile time (planning)                | Run time (execution)
Optimizes       | Logical and physical plans             | Memory usage, CPU efficiency
Techniques used | Rule-based and cost-based optimization | Whole-stage codegen, off-heap memory
Target          | SQL, DataFrame, Dataset                | JVM bytecode, CPU, memory

Bonus: Practical Coding Challenges

💻 Challenge 1: Write a PySpark function to remove duplicate rows from a
DataFrame based on specific columns.
💻 Challenge 2: Create a PySpark pipeline to read a CSV file, filter out rows
with null values, and write the result to a Parquet file.
💻 Challenge 3: Implement a window function to rank salespeople based on total
sales by region.
💻 Challenge 4: Write a PySpark SQL query to calculate the average salary by
department, including only employees with more than 3 years of experience.
💻 Challenge 5: Implement a PySpark function to split a large DataFrame into
smaller DataFrames based on a specific column value.


Quick Tips for Interviews
Tip 1: Be ready to explain real-world scenarios where you've used PySpark.
Tip 2: Know how to optimize Spark jobs using caching, partitioning, and broadcasting.
Tip 3: Understand the trade-offs between RDDs, DataFrames, and Datasets.