
INTERVIEW QUESTIONS & ANSWERS

Most Commonly Asked SQL & PySpark

***These questions cover real-world scenarios and key concepts to help you ace your next interview in 2025!***

SQL SERVER Basics
--Question 1.
--Write a query to find the top 5 products with the highest revenue in each category.
CREATE TABLE ##PRODUCTS(PROID INT, PNAME VARCHAR(50), CATEGORYID INT)
INSERT INTO ##PRODUCTS VALUES
(1,'CPU',1),
(2,'RAM',2),
(3,'MONITOR',3),
(4,'KEYBOARD',4),
(5,'MOUSE',5)
SELECT * FROM ##PRODUCTS
CREATE TABLE ##SALES(PROID INT, SALEID INT, AMOUNT NUMERIC(18,2))
INSERT INTO ##SALES VALUES
(1,1,100),
(1,2,200),
(1,3,300),
(1,4,400),
(1,5,500),
(2,1,100),
(2,2,200),
(2,3,300),
(2,4,400),
(2,5,500),
(3,1,100),
(3,2,200),
(3,3,300),
(3,4,400),
(3,5,500),
(4,1,100),
(4,2,200),
(4,3,300),
(4,4,400),
(4,5,500),
(5,1,100),
(5,2,200),
(5,3,300),
(5,4,400),
(5,5,500)
SELECT * FROM ##PRODUCTS
SELECT * FROM ##SALES
-- Intermediate check: revenue per product ranked within each sale
WITH CTE AS
(
SELECT P.PROID, P.PNAME, S.SALEID, SUM(S.AMOUNT) AS AMOUNT,
       RANK() OVER(PARTITION BY S.SALEID ORDER BY SUM(S.AMOUNT) DESC) AS RN
FROM ##PRODUCTS P JOIN ##SALES S ON P.PROID = S.PROID
GROUP BY P.PROID, S.SALEID, P.PNAME
)
SELECT * FROM CTE

-- Answer: rank products by total revenue within each category and keep the top 5
WITH REVENUE AS
(
SELECT P.PROID, P.PNAME, P.CATEGORYID, SUM(S.AMOUNT) AS AMOUNT,
       RANK() OVER(PARTITION BY P.CATEGORYID ORDER BY SUM(S.AMOUNT) DESC) AS RN
FROM ##PRODUCTS P JOIN ##SALES S ON P.PROID = S.PROID
GROUP BY P.PROID, P.PNAME, P.CATEGORYID
)
SELECT * FROM REVENUE WHERE RN <= 5
--Question 2.
--Find the third lowest price for each product category.
--DROP TABLE ##PRODUCTS
CREATE TABLE ##PRODUCTS(PROID INT, PNAME VARCHAR(50), CATEGORYID INT, AMOUNT INT)
INSERT INTO ##PRODUCTS VALUES
(1,'CPU',1,100),
(2,'RAM',2,100),
(3,'MONITOR',3,200),
(4,'KEYBOARD',4,300),
(5,'MOUSE',5,300)
-- ORDER BY AMOUNT ASC so that RN = 3 is the third LOWEST price
-- (with this sample data each category holds a single product, so the rank is taken across all products)
WITH LOWESTPRICE AS
(
SELECT *,
       DENSE_RANK() OVER(ORDER BY AMOUNT ASC) AS RN
FROM ##PRODUCTS
)
SELECT * FROM LOWESTPRICE WHERE RN = 3
--Question 3.
--Write a query to find the customer with the highest total purchase amount in each region.
DROP TABLE ##CUSTOMERS
CREATE TABLE ##CUSTOMERS(CUSTID INT, CUSTNAME VARCHAR(50), REGION VARCHAR(50))
INSERT INTO ##CUSTOMERS VALUES
(1,'CPU','DELHI'),
(2,'RAM','MUMBAI'),
(3,'MONITOR','PATNA'),
(4,'KEYBOARD','NOIDA'),
(5,'MOUSE','UP'),
(1,'CPU','ADELHI'),
(2,'RAM','MUMBAI'),
(3,'MONITOR','APATNA'),
(4,'KEYBOARD','ANOIDA'),
(5,'MOUSE','UP')
SELECT * FROM ##CUSTOMERS
CREATE TABLE ##ORDERS(ORDID INT, CUSTID INT, AMOUNT NUMERIC(18,2))
INSERT INTO ##ORDERS VALUES
(1,1,100),
(1,2,200),
(1,3,300),
(1,4,400),
(1,5,500),
(2,1,100),
(2,2,200),
(2,3,300),
(2,4,400),
(2,5,500),
(3,1,100),
(3,2,200),
(3,3,300),
(3,4,400),
(3,5,500),
(4,1,100),
(4,2,200),
(4,3,300),
(4,4,400),
(4,5,500),
(5,1,100),
(5,2,200),
(5,3,300),
(5,4,400),
(5,5,500)
WITH CUSTOMERSTOTAL AS
(
SELECT C.CUSTID, C.CUSTNAME, C.REGION, SUM(O.AMOUNT) AS AMT
FROM ##CUSTOMERS C JOIN ##ORDERS O ON C.CUSTID = O.CUSTID
GROUP BY C.CUSTID, C.CUSTNAME, C.REGION
),
RANKED AS
(
SELECT *,
       RANK() OVER(PARTITION BY REGION ORDER BY AMT DESC) AS RN
FROM CUSTOMERSTOTAL
)
SELECT * FROM RANKED WHERE RN = 1

--SQL Interview Questions and Answers

--Write a query to find all employees who are not assigned to any department.

SELECT *
FROM employees
WHERE department_id IS NULL;

--Write a query to retrieve the N-th highest salary from the employee table.

SELECT *
FROM (
    SELECT *, DENSE_RANK() OVER (ORDER BY SALARY DESC) AS salary_rank
    FROM EMPLOYEE
) AS ranked_salaries
WHERE salary_rank = N; -- Replace N with the desired rank

--How would you find employees who have the same salary?

SELECT e1.employee_id, e1.salary
FROM employees e1
JOIN employees e2 ON e1.salary = e2.salary
WHERE e1.employee_id != e2.employee_id;

--Write a query to calculate the total salary for each department.

SELECT department_id, SUM(salary) AS total_salary
FROM employees
GROUP BY department_id;

--How would you retrieve the second lowest salary in a table?

SELECT MIN(salary) AS SecondLowestSalary
FROM employees
WHERE salary > (SELECT MIN(salary) FROM employees);

--Write a query to find all employees who joined before 2010 and have a salary greater than 50,000.

SELECT *
FROM employees
WHERE join_date < '2010-01-01' AND salary > 50000;

--How do you fetch the top 3 highest salaries for each department?

SELECT department_id, salary
FROM (
    SELECT department_id, salary,
           DENSE_RANK() OVER (PARTITION BY department_id ORDER BY salary DESC) AS salary_rank
    FROM employees
) ranked
WHERE salary_rank <= 3;
--Write a query to find all employees who earn more than the average salary of their department.

SELECT e1.employee_id, e1.salary, e1.department_id
FROM employees e1
JOIN (
    SELECT department_id, AVG(salary) AS avg_salary
    FROM employees
    GROUP BY department_id
) e2 ON e1.department_id = e2.department_id
WHERE e1.salary > e2.avg_salary;

--To find duplicates based on NAME and SALARY:

SELECT NAME, SALARY, COUNT(*)
FROM EMPLOYEE
GROUP BY NAME, SALARY
HAVING COUNT(*) > 1;

--Delete duplicate rows (keep only one)

WITH CTE AS (
    SELECT *, ROW_NUMBER() OVER (PARTITION BY NAME, SALARY ORDER BY ID) AS rn
    FROM EMPLOYEE
)
DELETE FROM EMPLOYEE
WHERE ID IN (
    SELECT ID FROM CTE WHERE rn > 1
);

--Write a query to calculate the total number of employees in each department and the average salary.

SELECT department_id, COUNT(*) AS total_employees, AVG(salary) AS avg_salary
FROM employees
GROUP BY department_id;

--Write a query to get the department with the highest total salary.

SELECT department_id, SUM(salary) AS total_salary
FROM employees
GROUP BY department_id
ORDER BY total_salary DESC
LIMIT 1;

PySpark Basics and RDDs

Q1. What is the difference between RDD, DataFrame, and Dataset?
RDDs:
A distributed collection of data elements without a schema. RDDs are slower
than DataFrames and Datasets for simple operations.
DataFrames:
A distributed collection organized into named columns. DataFrames are similar
to relational database tables or Python pandas DataFrames, and they are faster
than RDDs for exploratory analysis and aggregated statistics.
Datasets:
An extension of DataFrames with additional features such as type safety and an
object-oriented interface (available in Scala and Java, not in PySpark). Datasets
are faster than RDDs, combining the performance optimizations of DataFrames
with the convenience of typed objects.
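A minimal sketch (assuming a local SparkSession; the column names are illustrative) contrasting the RDD and DataFrame APIs on the same data:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("rdd_vs_df").getOrCreate()

# RDD: schema-less collection of Python objects, manipulated with lambdas
rdd = spark.sparkContext.parallelize([("Alice", 30), ("Bob", 25)])
print(rdd.map(lambda row: row[1]).sum())        # 55

# DataFrame: the same data with named columns, so Catalyst can optimize the query
df = spark.createDataFrame(rdd, ["name", "age"])
df.groupBy().sum("age").show()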

Q2. How does PySpark achieve parallel processing?

PySpark achieves parallel processing by leveraging Apache Spark's distributed
computing architecture:
1. RDD/DataFrame abstractions
2. Driver and executors
3. Task parallelism
4. Cluster manager integration
5. Lazy evaluation & DAG
6. In-memory computation

In short, PySpark parallelizes work by:
- Distributing data across partitions
- Executing tasks concurrently on worker nodes
- Managing resources via cluster managers
- Optimizing execution through the DAG and lazy evaluation
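A small illustrative sketch (the partition count is arbitrary, and `spark` is the SparkSession from the previous example): the data is split into partitions and each partition is processed by its own task.

rdd = spark.sparkContext.parallelize(range(1_000_000), numSlices=8)
print(rdd.getNumPartitions())        # 8 partitions -> up to 8 concurrent tasks

squares = rdd.map(lambda x: x * x)   # transformation, planned lazily per partition
print(squares.take(5))               # action: tasks run in parallel on the executors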

Q3. Explain lazy evaluation in PySpark with a real-world analogy.

Lazy evaluation in PySpark means that transformations are not executed
immediately when you define them. Instead, Spark waits until an action (like
collect() or count()) is called to actually execute the transformations. This
allows Spark to optimize the whole execution plan for better performance.
Analogy: it is like writing a shopping list. You keep adding items (transformations)
without going to the store; only when you finally go shopping (the action) do you
plan the most efficient route and buy everything in one trip.
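A hedged sketch of the idea (the file and column names are made up): nothing is read or computed until the final action.

df = spark.read.csv("sales.csv", header=True, inferSchema=True)   # no data read yet
filtered = df.filter(df["amount"] > 100)                          # transformation: only recorded in the plan
selected = filtered.select("product", "amount")                   # still nothing executed

selected.count()   # action: Spark builds one optimized plan and runs the whole chain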
Q4. What is SparkContext, and why is it important?

SparkContext is the entry point to Spark functionality in PySpark (or Scala
Spark). It represents the connection between your application and the Spark
cluster.
Role description:
1. Initializes the Spark application: sets up the environment and lets your app
use Spark's capabilities.
2. Connects to the cluster: manages communication with the cluster manager
(e.g., YARN, Standalone).
3. Resource allocation: requests resources (executors, cores, memory) for your
Spark jobs.
4. Job submission: submits jobs and coordinates RDD or DataFrame
transformations/actions.
5. Fault tolerance & lineage: keeps track of RDD lineage for fault recovery.
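A minimal sketch: in modern PySpark the SparkContext is usually obtained through the SparkSession rather than constructed directly.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("demo").getOrCreate()
sc = spark.sparkContext              # the underlying SparkContext

print(sc.applicationId)              # id of the running Spark application
rdd = sc.parallelize([1, 2, 3, 4])   # the low-level RDD API goes through sc
print(rdd.count())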

Q5. How do you handle large file processing in PySpark?

Technique: Purpose
1. Use Parquet/ORC: faster, more efficient reads
2. Partitioning: process only the necessary data
3. Repartition/Coalesce: control parallelism and file count
4. Caching: save repeated computations
5. Filter early: reduce input size
6. Avoid .collect(): prevent memory issues on the driver
7. Broadcast small datasets: optimize joins
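A hedged sketch combining several of these techniques; the path, column names, and partition count are assumptions.

df = (spark.read.parquet("s3://bucket/events/")      # columnar format
        .select("user_id", "event_type", "amount")   # read only the needed columns
        .filter("event_type = 'purchase'"))          # filter early to shrink the input

df = df.repartition(200, "user_id")                  # control parallelism
df.cache()                                           # reuse without recomputation
df.write.mode("overwrite").parquet("s3://bucket/purchases/")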

Q6. What is the difference between actions and transformations in PySpark?

Feature     | Transformations            | Actions
Execution   | Lazy                       | Trigger execution
Return type | New RDD/DataFrame          | Result to driver or storage
Examples    | map(), filter(), select()  | count(), collect(), show()
Purpose     | Define a computation plan  | Execute and get results
Q7. How does Spark handle data partitioning in distributed environments?
Apache Spark uses data partitioning to divide large datasets into smaller
chunks (called partitions) that can be processed in parallel across multiple
nodes in a cluster.
A partition is a logical chunk of data stored in memory or on disk. Each partition
is processed by a single task in a single executor thread.
When you create an RDD from a file or collection, Spark partitions it
automatically. Spark also partitions DataFrames internally (default number of
shuffle partitions: spark.sql.shuffle.partitions = 200).

Hash partitioning: Spark uses a hash function on a key to distribute rows
evenly. Common in joins and aggregations.
Range partitioning: data is divided into ordered ranges. Useful for ordered
or skewed data.
Custom partitioning: you can define your own partition logic using a custom
Partitioner (RDD API only).

Operations that move data across partitions (shuffles):
reduceByKey(): keys are grouped across partitions
join(): matching keys may be on different partitions
coalesce(): (sometimes) reduces the number of partitions
repartition(): redistributes data evenly
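A small sketch of explicit partitioning on a pair RDD (the keys and the custom routing function are illustrative):

pairs = spark.sparkContext.parallelize([("US", 1), ("IN", 2), ("US", 3), ("DE", 4)])

# Hash partitioning (the default partitionFunc is a portable hash of the key)
hashed = pairs.partitionBy(4)

# Custom partitioning: route keys with a user-defined function (RDD API only)
custom = pairs.partitionBy(2, lambda key: 0 if key == "US" else 1)

print(hashed.getNumPartitions())          # 4
print(custom.glom().map(len).collect())   # rows per partition, e.g. [2, 2]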
Q8. Explain the concept of fault tolerance in PySpark.

Mechanism        | Description
Lineage (DAG)    | Rebuilds lost data by reapplying transformations
Task retries     | Failed tasks are retried automatically
Data replication | Relies on the storage layer (e.g., HDFS) for fault-tolerant reads
Checkpointing    | Persists intermediate RDDs to reduce recomputation cost

Q9. How do you broadcast variables in Spark, and when should you use them?
In Spark, broadcast variables are used to efficiently share small, read-only
data (like lookup tables or configuration settings) with all worker nodes,
without sending a copy with every task.
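A minimal sketch of both uses (assuming an existing SparkSession `spark`; the lookup data is made up): an explicit broadcast variable for RDD-style lookups, and broadcast() as a join hint.

from pyspark.sql.functions import broadcast

# 1. Broadcast variable: a read-only dict shipped once to each executor
country_names = {"US": "United States", "IN": "India"}
bc = spark.sparkContext.broadcast(country_names)

codes = spark.sparkContext.parallelize(["US", "IN", "US"])
print(codes.map(lambda c: bc.value.get(c, "unknown")).collect())

# 2. Broadcast join hint: ship the small DataFrame to every executor to avoid a shuffle
small_df = spark.createDataFrame([("US", "United States"), ("IN", "India")], ["code", "name"])
large_df = spark.createDataFrame([(1, "US"), (2, "IN"), (3, "US")], ["order_id", "code"])
large_df.join(broadcast(small_df), "code").show()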
Q10. What are accumulators in PySpark, and how do they differ from
broadcast variables?

Feature          | Accumulators                         | Broadcast variables
Purpose          | Aggregation (e.g., counters, sums)   | Share read-only data with executors
Mutable?         | Tasks can only add values            | Completely read-only
Access           | Only the driver can read the value   | All tasks can read it
Usage in tasks   | Write-only in workers                | Read-only in workers
Common use cases | Metrics, debugging, counting conditions | Lookup tables, configs, small datasets
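A minimal sketch (assuming an existing SparkSession `spark`): tasks can only add to the accumulator, and only the driver reads its value after an action has run.

bad_rows = spark.sparkContext.accumulator(0)   # numeric accumulator, starts at 0

def parse(line):
    try:
        return int(line)
    except ValueError:
        bad_rows.add(1)        # workers may only add
        return None

rdd = spark.sparkContext.parallelize(["1", "2", "oops", "4"])
rdd.map(parse).count()         # an action must run before the value is populated
print(bad_rows.value)          # driver-side read, e.g. 1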

DataFrame and Dataset Operations

Q11. How do you perform data filtering using PySpark DataFrames?
SQL string:       df.filter("age > 25")
Column functions: df.filter(col("age") > 25)
Complex logic:    df.filter((col("age") > 30) & (col("country") == "US"))
Pattern match:    df.filter(col("name").like("A%"))

Q12. What is the difference between repartition() and coalesce(), and when
would you use each?

Feature           | repartition()                            | coalesce()
Operation         | Full shuffle                             | Narrow dependency (no shuffle)
Change partitions | Increase or decrease                     | Only decrease
Cost              | Expensive due to shuffle                 | Cheap, avoids shuffle
Use case          | Improve parallelism, repartition by key  | Reduce partitions, optimize output files

Q13. How do you handle missing or null values in PySpark?

Task                   | Function / Method     | Description
Detect nulls           | .filter(col.isNull()) | Find rows with nulls
Drop rows with nulls   | .dropna()             | Remove rows with nulls
Fill nulls             | .fillna()             | Replace nulls with specified values
Impute values          | Imputer (MLlib)       | Replace nulls with mean/median
Replace in expressions | coalesce()            | Use the first non-null value
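A small sketch putting a few of these together (the column names and fill values are illustrative):

from pyspark.sql.functions import col, coalesce, lit

df = spark.createDataFrame([(1, None, 25), (2, "Bob", None)], ["id", "name", "age"])

df.filter(col("name").isNull()).show()             # detect nulls
df.dropna(subset=["name"]).show()                  # drop rows where name is null
df.fillna({"name": "unknown", "age": 0}).show()    # fill with per-column defaults
df.withColumn("age_filled", coalesce(col("age"), lit(-1))).show()  # first non-null value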

Q14. How can you add a new column to a DataFrame using withColumn()?
from pyspark.sql.functions import lit
df_new = df.withColumn("country", lit("USA"))

Q15. How do you perform a left join between two DataFrames in PySpark?
spark.sql("select a.id, a.name, b.product from cust a left join prod b on a.id = b.id").show()

Q16. What are temporary views in PySpark, and how do they differ from
global temporary views?

A temporary view is scoped to the SparkSession that created it and disappears
when that session ends. A global temporary view is registered in the reserved
global_temp database, is visible to all sessions of the same application, and is
dropped when the application stops.

df.createOrReplaceTempView("temp_view")
spark.sql("SELECT * FROM temp_view WHERE age > 30").show()

Q17. How do you use window functions in PySpark for advanced analytics?
from pyspark.sql.window import Window
from pyspark.sql.functions import col

windowSpec = Window.partitionBy("department").orderBy(col("salary").desc())

Q18. How can you register a UDF (User-Defined Function) in PySpark?

from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

def to_uppercase(s):
    return s.upper() if s is not None else None

# Register the UDF with a return type (for the DataFrame API)
to_upper_udf = udf(to_uppercase, StringType())

# Register the UDF for use in SQL
spark.udf.register("to_uppercase_sql", to_uppercase, StringType())

# DataFrame API
df.withColumn("name_upper", to_upper_udf(col("name"))).show()

# SQL
df.createOrReplaceTempView("people")
spark.sql("SELECT name, to_uppercase_sql(name) AS name_upper FROM people").show()

Q19. What is the difference between persist() and cache()?

Feature         | cache()                                 | persist()
Definition      | Shortcut for .persist(MEMORY_AND_DISK)  | Not a shortcut; you specify the StorageLevel
Storage control | No                                      | Yes
Custom levels   | No                                      | Yes (e.g., MEMORY_ONLY, DISK_ONLY)
Use case        | Default caching needs                   | Advanced control over storage behavior

Q20. How do you read and write data in Parquet, CSV, and JSON formats in PySpark?
Read:
df_parquet = spark.read.parquet("path/to/file.parquet")
df_csv = spark.read.option("header", "true").csv("path/to/file.csv")
df_json = spark.read.json("path/to/file.json")
Write:
df.write.mode("overwrite").parquet("path/to/output_parquet")
df.write.option("header", "true").mode("overwrite").csv("path/to/output_csv")
df.write.mode("overwrite").json("path/to/output_json")

Spark SQL and Query Optimization

Q21. How do you run SQL queries on a DataFrame in PySpark?
df.createOrReplaceTempView("temp_view")
spark.sql("SELECT * FROM temp_view WHERE age > 30").show()

Register temp view:   createOrReplaceTempView("name")
Register global view: createGlobalTempView("name")
Run SQL query:        spark.sql("SQL QUERY")

Q22. What is the purpose of the Catalyst Optimizer in Spark SQL?

The Catalyst Optimizer is the query optimization engine used by Spark SQL.
Its main goal is to automatically optimize queries to improve performance and efficiency.

1. Logical optimization
2. Physical plan optimization
3. Rule-based and cost-based optimization: uses rule-based techniques (static
transformations) and optionally cost-based optimization (CBO) to make smarter choices.
4. Extensibility

Benefits: automatic optimization (you don't need to tune manually), improved
performance for complex SQL/DataFrame queries, and extensibility for custom
logic in enterprise environments.
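You can inspect what Catalyst produced with explain(); a hedged example (the DataFrame is illustrative):

df = spark.range(1_000).withColumnRenamed("id", "amount")
q = df.filter("amount > 10").select("amount")

q.explain(True)   # prints the parsed, analyzed, and optimized logical plans plus the physical plan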

Q23. How do you handle schema inference when reading data from external sources?

PySpark tries to infer the schema by scanning the data when you use
.option("inferSchema", "true") (mainly for CSV and JSON).

df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("path/to/file.csv")

df = spark.read \
    .option("inferSchema", "true") \
    .json("path/to/file.json")

Manual schema definition (recommended for large data):
You can define a StructType schema to explicitly specify data types and improve performance.

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True)
])

Parquet & ORC: schema is embedded
For Parquet and ORC files, the schema is already embedded in the file format:
df = spark.read.parquet("path/to/file.parquet")

Q24. What are the different join types in Spark SQL, and when would you use each?

# Inner join: only rows with matching keys in both tables
spark.sql("select a.id, a.name, b.product from cust a join prod b on a.id = b.id").show()
# Left join: all rows from the left table, matched rows from the right
spark.sql("select a.id, a.name, b.product from cust a left join prod b on a.id = b.id").show()
# Right join: all rows from the right table, matched rows from the left
spark.sql("select a.id, a.name, b.product from cust a right join prod b on a.id = b.id").show()
# Full join: all rows from both tables
spark.sql("select a.id, a.name, b.product from cust a full join prod b on a.id = b.id").show()
# Left anti join: rows from the left table with NO match on the right
# e.g., Tbl1 = (1,2,3), Tbl2 = (1,3) -> LEFT ANTI JOIN returns only (2)
spark.sql("select a.id, a.name from cust a LEFT ANTI JOIN prod b on a.id = b.id").show()
# Left semi join: rows from the left table that DO have a match on the right
# e.g., Tbl1 = (1,2,3), Tbl2 = (1,3) -> LEFT SEMI JOIN returns (1,3)
spark.sql("select a.id, a.name from cust a LEFT SEMI JOIN prod b on a.id = b.id").show()
# Cross join: Cartesian product (Tbl1 x Tbl2)
spark.sql("select a.id, a.name from cust a CROSS JOIN prod b").show()

Q25. How do you create a persistent table in Spark SQL?

This stores both the data and the schema persistently in the metastore:
df.write.mode("overwrite").saveAsTable("employee")

Q26. How does dynamic partition pruning improve query performance?

Static partition pruning: prunes partitions before the query starts (e.g.,
WHERE region = 'US').
Dynamic partition pruning (DPP): prunes partitions during execution, based on
values coming from another table (typically the dimension side of a join).

spark.conf.set("spark.sql.optimizer.dynamicPartitionPruning.enabled", "true")

Q27. Explain how to use broadcast joins to optimize query performance.

from pyspark.sql.functions import broadcast

# 'small_df' is the smaller table, 'large_df' is the big one
joined_df = large_df.join(broadcast(small_df), "join_key")

Q28. What is data skew, and how do you handle it in Spark SQL?

Data skew occurs when a few key values hold a disproportionate share of the rows,
so some partitions (and their tasks) are much larger and slower than the others.
1. Salting keys
Add a random prefix/suffix ("salt") to skewed keys to spread the data across
multiple partitions, then join on the salted key. The skewed side gets a random
salt, and the other side is replicated once per salt value so every match is preserved.

from pyspark.sql.functions import array, col, concat, explode, floor, lit, rand

# Salt the large (skewed) side with a random value 0-9
df1_salted = (df1
    .withColumn("salt", floor(rand() * 10).cast("string"))
    .withColumn("salted_key", concat(col("join_key"), lit("_"), col("salt"))))

# Replicate the small side once per salt value so every salted key can find its match
df2_salted = (df2
    .withColumn("salt", explode(array(*[lit(str(i)) for i in range(10)])))
    .withColumn("salted_key", concat(col("join_key"), lit("_"), col("salt"))))

# Join on salted_key instead of join_key
result = df1_salted.join(df2_salted, "salted_key")

2. Broadcast Join
If one table is small, broadcast it to avoid shuffling and reduce skew impact.
from pyspark.sql.functions import broadcast
result = large_df.join(broadcast(small_df), "join_key")

3. Increase Shuffle Partitions


Increase spark.sql.shuffle.partitions to spread skewed keys over more
partitions.
spark.conf.set("spark.sql.shuffle.partitions", 500)

4. Skew Join Optimization (Spark 3.0+) Adaptive Query Execution (AQE)


Spark 3+ supports adaptive query execution (AQE) with built-in skew join
optimization that detects skew and splits large partitions automatically.
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")

5. Filter or aggregate early
Reduce skew by filtering or aggregating data before the join to minimize skewed keys.

6. spark_partition_id()
In Apache Spark, spark_partition_id() is a built-in function that returns the
partition ID (an integer) of the row it is associated with. It is particularly
useful for debugging, understanding data distribution, and optimizing
performance.

from pyspark.sql.functions import spark_partition_id
df = spark.range(0, 10).repartition(3)
df.withColumn("partition_id", spark_partition_id()).show()

Technique                   | When to use
Salting                     | Manual control when you know the skewed keys
Broadcast join              | When one table is small
Increase shuffle partitions | To increase parallelism
Adaptive Query Execution    | Spark 3+ automatic skew handling
Early filtering/aggregation | Reduce skewed data volume before the join
spark_partition_id()        | After a join, to check for skewed partitions

Q29. How can you perform aggregations using SQL queries on large datasets?

When working with large datasets, aggregations are common operations such as
SUM, COUNT, AVG, MIN, MAX, and GROUP BY. Spark SQL is designed to handle these
efficiently even at scale, by computing partial aggregates on each partition and
then shuffling only those partial results.
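A minimal sketch (the path, table, and column names are assumptions): register a view and let Spark SQL run the grouped aggregation in a distributed way.

df = spark.read.parquet("s3://bucket/sales/")
df.createOrReplaceTempView("sales")

spark.sql("""
    SELECT region,
           COUNT(*)    AS orders,
           SUM(amount) AS total_amount,
           AVG(amount) AS avg_amount
    FROM sales
    GROUP BY region
    HAVING SUM(amount) > 10000
    ORDER BY total_amount DESC
""").show()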

Q30. How do you enable query caching in Spark SQL?

Method                    | How to enable                           | When to use
SQL CACHE TABLE           | CACHE TABLE tableName;                  | Cache tables for repeated SQL queries
PySpark .cache()          | df.cache()                              | Cache DataFrames in Spark applications
Persist with StorageLevel | df.persist(StorageLevel)                | Customize cache storage behavior
AQE cache                 | Set configs for adaptive query caching  | Automatic optimization in Spark 3.2+

Data Pipeline Scenarios and Real-World Use Cases
Q31. How would you build an ETL pipeline using PySpark?
Key ETL Steps with PySpark

1. Extract – Read data from external sources


2. Transform – Clean, filter, join, aggregate, enrich data
3. Load – Write the final dataset to storage (Parquet, S3, Hive, etc.)

from pyspark.sql import SparkSession


from pyspark.sql.functions import col, to_date

# Step 1: Initialize Spark Session


spark = SparkSession.builder \
.appName("ETL Pipeline Example") \
.getOrCreate()

# Step 2: Extract - Load raw data from CSV


raw_df = spark.read.option("header", True).csv("s3://your-bucket/raw/sales.csv")

# Step 3: Transform - Clean and prepare the data


transformed_df = raw_df \
.withColumn("sales_amount", col("sales_amount").cast("double")) \
.withColumn("date", to_date(col("date"), "yyyy-MM-dd")) \
.filter(col("sales_amount") > 0)

# Optional: Join with product/dimension data


products_df = spark.read.parquet("s3://your-bucket/dim/products/")
final_df = transformed_df.join(products_df, on="product_id", how="left")

# Step 4: Load - Write cleaned data to S3 in Parquet format
final_df.write.mode("overwrite").partitionBy("date").parquet("s3://your-bucket/processed/sales/")

# Stop Spark session


spark.stop()

Q32. How do you handle real-time data processing with


Structured Streaming in PySpark?

from pyspark.sql import SparkSession


from pyspark.sql.functions import expr

# 1. Create Spark session


spark = SparkSession.builder \
.appName("RealTimeETL") \
.getOrCreate()

# 2. Read streaming data from Kafka


df = spark.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "sales_topic") \
.load()

# 3. Transform - Parse Kafka value and extract fields


sales_df = df.selectExpr("CAST(value AS STRING) as json_data") \
.selectExpr("from_json(json_data, 'product_id INT, sales_amount DOUBLE,
ts STRING') as data") \
.select("data.*")

# 4. Optional aggregation
agg_df = sales_df.groupBy("product_id").sum("sales_amount")
# 5. Write to output sink (console or storage)
query = agg_df.writeStream \
.outputMode("complete") \
.format("console") \
.option("truncate", "false") \
.trigger(processingTime="10 seconds") \
.start()

query.awaitTermination()

Triggers:
.trigger(processingTime="10 seconds")  # every 10 seconds
.trigger(once=True)                    # one-time batch for debugging

Fault Tolerance
.writeStream.option("checkpointLocation", "/tmp/checkpoints/")

Q33. What are the best practices for partitioning data in large
datasets?

1. Partition by Frequently Queried Columns


df.write.partitionBy("country", "year").parquet("s3://your-bucket/sales/")

2. Avoid Over-Partitioning and Small Files


df.coalesce(10).write.parquet("path/")

3. Use Repartitioning for Data Shuffle Efficiency


df = df.repartition(100, "customer_id")

4. Use .coalesce() to Reduce Partition Count Before Writing


df.coalesce(1).write.parquet("path/")

5. Choose Cardinality Wisely


Avoid partitioning by high-cardinality columns like user_id or transaction_id;
this leads to too many tiny partitions.

Prefer low to medium cardinality columns like:


country,year,region,event_type

6. Use Bucketing for Efficient Joins


CREATE TABLE sales_bucketed
USING parquet
CLUSTERED BY (customer_id) INTO 100 BUCKETS;

7. Leverage Partition Pruning


SELECT * FROM sales WHERE year = 2024 AND country = 'US';

8. Monitor and Tune with Spark UI


Tune spark.sql.shuffle.partitions (default: 200)

Technique         | When to use                           | Benefit
partitionBy()     | Writing data to storage               | Enables pruning, efficient reads
repartition()     | Before joins, to increase parallelism | Improves shuffle-based operations
coalesce()        | Before writing output                 | Combines partitions, fewer files
Bucketing         | For repeated joins on a key           | Faster joins without reshuffling
Partition pruning | Filtering on partition columns        | Reads only the required data

Q34. How would you debug and optimize a slow-running


Spark job?
1. Check Spark UI
URL: Usually at http://<driver-node>:4040

2. Identify Expensive Operations
Common causes of slowness:
Wide transformations (e.g., join, groupBy, distinct)
Large shuffles (data moved between nodes)
Skewed data (some tasks take much longer)
df.explain(True)
# (Scala only) df.queryExecution.debug.codegen() shows the generated code

3. Check for Data Skew


Use .groupBy("key").count().orderBy("count", ascending=False) to detect
skewed

4. Broadcast Joins for Small Tables


df1.join(broadcast(df2), "id")

5. Optimize Shuffles
spark.conf.set("spark.sql.shuffle.partitions", 200) # default, increase or
decrease based on data size

6. Cache/Persist Intermediate Results


df.cache() # or df.persist(StorageLevel.MEMORY_AND_DISK)
df.count() # trigger caching

7. Avoid Unnecessary Collect/Show


Use .limit(n).show() instead for sampling.

8. Tune Resource Allocation


--executor-memory 4G
--executor-cores 4
--num-executors 50

spark.conf.set("spark.dynamicAllocation.enabled", "true")

9. Enable Adaptive Query Execution (AQE)


spark.conf.set("spark.sql.adaptive.enabled", "true")

10. Profile with Spark History Server


http://<your-cluster>/history

Q35. How do you handle schema evolution in PySpark


pipelines?
1. Enable Schema Merging (for Parquet/ORC)
df = spark.read.option("mergeSchema",
"true").parquet("s3://path/to/parquet/")

2. Use Delta Lake for Robust Schema Evolution


Delta Lake (on Databricks or open source) supports automatic schema
evolution.
new_data.write \
.format("delta") \
.option("mergeSchema", "true") \
.mode("append") \
.save("/mnt/delta/sales/")
3. Infer Schema Dynamically (for Semi-Structured Data)
df = spark.read \
.option("inferSchema", "true") \
.json("s3://path/json/")

4. Define and Update Explicit Schemas


from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema_v2 = StructType([
StructField("id", IntegerType(), True),
StructField("name", StringType(), True),
StructField("email", StringType(), True) # new column
])
df = spark.read.schema(schema_v2).json("path")

5. Handle Nulls and Defaults for Missing Fields


df = df.withColumn("email", coalesce(col("email"),
lit("[email protected]")))

6. Monitor and Validate Schema Changes


.schema.json() to save schema versions.

7. Backfill Historical Data (Optional)


If schema changes are breaking (e.g., renaming a column):
Consider backfilling historical data to the new schema.
Or maintain versioned data models (v1, v2 folders or tables).

Q36. What is the role of checkpointing in Spark Streaming?

Checkpointing in Spark Streaming (including Structured Streaming) is a


critical mechanism that enables fault tolerance, state recovery, and state
management during stream processing.

Types of Checkpointing in Spark


Metadata Checkpointing:-
Saves streaming job progress (e.g., offsets, batch IDs).Required for Structured
Streaming.

Data Checkpointing:-
Saves the RDD lineage and data to avoid recomputation.
Mainly used in DStream-based Spark Streaming (less common now).

How to Enable Checkpointing in Structured Streaming

query = df.writeStream \
    .format("parquet") \
    .outputMode("append") \
    .option("checkpointLocation", "s3://my-bucket/checkpoints/job1/") \
    .start("s3://my-bucket/output/")

The checkpoint directory must be reliable and durable (e.g., HDFS, S3).

Purpose             | Description
Fault recovery      | Recovers the stream from failures by storing metadata and data state
Stateful operations | Required for operations like updateStateByKey, mapGroupsWithState
Progress tracking   | Tracks offsets (Kafka, file source), watermarks, and batch info

Q37. How can you implement incremental data processing in


PySpark?

Common Strategies for Incremental Processing

1. Using Timestamps or Date Columns


Assumption: Your source data has a column like
last_updated, created_at, or ingestion_date.

# Last processed timestamp, from metadata store


last_timestamp = "2025-05-20 00:00:00"

# Filter new/updated records


new_data = df.filter(df["last_updated"] > lit(last_timestamp))

2. Using Watermarking in Structured Streaming

df = spark.readStream \
.format("kafka") \
.option("subscribe", "orders") \
.load()

from pyspark.sql.functions import window

parsed = df \
.withWatermark("event_time", "10 minutes") \
.groupBy(window("event_time", "5 minutes")) \
.count()

3. Delta Lake’s Change Data Feed (CDF) 🔁


If using Delta Lake, enable CDF to get only updated/new/deleted rows:
df = spark.read.format("delta") \
.option("readChangeData", "true") \
.option("startingVersion", 23) \
.load("/delta/orders/")
4. Using Surrogate Keys or Auto-Increment IDs
last_processed_id = 10250
new_data = df.filter(df["id"] > last_processed_id)

Store the last processed ID externally.


Useful when data is strictly append-only.

5. Compare Against Existing Target Table (Merge)


Use merge (upsert) to load only new/changed rows into a target:
from delta.tables import DeltaTable
target = DeltaTable.forPath(spark, "/delta/customers/")

target.alias("t").merge(
source=new_data.alias("s"),
condition="t.id = s.id"
).whenMatchedUpdateAll() \
.whenNotMatchedInsertAll() \
.execute()

Real-World Use Case Example
Scenario: You want to process only new orders added daily to a Parquet file.
last_processed_date = "2024-05-19"

df = spark.read.parquet("s3://bucket/orders/")
incremental_df = df.filter(df["order_date"] > last_processed_date)

# Process and write


incremental_df.write.parquet("s3://bucket/processed_orders/",
mode="append")

# Update last_processed_date in metadata store

Q38. How do you handle large joins between multiple


DataFrames?

1. Broadcast Joins (for Small Tables)


from pyspark.sql.functions import broadcast
result = large_df.join(broadcast(small_df), "join_key")

2. Repartition Before Join


Ensure both DataFrames are partitioned on the join key to reduce shuffle
skew.
df1 = df1.repartition("join_key")
df2 = df2.repartition("join_key")
joined_df = df1.join(df2, "join_key")

3. Use Bucketing (For Hive Tables)


CREATE TABLE t1 (...) CLUSTERED BY (key) INTO 50 BUCKETS;

4. Avoid Cross Joins Unless Necessary


df1.crossJoin(df2).filter("df1.col = df2.col")
5. Skew Join Handling (When Keys Are Uneven)

Add a salt key (e.g., key + rand()) to spread out skewed data.
Use salting or enable AQE skew join handling (in Spark 3.0+):

spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")

6. Join in Stages (Multi-Table Join Strategy)


temp = df1.join(df2, "key1")
result = temp.join(df3, "key2")

7. Use SQL for Complex Joins


df1.createOrReplaceTempView("orders")
df2.createOrReplaceTempView("customers")
df3.createOrReplaceTempView("products")

spark.sql("""
SELECT o.*, c.name, p.name
FROM orders o
JOIN customers c ON o.cust_id = c.id
JOIN products p ON o.prod_id = p.id
""")

8. Tune Configurations

Setting                              | Purpose
spark.sql.shuffle.partitions         | Controls the number of shuffle partitions
spark.sql.autoBroadcastJoinThreshold | Max size (bytes) for broadcast
spark.sql.adaptive.enabled           | Enables Adaptive Query Execution
spark.sql.adaptive.skewJoin.enabled  | Handles skewed joins automatically

Q39. What is the difference between batch processing and stream processing in Spark?

Feature        | Batch processing                            | Stream processing
Nature of data | Processes finite/static data                | Processes continuous/infinite data
Input source   | Files (Parquet, CSV, etc.), databases       | Kafka, sockets, files, etc.
Execution mode | Runs as a job, ends when data is processed  | Runs continuously, processing data in real time

Q40. How would you secure sensitive data in a PySpark


pipeline?

1. Data Encryption
At Rest
Enable encryption on storage systems like:
Amazon S3 (SSE-S3, SSE-KMS)
HDFS transparent encryption
Azure Data Lake encryption
Use columnar formats like Parquet with compression (GZIP/Snappy); the encryption itself comes from the storage layer.
In Transit

Enable SSL/TLS when transferring data:


Between Spark and Kafka, S3, JDBC, etc.
Use spark.ssl.enabled for Spark encryption.
2. Masking and Tokenization
Use data masking to obscure sensitive fields (e.g., SSNs, emails):
from pyspark.sql.functions import sha2, col
df = df.withColumn("email_hash", sha2(col("email"), 256))
3. Column-Level Encryption (Custom)
Encrypt sensitive columns before writing:
from cryptography.fernet import Fernet

key = Fernet.generate_key()
cipher = Fernet(key)
@udf("string")
def encrypt(value):
return cipher.encrypt(value.encode()).decode()
df = df.withColumn("ssn_encrypted", encrypt(col("ssn")))

Store encryption keys in a secure vault (e.g., AWS KMS, Azure Key Vault,
HashiCorp Vault).
4. Access Control
Use Role-Based Access Control (RBAC):
On data storage (S3, ADLS, Hive, etc.)
On Databricks / Spark clusters
Apply fine-grained access control via:
Apache Ranger (for HDFS, Hive, etc.)
Unity Catalog (Databricks)
5. Auditing and Logging
Log:

Who accessed data
What data was read or written
When and where the access occurred
Use audit logs from:

Spark history server


Cloud providers (AWS CloudTrail, Azure Monitor)
Access gateways (e.g., Lake Formation)
6. Data Governance and Classification
Tag sensitive columns in metadata catalogs like:
AWS Glue Data Catalog
Apache Atlas
Unity Catalog (Databricks)
Define policies based on sensitivity level (e.g., PII, HIPAA).

7. DevSecOps Practices
Don't hardcode credentials in scripts.
Use secrets managers:
spark.conf.set("spark.hadoop.fs.s3a.access.key", ...) via environment vars or
secret scopes.
Encrypt logs and control log verbosity.
Security measure        | Technique / Tool
Encryption (at rest)    | S3/KMS, HDFS encryption
Encryption (in transit) | TLS/SSL in Spark, Kafka, JDBC
Data masking/tokenizing | sha2(), custom UDFs
Access control          | RBAC, Apache Ranger, Unity Catalog
Auditing                | Cloud logs, Spark audit logs
Secrets management      | AWS Secrets Manager, Databricks secrets

Advanced PySpark Features


Q41. How do you handle large datasets in PySpark to
optimize performance and reduce memory usage?

1. Use Efficient Data Formats

Parquet or ORC are columnar storage formats optimized for Spark.


They provide better compression and faster I/O compared to formats like
CSV or JSON.
df.write.parquet("path/to/output.parquet")

2. Partitioning
Use repartition(n) to increase partitions (e.g., after a wide transformation).
Use coalesce(n) to reduce the number of partitions (e.g., before writing).

df = df.repartition(100, "col1") # Better parallelism


df = df.coalesce(10) # Reduce shuffles before write

3. Cache and Persist


Cache intermediate DataFrames if reused multiple times to avoid
recomputation.
Use .cache() or .persist(storage_level) only when needed.
df.persist(StorageLevel.MEMORY_AND_DISK)
4. Avoid Wide Transformations
Wide transformations (like groupByKey, join, distinct, repartition) trigger
shuffling.
Prefer reduceByKey or aggregateByKey instead of groupByKey.
rdd.reduceByKey(lambda x, y: x + y) # More efficient than groupByKey

5. Use Broadcast Join


If one dataset is small, broadcast it to all nodes to avoid shuffle-heavy joins.
from pyspark.sql.functions import broadcast
df = large_df.join(broadcast(small_df), "key")

# Read the small and large datasets, then broadcast the small one in the join
small_df = spark.read.csv("small_dataset.csv", header=True, inferSchema=True)
large_df = spark.read.csv("large_dataset.csv", header=True, inferSchema=True)
joined_df = large_df.join(broadcast(small_df), "key_column")

6. Column Pruning & Filter Pushdown


Read only required columns and apply filters early using predicate
pushdown.
spark.read.parquet("path").select("col1", "col2").filter("col1 > 100")

7. Avoid Collecting Large Data to Driver


Avoid using .collect() or .toPandas() on large datasets as it can crash the
driver.
Use .show(), .take(n) or .limit(n) for previewing.
df.limit(10).toPandas()

8. Optimize Joins
Ensure the join keys are distributed and avoid skewed joins.
Use salting or skew join hints when facing data skew.
df1.join(df2.hint("skew"), "key")

9. Use UDFs Wisely


Avoid Python UDFs due to serialization and performance overhead.
Prefer Spark built-in functions (pyspark.sql.functions) or Pandas UDFs.
from pyspark.sql.functions import col, upper
df = df.withColumn("name_upper", upper(col("name")))

10. Resource Tuning


Tune Spark configuration:

--executor-memory 4G
--executor-cores 4
--num-executors 10

# Example of configuring Spark settings in the SparkSession


spark = SparkSession.builder \
.appName("OptimizationExample") \
.config("spark.executor.memory", "4g") \
.config("spark.executor.cores", "4") \
.config("spark.driver.memory", "4g") \
.getOrCreate()

# Example of caching and partitioning


df = spark.read.csv("data.csv", header=True, inferSchema=True) # Read data
df.cache() # Cache the DataFrame
df_partitioned = df.repartition(100, "key_column")  # repartition by key


Q42. What is the purpose of Delta Lake, and how does it


improve reliability?

Delta Lake is an open-source storage layer that brings ACID transactions,


schema enforcement, and time travel to big data workloads on Apache Spark
and data lakes (like S3, ADLS, etc.).

Purpose of Delta Lake


Reliable, scalable big data pipelines
Transactional consistency on top of distributed storage
Unified batch and streaming data processing
1. ACID Transactions
Ensures atomicity, consistency, isolation, and durability even across multiple
writers
df.write.format("delta").mode("append").save("/path/to/delta-table")

2. Schema Enforcement & Evolution


Prevents bad data from corrupting tables with strict schema checks.
Supports schema evolution (e.g., adding new columns).
spark.read.format("delta").load("/path").printSchema()

3. Time Travel
Access previous versions of data using versioning or timestamps.
Useful for debugging, rollback, and reproducibility.
delta_table = DeltaTable.forPath(spark, "/path")
delta_table.history() # Show all versions
spark.read.format("delta").option("versionAsOf", 3).load("/path")

4. Unified Batch + Streaming


Enables a single table to support both streaming reads and batch writes,
improving consistency across pipelines.
spark.readStream.format("delta").load("/path")
5. Data Quality with Constraints
You can define constraints like NOT NULL, CHECK, etc.
Ensures data correctness at the write level.

6. Efficient Upserts and Deletes (MERGE)


Simplifies slow-changing dimension updates and deduplication.
from delta.tables import DeltaTable

deltaTable = DeltaTable.forPath(spark, "/path")


deltaTable.alias("target").merge(
source_df.alias("source"),
"target.id = source.id"
).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

7. Scalable Metadata Handling


Delta Lake uses transaction logs (stored as _delta_log) rather than relying on
file listings, making it scalable for tables with millions of files

Q43. How do you enable time travel queries using Delta


Lake?

Delta Lake allows you to query past versions of a table using:

versionAsOf — specify a version number


timestampAsOf — specify a timestamp

1. Using versionAsOf
df = spark.read.format("delta") \
.option("versionAsOf", 5) \
.load("/path/to/delta-table")

2. Using timestampAsOf
df = spark.read.format("delta") \
.option("timestampAsOf", "2024-05-20T10:00:00") \
.load("/path/to/delta-table")

3. View Table History


from delta.tables import DeltaTable

delta_table = DeltaTable.forPath(spark, "/path/to/delta-table")


delta_table.history().show(truncate=False)

4. Notes
Delta Lake stores all changes as incremental commits in the _delta_log/
directory.
Older data is retained by default for 30 days, but this is configurable with the
data retention period
spark.databricks.delta.retentionDurationCheck.enabled = false

5. Optional: Clean Up Old Versions


# Remove files no longer needed for time travel (older than 7 days)
spark.sql("VACUUM delta.`/path/to/delta-table` RETAIN 168 HOURS")

Q44. How do you handle complex aggregations using window


functions?

1. Running Totals / Cumulative Sum


from pyspark.sql.window import Window
from pyspark.sql.functions import sum

window_spec = Window.partitionBy("customer_id").orderBy("transaction_date") \
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
df = df.withColumn("running_total", sum("amount").over(window_spec))

2. Moving Average
from pyspark.sql.functions import avg

window_spec = Window.partitionBy("customer_id").orderBy("transaction_date") \
    .rowsBetween(-2, 0)  # 3-row moving average
df = df.withColumn("moving_avg", avg("amount").over(window_spec))

3. Row Number / Ranking / Dense Ranking


from pyspark.sql.functions import row_number, rank, dense_rank

window_spec = Window.partitionBy("category").orderBy("sales")

df = df.withColumn("row_num", row_number().over(window_spec)) \
.withColumn("rank", rank().over(window_spec)) \
.withColumn("dense_rank", dense_rank().over(window_spec))

4. Lag/Lead for Value Comparison


from pyspark.sql.functions import lag, lead

window_spec = Window.partitionBy("user_id").orderBy("event_time")

df = df.withColumn("prev_val", lag("score", 1).over(window_spec)) \


.withColumn("next_val", lead("score", 1).over(window_spec))

5. Detecting Change Points or Gaps


from pyspark.sql.functions import col, lag, when

window_spec = Window.partitionBy("user_id").orderBy("event_time")
df = df.withColumn("prev_status", lag("status").over(window_spec)) \
    .withColumn("status_changed", when(col("status") != col("prev_status"), 1).otherwise(0))

6. First and Last Value


from pyspark.sql.functions import first, last

window_spec = Window.partitionBy("department").orderBy("date")

df = df.withColumn("first_sale", first("sale").over(window_spec)) \
.withColumn("last_sale", last("sale").over(window_spec))

Function Description
`row_number()` Unique row number per partition
`rank()` Ranking with gaps
`dense_rank()` Ranking without gaps
`lag()` Value from a previous row
`lead()` Value from a following row
`sum()` Cumulative or windowed sum
`avg()` Moving or group average
`first()` First value in the window
`last()` Last value in the window

Q45. What are stateful operations in Spark Structured


Streaming?

Key Characteristics of Stateful Operations


State is maintained in memory and periodically checkpointed to ensure fault
tolerance.
Requires watermarks and timeout configurations to prevent unbounded state
growth.
Involves grouping, windowing, or matching events over time.

Examples of Stateful Operations
1. Group-based Aggregations (with Time Window)
from pyspark.sql.functions import window, sum

df.groupBy(
window("event_time", "10 minutes"),
"user_id"
).agg(sum("amount"))

2. Streaming Joins (between two streams)


stream1.join(stream2, "id") # Requires watermarking
3. FlatMapGroupsWithState (arbitrary stateful processing)
# Conceptual sketch only: flatMapGroupsWithState is the Scala/Java API;
# PySpark exposes the equivalent applyInPandasWithState (Spark 3.4+).
def update_state(user_id, rows, state):
    # custom logic here: read/update the per-key state, emit output rows
    ...

4. Deduplication
df.dropDuplicates(["user_id", "event_time"])

5. Role of Watermarking
Watermarking helps limit state size by specifying the maximum expected
lateness of data.
df.withWatermark("event_time", "15 minutes")


Q46. How do you implement error handling and retries in


PySpark jobs?

Implementing robust error handling and retry logic in PySpark jobs is


essential for production-grade data pipelines. Here’s how you can structure it
across different components of a PySpark job:
1. Use Try-Except Blocks in Driver Code
try:
    df = spark.read.parquet("/input/path")
    result = df.groupBy("category").count()
    result.write.mode("overwrite").parquet("/output/path")
except Exception as e:
    print(f"Job failed: {e}")
    # Optionally send an alert or write the error to a log

2. Implement Retries with Exponential Backoff


import time
import random

def retry_operation(func, retries=3):
    for i in range(retries):
        try:
            return func()
        except Exception as e:
            print(f"Retry {i + 1} failed: {e}")
            time.sleep(2 ** i + random.random())  # exponential backoff
    raise Exception("All retries failed.")

3. Handle Errors in UDFs Carefully


from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def safe_parse(value):
    try:
        return complex_parsing_logic(value)
    except Exception:
        return None  # or "error"

safe_parse_udf = udf(safe_parse, StringType())
df = df.withColumn("parsed", safe_parse_udf(df["raw_col"]))

4. Use Accumulators or Logs for Error Tracking


from pyspark.accumulators import AccumulatorParam

error_count = spark.sparkContext.accumulator(0)

def parse_and_count(value):
    try:
        return int(value)
    except ValueError:
        error_count.add(1)
        return None

udf_parse = udf(parse_and_count)
df = df.withColumn("parsed", udf_parse(df["col"]))

5. Validate Data Early


expected_schema = StructType([...])
df = spark.read.schema(expected_schema).json("/data/path")

if df.filter("col IS NULL").count() > 0:


raise ValueError("Null values found in critical column.")

6. Checkpoints and Recovery for Streaming


query.writeStream \
    .format("delta") \
    .option("checkpointLocation", "/checkpoints/stream1") \
    .start("/output/path")

7. Leverage workflow orchestration for retries (e.g., an Airflow PythonOperator)


PythonOperator(
task_id='spark_job',
python_callable=run_spark_job,
retries=3,
retry_delay=timedelta(minutes=2),
)

Area Strategy
Driver code Try-except with logging
External systems Retry with exponential backoff
UDFs Safe exception handling inside logic
Streaming Use check pointing and watermarking
Data quality Validate schema and critical fields early
Workflow orchestration Handle retries and notifications externally

Q47. How do you monitor and manage Spark clusters using


Spark UI?
The Spark UI is a web-based tool that provides detailed insights into:

Job and stage execution


Task-level metrics
Memory usage
Storage and caching
Executors
SQL query plans

1. Local Mode or Standalone Cluster


Default URL: http://localhost:4040
2. YARN (Yet Another Resource Negotiator)
In YARN mode, the Spark UI is linked from the YARN Resource Manager UI
under the "Tracking URL".

3. Databricks
Available as part of the "Spark UI" tab inside each job/run.

Key Spark UI Tabs


1. Jobs
2. Stages
3. Tasks
4. Storage
5. Environment
6. Executors
7. SQL (if using Spark SQL)

Q48. What is the difference between SparkSession and SparkContext?

Use SparkSession in modern Spark applications (especially with DataFrames, SQL, Delta Lake, etc.).
Use SparkContext only when working directly with RDDs or for low-level operations.

Feature             | SparkContext                       | SparkSession
Introduced in       | Spark 1.x                          | Spark 2.0
Purpose             | Entry point for low-level RDD APIs | Unified entry point for all Spark APIs
Supports RDDs       | Yes                                | Yes (via spark.sparkContext)
Supports DataFrames | No                                 | Yes
Supports SQL        | No                                 | Yes
Encapsulates        | N/A                                | SparkContext, SQLContext, HiveContext
Recommended in      | Legacy RDD-based code              | Modern Spark apps (especially DataFrame-based)
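A short sketch of how the two relate in practice (the file name is a placeholder):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("modern_app").getOrCreate()

df = spark.read.json("events.json")    # DataFrame / SQL APIs live on the SparkSession
sc = spark.sparkContext                # the wrapped SparkContext is still available
rdd = sc.textFile("events.json")       # the low-level RDD API goes through the SparkContext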


Q49. How do you handle late-arriving data in Spark Structured Streaming?

Handling late data (event-time processing)

Causes of late-arriving data in Kafka:
1. Network and producer delays: high network latency, or reconnections and
retries in the producer, can delay message delivery.
2. Broker overload: overloaded Kafka brokers or slow replication can introduce
processing delays.
3. Upstream delays: latency in upstream systems or IoT devices can make events
arrive late in Kafka.

Event time vs processing time in Structured Streaming:
Event time is the timestamp carried inside the record itself (when the event
happened); processing time is the wall-clock time at which Spark processes the
record. Late data is data whose event time is much earlier than its processing time.

Example Kafka message:
{
  "timestamp": "2025-11-22T08:52:10.000+00:00",
  "userid": "123",
  "item": "headphones",
  "quantity": 1
}

Understanding the State Store in Spark Structured Streaming

1. What is a State Store?
The State Store is a key-value store used by Spark to persist and manage the
state for each micro-batch in a streaming query. This state is updated with
each batch and saved for future use.

Example use case:
In a windowed aggregation, the state store keeps track of partial results for
each window until the window closes.

2. How it works
Each streaming query operates in micro-batches, following these steps:
1) Input data: data is read and processed.
2) Query existing state: the state store is queried for existing state.
3) State update: the state is updated based on the new data.
4) Output results: results are written to the output sink.
5) Persist state: the updated state is saved for use in subsequent micro-batches.

Automatic state cleanup:
Spark automatically removes old state based on the watermark, which defines
when data is considered late and no longer affects the state.

provider_class = spark.conf.get("spark.sql.streaming.stateStore.providerClass")

Background: what is RocksDB in Spark Structured Streaming?
RocksDB is an embedded key-value store designed for high-performance reads and writes.
In the context of Spark Structured Streaming, it serves as a powerful
alternative to the default file-based state store. By leveraging RocksDB, Spark
can significantly boost the performance of stateful computations like
aggregations and joins, particularly under heavy workloads.
This is achieved by minimizing disk I/O overhead, making RocksDB an excellent
choice for handling large states or high-throughput streaming queries.
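A hedged sketch of handling late events with a watermark (the topic, field names, and thresholds are assumptions): events that arrive more than 10 minutes after the watermark are dropped, and the corresponding window state is cleaned up.

from pyspark.sql.functions import col, from_json, window, sum as sum_
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

schema = StructType([
    StructField("timestamp", TimestampType()),
    StructField("userid", StringType()),
    StructField("item", StringType()),
    StructField("quantity", IntegerType()),
])

orders = (spark.readStream.format("kafka")
          .option("kafka.bootstrap.servers", "localhost:9092")
          .option("subscribe", "orders")
          .load()
          .select(from_json(col("value").cast("string"), schema).alias("o"))
          .select("o.*"))

late_tolerant = (orders
                 .withWatermark("timestamp", "10 minutes")           # accept up to 10 min lateness
                 .groupBy(window("timestamp", "5 minutes"), "item")
                 .agg(sum_("quantity").alias("qty")))

query = (late_tolerant.writeStream
         .outputMode("update")
         .format("console")
         .option("checkpointLocation", "/tmp/checkpoints/orders")
         .start())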

Q50. What is the difference between Spark's Catalyst Optimizer and Tungsten Execution Engine?

Catalyst Optimizer: logical & query optimization layer
- Purpose: optimizes the logical and physical execution plans of Spark SQL queries.
- Layer: query optimization (part of the planning phase).
- Written in: Scala, using functional programming concepts and pattern matching.

Key features:
- Rule-based and cost-based optimization: applies transformations like predicate
pushdown, constant folding, projection pruning, etc.
- Logical Plan → Optimized Logical Plan → Physical Plan → Executable Plan
- Supports user-defined optimizations and extensibility via rules.
- Abstracts the SQL, DataFrame, and Dataset APIs into a unified optimization flow.

Tungsten Execution Engine: physical execution layer
- Purpose: provides low-level, memory-efficient execution of the query plan.
- Layer: execution engine (part of the runtime phase).
- Introduced in: Spark 1.4+ for improved performance.

Key features:
- Whole-stage code generation (WSCG): compiles parts of the query into Java
bytecode to avoid virtual function calls and for-loop overhead.
- Off-heap memory management: reduces garbage collection overhead.
- Cache-friendly and CPU-efficient algorithms.
- Improves performance by using binary processing and vectorization.

Feature         | Catalyst Optimizer                     | Tungsten Execution Engine
Function        | Query planning and optimization        | Efficient physical query execution
Phase           | Compile time (planning)                | Run time (execution)
Optimizes       | Logical and physical plans             | Memory usage, CPU efficiency
Techniques used | Rule-based and cost-based optimization | Whole-stage codegen, off-heap memory
Target          | SQL, DataFrame, Dataset                | JVM bytecode, CPU, memory

Bonus: Practical Coding Challenges

💻 Challenge 1: Write a PySpark function to remove duplicate rows from a
DataFrame based on specific columns.
💻 Challenge 2: Create a PySpark pipeline to read a CSV file, filter out rows
with null values, and write the result to a Parquet file.
💻 Challenge 3: Implement a window function to rank salespeople based on total
sales by region.
💻 Challenge 4: Write a PySpark SQL query to calculate the average salary by
department, including only employees with more than 3 years of experience.
💻 Challenge 5: Implement a PySpark function to split a large DataFrame into
smaller DataFrames based on a specific column value.


Quick Tips for Interviews
Tip 1: Be ready to explain real-world scenarios where you've used PySpark.
Tip 2: Know how to optimize Spark jobs using caching, partitioning, and broadcasting.
Tip 3: Understand the trade-offs between RDDs, DataFrames, and Datasets.