# Python program to drop columns that share the same
# name, locating them by column index, in PySpark
# Import the library SparkSession
from pyspark.sql import SparkSession
# Create a spark session using getOrCreate() function
spark_session = SparkSession.builder.getOrCreate()

# Create a data frame with duplicate column names
df = spark_session.createDataFrame(
    [
        ('Monday', 25, 27, 29, 30),
        ('Tuesday', 40, 38, 36, 34),
        ('Wednesday', 18, 20, 22, 17),
        ('Thursday', 25, 27, 29, 19),
    ],
    ['day', 'temperature', 'temperature', 'temperature', 'temperature'],
)

# Store all the column names in a list; this list is edited in place
# below and then handed back to toDF() to rename columns by position
df_cols = df.columns

# Get the index of every column whose name already appeared at an
# earlier position (the first occurrence of each name is kept).
# A set of seen names avoids re-scanning the list for every column.
seen_names = set()
duplicate_col_index = []
for idx, val in enumerate(df_cols):
    if val in seen_names:
        duplicate_col_index.append(idx)
    else:
        seen_names.add(val)

# Give each duplicate column its own unique placeholder name
# ('day_' prefix plus the column index) and remember exactly which
# names were generated. Tracking the generated names, instead of
# searching for 'day_' inside every column name afterwards, ensures
# that a genuine column that merely contains 'day_' is never dropped.
taken_names = set(df_cols)
cols_to_remove = []
for i in duplicate_col_index:
    placeholder = 'day_{}_{}'.format(df_cols[i], i)
    # Extend the placeholder until it clashes with no existing name
    while placeholder in taken_names:
        placeholder += '_'
    taken_names.add(placeholder)
    df_cols[i] = placeholder
    cols_to_remove.append(placeholder)

# Rename the duplicate columns in the data frame (positional rename)
df = df.toDF(*df_cols)

# Remove the renamed duplicate columns and display the result
df.drop(*cols_to_remove).show()