# importing necessary libraries
from pyspark.sql import SparkSession
# function to create SparkSession
def create_session():
    """Create (or reuse) a local SparkSession for the student-report app."""
    builder = SparkSession.builder
    builder = builder.master("local")
    builder = builder.appName("Student_report.com")
    return builder.getOrCreate()
# function to create Dataframe
def create_df(spark, data, schema):
    """Build a DataFrame from *data* and *schema* using the given session."""
    return spark.createDataFrame(data, schema)
# main function
if __name__ == "__main__":
    # calling function to create SparkSession
    spark = create_session()

    # Sample student records. The repeated tuples are deliberate so that
    # the distinct-row count differs from the total-row count below.
    input_data = [
        (1, "Shivansh", "Male", 20, 80),
        (2, "Arpita", "Female", 18, 66),
        (3, "Raj", "Male", 21, 90),
        (4, "Swati", "Female", 19, 91),
        (5, "Arpit", "Male", 20, 50),
        (6, "Swaroop", "Male", 23, 65),
        (6, "Swaroop", "Male", 23, 65),
        (6, "Swaroop", "Male", 23, 65),
        (7, "Reshabh", "Male", 19, 70),
        (7, "Reshabh", "Male", 19, 70),
        (8, "Dinesh", "Male", 20, 75),
        (9, "Rohit", "Male", 21, 85),
        (9, "Rohit", "Male", 21, 85),
        (10, "Sanjana", "Female", 22, 87),
    ]
    schm = ["Id", "Name", "Gender", "Age", "Percentage"]

    # calling function to create dataframe
    df = create_df(spark, input_data, schm)
    df.show()

    # number of distinct rows in the Dataframe
    row = df.distinct().count()
    # total number of rows in the Dataframe
    all_rows = df.count()
    # number of columns in the Dataframe
    col = len(df.columns)

    # report the computed dimensions / counts
    print(f'Dimension of the Dataframe is: {(row,col)}')
    print(f'Distinct Number of Rows are: {row}')
    print(f'Total Number of Rows are: {all_rows}')
    print(f'Number of Columns are: {col}')