0% found this document useful (0 votes)
3 views

Data Analysis Report

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PPTX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
3 views

Data Analysis Report

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PPTX, PDF, TXT or read online on Scribd
You are on page 1 of 74

Data Analysis Report

Generated from Jupyter Notebook


Code Cell
• import pandas as pd
• import numpy as np
• import matplotlib.pyplot as plt
• import seaborn as sns
• import pylab
• import warnings
• %matplotlib inline
• sns.set(style="darkgrid",font_scale=1.5)
• pd.set_option("display.max.columns",None)
Code Cell
• from google.colab import files

• files.upload()
Code Cell
# Load the car-price dataset from the GitHub raw-file host.
# The address is written as a single unbroken literal: the earlier text had
# an unrelated host spliced into it and was split mid-token, so it was not a
# fetchable URL.
df = pd.read_csv(
    "https://raw.githubusercontent.com/amankharwal/Website-data/master/CarPrice.csv"
)
Code Cell
• df.head()
Code Cell
• df.tail()
Code Cell
• df.shape
Code Cell
• df.info()
Code Cell
• df.describe()
Code Cell
• df.isnull().sum()
Code Cell
• print("Duplicate Values
=",df.duplicated().sum())
Code Cell
• df.select_dtypes(include=["int","float"]).head(
)
Markdown Cell
• # **DATA CLEANING**
Code Cell
• Company_Name =
df["CarName"].apply(lambda x: x.split(" ")[0])
• df.insert(2,"CompanyName",Company_Name)

• df.drop(columns=["CarName"],inplace=True)
Code Cell
• df.head()
Code Cell
def replace(a, b):
    """Replace value ``a`` with ``b`` in the ``CompanyName`` column of ``df``.

    Works on the module-level ``df`` and returns nothing.

    Args:
        a: Value to look for in the column.
        b: Value to put in its place.
    """
    # Assign the result back rather than calling an in-place method on the
    # column selection: under pandas Copy-on-Write that selection is a
    # temporary object, so an in-place replace on it never reaches ``df``.
    df["CompanyName"] = df["CompanyName"].replace(a, b)

• replace('maxda','mazda')
• replace('porcshce','porsche')
• replace('toyouta','toyota')
• replace('vokswagen','volkswagen')
Code Cell
• df["CompanyName"].unique()
Markdown Cell
• # **EDA**
Code Cell
plt.figure(figsize=(20, 6))

# Left panel: histogram of price with a KDE curve on top.
# sns.distplot is deprecated in seaborn; histplot is its replacement.
# stat="density" and cut=3 keep the density-scaled bars and the KDE extent
# that distplot used by default.
plt.subplot(1, 2, 1)
sns.histplot(
    df["price"],
    color="red",
    kde=True,
    stat="density",
    kde_kws={"cut": 3},
)
plt.title("Car Price Distribution", fontweight="black", pad=20, fontsize=20)

# Right panel: box plot of the same column.
plt.subplot(1, 2, 2)
sns.boxplot(y=df["price"], palette="Set2")
Code Cell
• df["price"].agg(["min","mean","median","max
","std","skew"]).to_frame().T
Code Cell
• plt.figure(figsize=(14,6))
• counts = df["CompanyName"].value_counts()
• sns.barplot(x=counts.index, y=counts.values)
• plt.xlabel("Car Company")
• plt.ylabel("Total No. of cars sold")
• plt.title("Total Cars produced by Companies",
pad=20, fontweight="black", fontsize=20)
• plt.xticks(rotation=90)
• plt.show()
Code Cell
• df[df["CompanyName"]=="renault"]
Code Cell
• df[df["CompanyName"]=="mercury"]
Code Cell
• df[df["CompanyName"]=="porshe"]
Code Cell
def clean_company_names(df, column, replacements=None):
    """Lower-case a company-name column and fix known misspellings.

    Args:
        df: DataFrame to clean. It is modified in place.
        column: Name of the text column holding the company names.
        replacements: Optional mapping of wrong name -> right name, applied
            after lower-casing. When omitted, the two built-in corrections
            ('porshe' and 'vw') are used.

    Returns:
        The same DataFrame object that was passed in.
    """
    if replacements is None:
        replacements = {
            'porshe': 'porsche',
            'vw': 'volkswagen',
        }
    # Lower-case first so the mapping keys only need a lower-case spelling.
    df[column] = df[column].str.lower().replace(replacements)
    return df

• df = clean_company_names(df,
Code Cell
• df["fueltype"].unique()
Code Cell
def categorical_visualization(cols):
    """Draw a count plot for one categorical column of the module-level ``df``.

    Args:
        cols: Name of the column to plot (a single column name, despite the
            plural parameter name).
    """
    plt.figure(figsize=(20, 10))
    # The figure is laid out as a 1x3 grid; the code here fills slot 1 only.
    plt.subplot(1, 3, 1)

    # Bars follow the order in which value_counts() lists the categories.
    category_order = df[cols].value_counts().index
    sns.countplot(x=cols, data=df, palette="Set2", order=category_order)

    plt.title(
        f"{cols} Distribution",
        pad=10,
        fontweight="black",
        fontsize=18,
    )
    plt.xticks(rotation=90)
Code Cell
• df["aspiration"].unique()
Code Cell
• categorical_visualization("aspiration")
Code Cell
• categorical_visualization("doornumber")
Code Cell
• categorical_visualization("carbody")
Code Cell
• categorical_visualization("drivewheel")
Code Cell
• categorical_visualization("enginelocation")
Code Cell
• df[df["enginelocation"]=="rear"]
Code Cell
• categorical_visualization("enginetype")
Code Cell
• df[df["enginetype"]=="rotor"]
Code Cell
• df[df["enginetype"]=="dohcv"]
Code Cell
• categorical_visualization("cylindernumber")
Code Cell
• df[df["cylindernumber"]=="three"]
Code Cell
• df[df["cylindernumber"]=="twelve"]
Code Cell
• categorical_visualization("fuelsystem")
Code Cell
• df[df["fuelsystem"]=="mfi"]
Code Cell
• df[df["fuelsystem"]=="spfi"]
Code Cell
• categorical_visualization("symboling")
Code Cell
• def scatter_plot(cols):
• x=1
• plt.figure(figsize=(15,6))
• for col in cols:
• plt.subplot(1,3,x)

sns.scatterplot(x=col,y="price",data=df,color="
blue")
• plt.title(f"{col} vs
Code Cell
• scatter_plot(["carlength","carwidth","carheigh
t"])
Code Cell
• scatter_plot(["enginesize","boreratio","stroke"
])
Code Cell
• scatter_plot(["compressionratio","horsepower
","peakrpm"])
Code Cell
• def scatter_plot(cols):
• q_low = df["price"].quantile(0.01)
• q_hi = df["price"].quantile(0.99)
• df_filtered = df[(df["price"] > q_low) &
(df["price"] < q_hi)]
• x=1
• plt.figure(figsize=(15,6))
• for col in cols:
• plt.subplot(1,2,x)
Code Cell
• scatter_plot(["wheelbase","curbweight"])
Code Cell
• scatter_plot(["citympg","highwaympg"])
Code Cell
• f = round(df.groupby(["CompanyName"])
["price"].agg(["mean"]),2).T
• f
Code Cell
• df =
df.merge(f.T,how="left",on="CompanyName")
Code Cell
• bins = [0,10000,20000,40000]
• cars_bin=['Budget','Medium','Highend']
• df['CarsRange'] =
pd.cut(df['mean'],bins,right=False,labels=cars_
bin)
• df.head()
Code Cell
• new_df =
df[['fueltype','aspiration','doornumber','carbo
dy','drivewheel','enginetype','cylindernumber'
,'fuelsystem'
• ,'wheelbase','carlength','carwidth','cur
bweight','enginesize','boreratio','horsepower','
citympg','highwaympg',
• 'price','CarsRange']]
Code Cell
• new_df.head()
Code Cell
• new_df =
pd.get_dummies(columns=["fueltype","aspirat
ion","doornumber","carbody","drivewheel","e
nginetype",

"cylindernumber","fuelsystem","CarsRange"],
data=new_df)
Code Cell
• new_df.head()
Code Cell
• scaler = StandardScaler()
Code Cell
• num_cols =
['wheelbase','carlength','carwidth','curbweight
','enginesize','boreratio','horsepower',
• 'citympg','highwaympg']

• new_df[num_cols] =
scaler.fit_transform(new_df[num_cols])
Code Cell
• new_df.head()
Code Cell
• x = new_df.drop(columns=["price"])
• y = new_df["price"]
Code Cell
• x.shape
Code Cell
• y.shape
Code Cell
• x_train,x_test,y_train,y_test=train_test_split(x
,y,test_size=0.2,random_state=42)
Code Cell
• print("x_train - > ",x_train.shape)
• print("x_test - > ",x_test.shape)
• print("y_train - > ",y_train.shape)
• print("y_test - > ",y_test.shape)
Markdown Cell
• # **MODEL BUILDING**
Code Cell
• training_score = []
• testing_score = []
Code Cell
def model_prediction(model):
    """Fit ``model`` and record its train and test scores as percentages.

    Uses the module-level ``x_train``/``y_train``/``x_test``/``y_test`` and
    appends one value each to ``training_score`` and ``testing_score``.
    Returns nothing.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.
    """
    model.fit(x_train, y_train)

    # NOTE(review): r2_score is not defined in the visible source; presumably
    # sklearn.metrics.r2_score. Confirm the import exists.
    train_pct = r2_score(y_train, model.predict(x_train)) * 100
    test_pct = r2_score(y_test, model.predict(x_test)) * 100

    # Both scores are computed before either list is touched, so a failure
    # above leaves the two lists the same length.
    training_score.append(train_pct)
    testing_score.append(test_pct)
Code Cell
• model_prediction(LinearRegression())
Code Cell
• model_prediction(DecisionTreeRegressor())
Code Cell
• model_prediction(RandomForestRegressor())
Code Cell
• models = ["Linear Regression","Decision
Tree","Random Forest"]

• df = pd.DataFrame({"Algorithms":models,
• "Training Score":training_score,
• "Testing Score":testing_score})
• df
Code Cell
• df.plot(x="Algorithms",y=["Training
Score","Testing Score"],
figsize=(16,6),kind="bar",
• title="Performance Visualization of
Different Models",colormap="Set1")
• plt.show()

You might also like