Datacleaning - Ipynb - Colab
Datacleaning - Ipynb - Colab
ipynb - Colab
import pandas as pd
# Load the dataset straight from the Google Sheets CSV export endpoint.
# The host must be docs.google.com itself: a URL routed through a third-party
# proxy prefix does not return the sheet's CSV.
df = pd.read_csv(
    "https://docs.google.com/spreadsheets/d/1okbuAhRRXId5a5LPC85wSE2MBgKpQ-kY4nnABE3gCFc/export?format=csv&usp=sharing"
)
df
... ... ... ... ... ... ... ... ... ...
import plotly.express as px
# Scatter plot of the 'engine' column (x axis) against the 'km' column (y axis).
engine_axis = df['engine']
km_axis = df['km']
px.scatter(x=engine_axis, y=km_axis)
120k
100k
80k
60k
y
40k
20k
# Drop every row whose 'engine' value exceeds 1500 (presumably the outliers
# visible in the preceding scatter plot -- confirm the threshold with the author).
# Rows with a missing 'engine' value compare False here and are therefore kept.
oversized_rows = df[df['engine'] > 1500].index
df = df.drop(index=oversized_rows)

# Store a fresh 0..n-1 counter in an 'index' column. The DataFrame's own row
# labels are not reset by this step; only the new column is sequential.
df['index'] = list(range(len(df)))

# Re-plot 'engine' against 'km' on the filtered data.
px.scatter(x=df['engine'], y=df['km'])
https://colab.research.google.com/drive/1YpcH_4uz3sVHGwE9Em0GZ4p8Tjq5ZTxR#scrollTo=e9RzZJomeQtH&printMode=true 1/4
10/22/24, 7:17 PM datacleaning.ipynb - Colab
120k
100k
80k
60k
y
40k
20k
# Scatter plot of the 'price' column (x axis) against the 'km' column (y axis).
price_axis = df['price']
km_axis = df['km']
px.scatter(x=price_axis, y=km_axis)
120k
100k
80k
60k
y
40k
20k
# Count the missing values in each column.
df.isna().sum()
https://colab.research.google.com/drive/1YpcH_4uz3sVHGwE9Em0GZ4p8Tjq5ZTxR#scrollTo=e9RzZJomeQtH&printMode=true 2/4
10/22/24, 7:17 PM datacleaning.ipynb - Colab
brand 0
model 0
transmission 0
age 0
fuel 0
engine 4
km 0
owner 0
price 0
index 0
dtype: int64
# Show the rows whose 'engine' value is missing.
engine_missing = df['engine'].isna()
df.loc[engine_missing]
# List the distinct 'engine' values (a missing value shows up as NaN).
df['engine'].unique()
# Most frequent 'engine' value; mode() may return several ties, take the first.
df['engine'].mode().iloc[0]
1197.0
# Replace missing 'engine' values with the column's most frequent value.
most_common_engine = df['engine'].mode()[0]
df['engine'] = df['engine'].fillna(most_common_engine)
# Re-check: rows still missing 'engine', then the per-column missing counts.
df.loc[df['engine'].isna()]
df.isna().sum()
brand 0
model 0
transmission 0
age 0
fuel 0
engine 0
km 0
owner 0
price 0
index 0
dtype: int64
# Count the rows that exactly repeat an earlier row.
duplicate_flags = df.duplicated()
duplicate_flags.sum()
https://colab.research.google.com/drive/1YpcH_4uz3sVHGwE9Em0GZ4p8Tjq5ZTxR#scrollTo=e9RzZJomeQtH&printMode=true 3/4
10/22/24, 7:17 PM datacleaning.ipynb - Colab
# Share of distinct 'model' values relative to the number of rows.
# dropna=False keeps NaN counted as a value, as unique() does.
df['model'].nunique(dropna=False) / df.shape[0]
0.2981366459627329
# Number of distinct 'model' values (NaN counted, as with unique()).
df['model'].nunique(dropna=False)
48
# Number of rows currently in the DataFrame.
df.shape[0]
161
# Remove the 'model' column, then display the resulting DataFrame.
df = df.drop('model', axis=1)
df
... ... ... ... ... ... ... ... ... ...
https://colab.research.google.com/drive/1YpcH_4uz3sVHGwE9Em0GZ4p8Tjq5ZTxR#scrollTo=e9RzZJomeQtH&printMode=true 4/4