vertopal.com_Week_4
vertopal.com_Week_4
#NAME: V.Vyaswanth
#Roll No : 23071A66K4
#23071A66K2
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Auto MPG data, preview the first rows, and draw a horizontal
# box plot of the mpg column to eyeball potential outliers.
df = pd.read_csv(filepath_or_buffer="auto-mpg.csv")
df.head(n=5)
sns.boxplot(data=df['mpg'], orient='h')
<Axes: xlabel='mpg'>
1. Removing outliers / missing values
# Select the rows whose horsepower entry is the literal string '?'.
baddata = df.loc[df['horsepower'].eq('?')]
baddata
# Dump the raw horsepower values to see how they are stored.
arr = df['horsepower'].values
print(arr)
['130' '165' '150' '150' '140' '198' '220' '215' '225' '190' '170'
'160'
'150' '225' '95' '95' '97' '85' '88' '46' '87' '90' '95' '113' '90'
'215'
'200' '210' '193' '88' '90' '95' '?' '100' '105' '100' '88' '100'
'165'
'175' '153' '150' '180' '170' '175' '110' '72' '100' '88' '86' '90'
'70'
'76' '65' '69' '60' '70' '95' '80' '54' '90' '86' '165' '175' '150'
'153'
'150' '208' '155' '160' '190' '97' '150' '130' '140' '150' '112' '76'
'87' '69' '86' '92' '97' '80' '88' '175' '150' '145' '137' '150'
'198'
'150' '158' '150' '215' '225' '175' '105' '100' '100' '88' '95' '46'
'150' '167' '170' '180' '100' '88' '72' '94' '90' '85' '107' '90'
'145'
'230' '49' '75' '91' '112' '150' '110' '122' '180' '95' '?' '100'
'100'
'67' '80' '65' '75' '100' '110' '105' '140' '150' '150' '140' '150'
'83'
'67' '78' '52' '61' '75' '75' '75' '97' '93' '67' '95' '105' '72'
'72'
'170' '145' '150' '148' '110' '105' '110' '95' '110' '110' '129' '75'
'83' '100' '78' '96' '71' '97' '97' '70' '90' '95' '88' '98' '115'
'53'
'86' '81' '92' '79' '83' '140' '150' '120' '152' '100' '105' '81'
'90'
'52' '60' '70' '53' '100' '78' '110' '95' '71' '70' '75' '72' '102'
'150'
'88' '108' '120' '180' '145' '130' '150' '68' '80' '58' '96' '70'
'145'
'110' '145' '130' '110' '105' '100' '98' '180' '170' '190' '149' '78'
'88' '75' '89' '63' '83' '67' '78' '97' '110' '110' '48' '66' '52'
'70'
'60' '110' '140' '139' '105' '95' '85' '88' '100' '90' '105' '85'
'110'
'120' '145' '165' '139' '140' '68' '95' '97' '75' '95' '105' '85'
'97'
'103' '125' '115' '133' '71' '68' '115' '85' '88' '90' '110' '130'
'129'
'138' '135' '155' '142' '125' '150' '71' '65' '80' '80' '77' '125'
'71'
'90' '70' '70' '65' '69' '90' '115' '115' '90' '76' '60' '70' '65'
'90'
'88' '90' '90' '78' '90' '75' '92' '75' '65' '105' '65' '48' '48'
'67'
'67' '67' '?' '67' '62' '132' '100' '88' '?' '72' '84' '84' '92'
'110'
'84' '58' '64' '60' '67' '65' '62' '68' '63' '65' '65' '74' '?' '75'
'75'
'100' '74' '80' '76' '116' '120' '110' '105' '88' '85' '88' '88' '88'
'85' '84' '90' '92' '?' '74' '68' '68' '63' '70' '88' '75' '70' '67'
'67'
'67' '110' '85' '92' '112' '96' '84' '90' '86' '52' '84' '79' '82']
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()
Unnamed: 0 0
mpg 0
cylinders 0
displacement 0
horsepower 0
weight 0
acceleration 0
model year 0
origin 0
car name 0
dtype: int64
# Turn every '?' cell into NaN so pandas recognises it as missing.
df.replace('?', np.nan, inplace=True)
# The horsepower values are stored as strings (the '?' entries prevented
# numeric parsing), so convert the column to a numeric dtype; anything
# that still cannot be parsed becomes NaN instead of raising.
df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')
# Per-column count of missing values after the replacement.
df.isnull().sum()
Unnamed: 0 0
mpg 0
cylinders 0
displacement 0
horsepower 6
weight 0
acceleration 0
model year 0
origin 0
car name 0
dtype: int64
# Remove mpg outliers using 1.5 * IQR fences around the quartiles.
q1 = df['mpg'].quantile(0.25)
q3 = df['mpg'].quantile(0.75)
iqr = q3 - q1
ll = q1 - 1.5 * iqr  # lower fence
ul = q3 + 1.5 * iqr  # upper fence

# np.where returns *positions* of the rows on or beyond each fence.
upper = np.where(df['mpg'] >= ul)
lower = np.where(df['mpg'] <= ll)
print("upper outliers", upper)
print("lower outliers", lower)

# DataFrame.drop expects index *labels*, not positions. Translate both
# position arrays to labels up front, before any row is removed, so the
# drops stay correct even when the index is not a pristine 0..n-1 range.
upper_labels = df.index[upper[0]]
lower_labels = df.index[lower[0]]

df.drop(upper_labels, inplace=True)
print(df.shape)
df.drop(lower_labels, inplace=True)
print(df.shape)
(397, 10)
(397, 10)
# Horizontal box plot of mpg, drawn again after the outlier rows were dropped.
sns.boxplot(data=df['mpg'], orient='h')
<Axes: xlabel='mpg'>
# Keep only the rows that have no missing value in any column.
newdf = df.dropna(axis=0, how='any')
newdf.shape
(391, 10)
(398, 10)
# Horizontal box plot of the acceleration column.
sns.boxplot(data=df['acceleration'], orient='h')
<Axes: xlabel='acceleration'>
# Scatter plot of mpg against acceleration.
# NOTE(review): df2 is created in a cell that is not visible here.
df2.plot.scatter(x='acceleration', y='mpg')
# Raw acceleration values as an array.
arr = df2['acceleration'].values
array([ True, True, True, True, True, True, False, True, True,
True, True, True, False, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, False,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, False, True,
True, True, True, True, True, True, True, True, True,
True, True, False, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, False, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, False, True, True, False, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True])
# Mean acceleration over the rows selected by the boolean mask true_index.
# NOTE(review): true_index is built in a cell that is not visible here.
mid1 = np.mean(df2.loc[true_index, 'acceleration'])
mid1
np.float64(15.468240722430952)
# Rows NOT selected by true_index; ~ inverts a boolean mask element-wise.
false_index = ~true_index
# Overwrite those rows with the mean computed from the remaining rows.
# Assign through .loc rather than through Series.values: .values is not
# guaranteed to be a writable view of the frame (with pandas Copy-on-Write
# it is read-only), so writing into it can fail or leave df2 untouched.
df2.loc[false_index, 'acceleration'] = mid1
# Check that no acceleration value is left above the upper limit UL.
# NOTE(review): UL is defined in a cell that is not visible here.
print(np.where(df2['acceleration'] > UL))
(array([], dtype=int64),)
# Reload the raw dataset and compute the 95th / 5th percentiles of mpg,
# which serve as the upper and lower capping thresholds.
df3 = pd.read_csv(filepath_or_buffer='auto-mpg.csv')
df3.head(n=5)
max_threshold = df3.loc[:, 'mpg'].quantile(q=0.95)
min_threshold = df3.loc[:, 'mpg'].quantile(q=0.05)
print(max_threshold, min_threshold)
# Show the row labelled 322 for reference.
print(df3.loc[[322], :])
37.029999999999994 13.0
Unnamed: 0 mpg cylinders displacement horsepower weight \
322 322 46.6 4 86.0 65 2110
# Cap mpg to the [min_threshold, max_threshold] range: values above the
# upper threshold become the upper threshold, values below the lower
# threshold become the lower threshold, everything else is unchanged.
df3['mpg'] = df3['mpg'].clip(lower=min_threshold, upper=max_threshold)
# Horizontal box plot of the capped mpg values.
sns.boxplot(data=df3['mpg'], orient='h')
<Axes: xlabel='mpg'>