Data Cleaning and Manipulation in Python
Data Cleaning and Manipulation in Python
Data Cleaning and Manipulation in Python
https://www.kaggle.com/code/pythonafroz/python-for-machine-learning-part-01
Follow for more AI content: https://www.linkedin.com/in/syed-afroz-70939914/
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
# Set Display
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
pd.set_option('display.precision', 2)
import numpy
print('numpy:{}'.format(numpy.__version__))
numpy:1.26.4
df = pd.read_csv("titanic.csv")
display(df.shape)
df.head()
(891, 12)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.25 NaN S
STON/O2.
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 7.92 NaN S
3101282
# Preview the first three rows with white text on a blue background.
# 'darkblack' is not a CSS colour name, so the border colour is spelled
# as plain 'black' to give the declaration a valid value.
df.head(3).style.set_properties(
    **{
        'background-color': 'blue',
        'color': 'white',
        'border-color': 'black',
    }
)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.000000 1 0 A/5 21171 7.250000 nan S
STON/O2.
2 3 1 3 Heikkinen, Miss. Laina female 26.000000 0 0 7.925000 nan S
3101282
# Work on an independent copy so `df` keeps its numeric 0/1 labels.
# `deep` is a boolean flag; pass it by keyword instead of a truthy string.
df1 = df.copy(deep=True)
# Assign the relabelled column back instead of calling
# replace(inplace=True) on a column selection (chained in-place update).
df1["Survived"] = df1["Survived"].replace({0: "Died", 1: "Saved"})
df1.head(3)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 Died 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.25 NaN S
STON/O2.
2 3 Saved 3 Heikkinen, Miss. Laina female 26.0 0 0 7.92 NaN S
3101282
# Drop Columns
# `deep` is a boolean flag; pass it by keyword instead of a truthy string.
df1 = df.copy(deep=True)
# `columns=` names the axis directly, so no separate axis argument is needed.
df1 = df1.drop(columns=['PassengerId', 'Ticket'])
df1.head(3)
Survived Pclass Name Sex Age SibSp Parch Fare Cabin Embarked
1 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 71.28 C85 C
# Drop Rows
df = df.drop(labels=[1,3,5,7],axis=0)
df.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.25 NaN S
STON/O2.
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 7.92 NaN S
3101282
print('Method 1:')
df.isnull().sum()
Method 1:
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 176
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 685
Embarked 2
dtype: int64
Age 176
Cabin 685
Embarked 2
dtype: int64
print('Method 12:')
import missingno as msno
msno.matrix(df)
plt.show()
Method 12:
Join Our Telegram Channel to Learn AI & ML: https://t.me/AIMLDeepThaught
df[df['Embarked'].isnull()]
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
829 830 1 1 Stone, Mrs. George Nelson (Martha Evelyn) female 62.0 0 0 113572 80.0 B28 NaN
sample_incomplete_rows =df[df.isnull().any(axis=1)].head()
sample_incomplete_rows
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.25 NaN S
STON/O2.
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 7.92 NaN S
3101282
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.07 NaN C
df.describe()
# Describe
df[df['Survived']==0].describe().T.style.background_gradient(subset=['mean','std','50%','count'], cmap='RdPu')
count mean std min 25% 50% 75% max
df.describe(percentiles=[0.05,0.25,0.35,0.5,0.75,0.85,0.95,0.995,0.999])
# Agg
df[['Age','Fare','Pclass']].agg(['sum','max','mean','std','skew','kurt'])
# value_counts
df['Embarked'].value_counts().to_frame()
count
Embarked
S 642
C 167
Q 76
df['Embarked'].value_counts().tolist()
#Count
df[['Age','Embarked','Sex']].count()
Age 711
Embarked 885
Sex 887
dtype: int64
df['Embarked'][df['Sex']=='female'].value_counts(normalize=True)*100
Embarked
S 65.16
C 23.23
Q 11.61
Name: proportion, dtype: float64
df['Embarked'].value_counts()/len(df['Embarked'])
Embarked
S 0.72
C 0.19
Q 0.09
Name: count, dtype: float64
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
733 734 0 2 Berriman, Mr. William John male 23.0 0 0 28425 13.00 NaN S
392 393 0 3 Gustafsson, Mr. Johan Birger male 28.0 2 0 3101277 7.92 NaN S
614 615 0 3 Brocklebank, Mr. William Alfred male 35.0 0 0 364512 8.05 NaN S
df1 = df.copy()
columns = ['Age']
df1.describe()
corr = df.select_dtypes('number').corr()
display(corr)
plt.xlabel('Features')
plt.ylabel('Features')
plt.title('Correlation Heatmap')
plt.show()
# Draw one correlation heatmap per group found in the first index level.
# NOTE(review): this cell expects `corr` to carry a MultiIndex (presumably
# built with a groupby on Embarked) and `axes` to be a grid of Matplotlib
# axes created earlier; neither is defined in the visible cells — confirm.
embarked_groups = corr.index.levels[0]
n_groups = len(embarked_groups)
for position, embarked_group in enumerate(embarked_groups):
    ax = axes.flat[position]
    sns.heatmap(corr.xs(embarked_group), annot=True, cmap='viridis', ax=ax)
    ax.set_title(f"Correlation Heatmap for Embarked: {embarked_group}")
# Lay out and render once, after every panel has been drawn.
plt.tight_layout()
plt.show()
Fare Age
Embarked
corr = df.select_dtypes('number').corr()
mask = np.triu(np.ones_like(corr,dtype = bool))
plt.figure(dpi=100)
plt.title('Correlation Analysis')
sns.heatmap(corr,mask=mask,annot=True,lw=0,linecolor='white',cmap='viridis',fmt = "0.2f")
plt.xticks(rotation=90)
plt.yticks(rotation = 0)
plt.show()
df1 = df1[df1['Cabin'].notna()]
df1.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
# Dropna Method
df1 = df.copy()
df1 = df1.dropna()
# Fillna Method
df1 = df.copy()
# Forward-fill: each missing value takes the last valid value above it.
# ffill() is the supported spelling; fillna(method="ffill") is deprecated.
df1 = df1.ffill()
df1 = df.copy()
df1["Age"] = df1["Age"].fillna(df1["Age"].mean())
df1 = df.copy()
# Fill missing embarkation ports with the scalar label 'Q'.
# A comparison such as df1['Embarked'] == 'Q' yields a boolean Series, and
# using that as the fill value would write False into the gaps.
# NOTE(review): 'Q' is taken from the original expression; confirm it is the
# intended replacement value.
df1['Embarked'] = df1['Embarked'].fillna('Q')
df1.head(3)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.25 NaN S
#Fill Method :
df1 = df.copy()
# Every fill is assigned back to its column; calling fillna(inplace=True) on
# a column selection is a chained in-place update.
df1['Age'] = df1['Age'].fillna(0)
# After the fill above Age has no missing values left, so the next two
# statements leave it unchanged; they are kept as alternative spellings.
df1['Age'] = df1['Age'].fillna('None')
# bfill() replaces the deprecated fillna(method="backfill").
df1['Age'] = df1['Age'].bfill()
df1['Embarked'] = df1['Embarked'].fillna(value="A")
df1['Pclass'] = df1['Pclass'].fillna(value=0)
df1.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.25 NaN S
STON/O2.
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 7.92 NaN S
3101282
df1 = df.copy()
df1 = df1.drop('Cabin',axis =1)
sample_incomplete_rows = df1[df1.isnull().any(axis=1)]
display(sample_incomplete_rows.shape)
sample_incomplete_rows.head()
(178, 11)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
(3, 2)
Name Embarked
(5, 3)
(47, 12)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
(398, 11)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
df1 = df.copy()
cabin_no_na = df1[df1["Cabin"].notna()]
display(cabin_no_na.shape)
cabin_no_na.head()
(202, 12)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
display(titanic_Pclass.shape)
titanic_Pclass.head()
(13, 12)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PC
195 196 1 1 Lurette, Miss. Elise female 58.0 0 0 146.52 B80 C
17569
275 276 1 1 Andrews, Miss. Kornelia Theodosia female 63.0 1 0 13502 77.96 D7 S
# np.where
df1 = df.copy()
df1['Cabin_null'] = np.where(df1['Cabin'].isnull(),0,1)
df1.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Cabin_null
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.25 NaN S 0
STON/O2.
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 7.92 NaN S 0
3101282
df1 = df.copy()
df1["Bucket"] = np.where(df1["Fare"] < 250, "Low", "High")
df1.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Bucket
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.25 NaN S Low
STON/O2.
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 7.92 NaN S Low
3101282
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.05 NaN S Low
df1 = df.copy()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
df1 = df.copy()
df1.sort_values(by = 'Age' , ascending = False)[['Name','Ticket','Survived','Pclass', 'Age' ]].head()
Numerical_data = df1.select_dtypes(include=['number'])
Numerical_data.head()
PassengerId Survived Pclass Age SibSp Parch Fare
0 1 0 3 22.0 1 0 7.25
2 3 1 3 26.0 0 0 7.92
4 5 0 3 35.0 0 0 8.05
6 7 0 1 54.0 0 0 51.86
8 9 1 3 27.0 0 2 11.13
Categorical_data = df1.select_dtypes(include=['object'])
Categorical_data.head()
cabin_notna = df1[df1['Cabin'].notna()]
cabin_notna.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
#groupby
df1 = df.copy()
titanic_room = df1.groupby(['Embarked'])['Age'].mean().reset_index()
titanic_room.head()
Embarked Age
0 C 30.76
1 Q 28.09
2 S 29.49
Fare Sex
Embarked
C 59.89 167
Q 13.34 76
S 27.05 642
temp = df1.groupby("Sex")['Age'].min().to_frame().reset_index()
temp
Sex Age
0 female 0.75
1 male 0.42
size mean
Embarked Pclass
C 1 84 105.12
2 17 25.36
3 66 11.21
Q 1 2 90.00
2 3 12.35
3 71 11.22
S 1 126 70.50
2 164 20.33
3 352 14.63
df1.groupby(['Survived',"Sex"])['Fare'].first().to_frame()
Fare
Survived Sex
0 female 7.85
male 7.25
1 female 7.92
male 13.00
Tit_groupby = df1.groupby("Pclass")["Pclass"].count().to_frame()
Tit_groupby
Pclass
Pclass
1 214
2 184
3 489
df1.groupby('Survived')['Sex'].value_counts().to_frame()
count
Survived Sex
0 male 466
female 81
1 female 231
male 109
df1.groupby(['Survived',"Sex"])['Pclass'].count()/df1.groupby(["Sex"])['Pclass'].count()*100
Survived Sex
0 female 25.96
male 81.04
1 female 74.04
male 18.96
Name: Pclass, dtype: float64
(df1.groupby(['Embarked','Pclass']).count()['Fare']/df1.groupby(['Embarked']).count()['Fare'])*100
Embarked Pclass
C 1 50.30
2 10.18
3 39.52
Q 1 2.63
2 3.95
3 93.42
S 1 19.63
2 25.55
3 54.83
Name: Fare, dtype: float64
df1.groupby("Sex")[["Age","Pclass"]].mean()
Age Pclass
Sex
df1.groupby(["Sex", "Pclass"])["Fare"].mean()
Sex Pclass
female 1 107.08
2 21.97
3 16.12
male 1 67.23
2 19.74
3 12.65
Name: Fare, dtype: float64
Fare Age
Sex Embarked
Q 454.86 291.50
S 7811.31 5130.50
Q 558.94 495.00
S 9553.92 11145.25
Fare
size mean
Embarked Pclass
C 1 84 105.12
2 17 25.36
3 66 11.21
Q 1 2 90.00
2 3 12.35
3 71 11.22
S 1 126 70.50
2 164 20.33
3 352 14.63
temp = df1.groupby("Sex")['Age'].min().reset_index()
temp
Sex Age
0 female 0.75
1 male 0.42
titanic_room= df1.groupby(['Embarked','Sex'])[['Age','Fare']].mean().reset_index()
titanic_room
df1.groupby(['Survived',"Sex","Embarked"])['Pclass'].count().to_frame()
Pclass
0 female C 9
Q 9
S 63
male C 66
Q 37
S 363
1 female C 63
Q 27
S 139
male C 29
Q 3
S 77
Fare Sex
Embarked
C 59.89 167
Q 13.34 76
S 27.05 642
(df1.groupby(['Survived',"Sex"])['Fare'].count()/df1.groupby(['Survived'])['Fare'].count()).to_frame()*100
Fare
Survived Sex
0 female 14.81
male 85.19
1 female 67.94
male 32.06
df1.groupby('Sex')['Embarked'].count().nlargest(2).reset_index()
Sex Embarked
0 male 575
1 female 310
Fare
Pclass
1 84.36
2 20.66
3 13.67
#pivot_table
Age
Pclass
1 38.25
2 29.88
3 25.21
x=pd.DataFrame(pd.pivot_table(df1,index=['Sex','Embarked'],aggfunc='count')['Fare'])
x
Fare
Sex Embarked
female C 72
Q 36
S 202
male C 95
Q 40
S 440
Age
Pclass
1 37.0
2 29.0
3 24.0
pd.crosstab(df1['Pclass'],df1['Survived'])
Survived 0 1
Pclass
1 80 134
2 97 87
3 370 119
#Cross Tab
pd.crosstab(df1['Sex'],df1['Embarked'])
Embarked C Q S
Sex
female 72 36 202
male 95 40 440
#Cross Tab
plot_criteria = ['Sex', 'Pclass']
cm = sns.light_palette("red", as_cmap=True)
# Column-normalised crosstab expressed as percentages with two decimals.
share = pd.crosstab(
    df1[plot_criteria[0]],
    df1[plot_criteria[1]],
    normalize='columns',
) * 100
# NOTE(review): the source line was cut off after `.style.background_grad`;
# finishing it as background_gradient(cmap=cm) is an assumption based on the
# otherwise unused `cm` colormap — confirm against the notebook.
round(share, 2).style.background_gradient(cmap=cm)
Pclass 1 2 3
Sex
Embarked C Q S
Sex
Pclass 1 2 3
Embarked
Pclass
1 80 134 214
2 97 87 184
sex
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Titanic
map
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.25 NaN S 1
STON/O2.
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 7.92 NaN S 0
3101282
sex
Fare
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Titanic
Range
map
STON/O2.
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 7.92 NaN S 0 low
3101282
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.05 NaN S 1 low
sex
Fare
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Titanic age_bins
Range
map
Braund, Mr.
0 1 0 3 Owen male 22.0 1 0 A/5 21171 7.25 NaN S 1 low teen
Harris
Heikkinen, STON/O2.
2 3 1 3 female 26.0 0 0 7.92 NaN S 0 low teen
Miss. Laina 3101282
Allen, Mr.
4 5 0 3 William male 35.0 0 0 373450 8.05 NaN S 1 low adult
Henry
McCarthy,
6 7 0 1 Mr. Timothy male 54.0 0 0 17463 51.86 E46 S 1 low old
J
Johnson,
Mrs. Oscar
W
8 9 1 3 female 27.0 0 2 347742 11.13 NaN S 0 low teen
(Elisabeth
Vilhelmina
Berg)
Braund,
0 1 0 3 Mr. Owen male 22.0 1 0 A/5 21171 7.25 NaN S 1 low teen False
Harris
Heikkinen,
STON/O2.
2 3 1 3 Miss. female 26.0 0 0 7.92 NaN S 0 low teen True
3101282
Laina
Allen, Mr.
4 5 0 3 William male 35.0 0 0 373450 8.05 NaN S 1 low adult True
Henry
McCarthy,
6 7 0 1 Mr. male 54.0 0 0 17463 51.86 E46 S 1 low old False
Timothy J
Johnson,
Mrs.
Oscar W
8 9 1 3 female 27.0 0 2 347742 11.13 NaN S 0 low teen True
(Elisabeth
Vilhelmina
Berg)
df1.iloc[2:4, 3:6]
df1.iloc[0:4, 3] = "Anonymous"
df1.head()
sex
Fare
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Titanic age_bins is_train
Range
map
0 1 0 3 Anonymous male 22.0 1 0 A/5 21171 7.25 NaN S 1 low teen False
STON/O2.
2 3 1 3 Anonymous female 26.0 0 0 7.92 NaN S 0 low teen True
3101282
Johnson,
Mrs. Oscar
W
8 9 1 3 female 27.0 0 2 347742 11.13 NaN S 0 low teen True
(Elisabeth
Vilhelmina
Berg)
df1.iloc[[3,6,9],[2,3]]
Pclass Name
6 1 Anonymous
#Replace
# Assign the relabelled column back instead of calling
# replace(inplace=True) on a column selection (chained in-place update).
df1["Survived"] = df1["Survived"].replace({0: "Died", 1: "Saved"})
df1.head()
sex
Fare
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Titanic age_bins is_train
Range
map
0 1 Died 3 Anonymous male 22.0 1 0 A/5 21171 7.25 NaN S 1 low teen False
STON/O2.
2 3 Saved 3 Anonymous female 26.0 0 0 7.92 NaN S 0 low teen True
3101282
4 5 Died 3 Anonymous male 35.0 0 0 373450 8.05 NaN S 1 low adult True
6 7 Died 1 Anonymous male 54.0 0 0 17463 51.86 E46 S 1 low old False
Johnson,
Mrs. Oscar
W
8 9 Saved 3 female 27.0 0 2 347742 11.13 NaN S 0 low teen True
(Elisabeth
Vilhelmina
Berg)
#Rename
df1.rename(columns={"Name" : 'Person Name'},inplace=True)
df1.head()
sex
Person Fare
PassengerId Survived Pclass Sex Age SibSp Parch Ticket Fare Cabin Embarked Titanic age_bins is_train
Name Range
map
0 1 Died 3 Anonymous male 22.0 1 0 A/5 21171 7.25 NaN S 1 low teen False
STON/O2.
2 3 Saved 3 Anonymous female 26.0 0 0 7.92 NaN S 0 low teen True
3101282
4 5 Died 3 Anonymous male 35.0 0 0 373450 8.05 NaN S 1 low adult True
6 7 Died 1 Anonymous male 54.0 0 0 17463 51.86 E46 S 1 low old False
Johnson,
Mrs. Oscar
W
8 9 Saved 3 female 27.0 0 2 347742 11.13 NaN S 0 low teen True
(Elisabeth
Vilhelmina
Berg)
sex
Person Fare
PassengerId Survived Pclass Sex Age SibSp Parch Ticket Fare Cabin Embarked Titanic age_bins is_train
Name Range
map
Barber,
Miss.
290 291 Saved 1 female 26.0 0 0 19877 78.85 NaN Southampton 0 low teen False
Ellen
"Nellie"
Graham,
Miss.
887 888 Saved 1 female 19.0 0 0 112053 30.00 B42 Southampton 0 low teen True
Margaret
Edith
Richard,
SC/PARIS
135 136 Died 2 Mr. male 23.0 0 0 15.05 NaN Cherbourg 1 low teen False
2133
Emile
Turpin,
Mrs.
William
41 42 Died 2 John female 27.0 1 0 11668 21.00 NaN Southampton 0 low teen True
Robert
(Dorothy
Ann ...
Calic,
500 501 Died 3 Mr. male 17.0 0 0 315086 8.66 NaN Southampton 1 low child False
Petar
# Relabel the three class codes in a single pass. Writing through
# df1['Pclass'][mask] = ... is chained indexing, which may assign to a
# temporary object instead of df1. Label strings are kept exactly as they
# were written.
df1['Pclass'] = df1['Pclass'].replace({1: 'Rich', 2: 'Middel Class', 3: 'Poor'})
df1.head()
sex
Person Fare
PassengerId Survived Pclass Sex Age SibSp Parch Ticket Fare Cabin Embarked Titanic age_bins is_train
Name Range
map
0 1 Died Poor Anonymous male 22.0 1 0 A/5 21171 7.25 NaN Southampton 1 low teen False
STON/O2.
2 3 Saved Poor Anonymous female 26.0 0 0 7.92 NaN Southampton 0 low teen True
3101282
4 5 Died Poor Anonymous male 35.0 0 0 373450 8.05 NaN Southampton 1 low adult True
6 7 Died Rich Anonymous male 54.0 0 0 17463 51.86 E46 Southampton 1 low old False
Johnson,
Mrs. Oscar
W
8 9 Saved Poor female 27.0 0 2 347742 11.13 NaN Southampton 0 low teen True
(Elisabeth
Vilhelmina
Berg)
sex
Person Fare
PassengerId Survived Pclass Sex Age SibSp Parch Ticket Fare Cabin Embarked Titanic age_bins is_train
Name Range
map
0 1 Died Poor Anonymous male 22.0 1 0 A/5 21171 7.25 NaN Southampton 1 low teen False
STON/O2.
2 3 Saved Poor Anonymous female 26.0 0 0 7.92 NaN Southampton 0 low teen True
3101282
4 5 Died Poor Anonymous male 35.0 0 0 373450 8.05 NaN Southampton 1 low adult True
6 7 Died Rich Anonymous male 54.0 0 0 17463 51.86 E46 Southampton 1 low old False
Johnson,
Mrs. Oscar
W
8 9 Saved Poor female 27.0 0 2 347742 11.13 NaN Southampton 0 low teen True
(Elisabeth
Vilhelmina
Berg)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
STON/O2.
2 3 1 3 ? female 26.0 0 0 7.92 NaN S
3101282
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
STON/O2.
2 3 1 3 NaN female 26.0 0 0 7.92 NaN S
3101282
df1 = df.copy()
df1[df1['Age'].isnull()].index
Index([ 17, 19, 26, 28, 29, 31, 32, 36, 42, 45,
...
832, 837, 839, 846, 849, 859, 863, 868, 878, 888],
dtype='int64', length=176)
#Join
#Join
Join = df1.join(df1, lsuffix = '_1') # lsuffix = Left Suffix
Join.head(2)
PassengerId_1 Survived_1 Pclass_1 Name_1 Sex_1 Age_1 SibSp_1 Parch_1 Ticket_1 Fare_1 Cabin_1 Embarked_1 PassengerId Surv
Braund,
0 1 0 3 Mr. Owen male 22.0 1 0 A/5 21171 7.25 NaN S 1
Harris
Heikkinen,
STON/O2.
2 3 1 3 Miss. female 26.0 0 0 7.92 NaN S 3
3101282
Laina
#Melt
Melt = pd.melt(df1,id_vars = ['Embarked'], value_vars = ['Survived'])
Melt.head()
0 S Survived 0
1 S Survived 1
2 S Survived 0
3 S Survived 0
4 S Survived 1
df1 = df.copy()
df1.dtypes
PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object
dtype: object
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Pclass_New
0 1 0 3 Braund, Mr. Owen Harris female 22.0 1 0 A/5 21171 7.25 NaN S 0
STON/O2.
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 7.92 NaN S 0
3101282
count
Pclass_New
0 673
UpperClass 214
0 1 0 3 Braund, Mr. Owen Harris female 22.0 1 0 A/5 21171 7.25 NaN S 5
STON/O2.
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 7.92 NaN S 5
3101282
count
Pclass_New
5 489
0 398
df1.columns.tolist()
['PassengerId',
'Survived',
'Pclass',
'Name',
'Sex',
'Age',
'SibSp',
'Parch',
'Ticket',
'Fare',
'Cabin',
'Embarked',
'Pclass_New']
df1.nunique()
PassengerId 887
Survived 2
Pclass 3
Name 887
Sex 1
Age 88
SibSp 7
Parch 7
Ticket 679
Fare 246
Cabin 146
Embarked 3
Pclass_New 2
dtype: int64
df1 = df.copy()
df1 = df.copy()
df1['Age_Range'] = pd.cut(df1['Age'],
bins=[0.,15,30,45,60,65,np.inf],
labels=[1,2,3,4,5,6])
df1.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Age_Range
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.25 NaN S 2
STON/O2.
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 7.92 NaN S 2
3101282
df1 = df.copy()
# Split Age into five equal-width intervals.
df1['AgeBand'] = pd.cut(df1['Age'], 5)
# Mean of Survived per age band, ordered by band. The closing parenthesis of
# sort_values was missing from the source line and is restored here.
(
    df1[['AgeBand', 'Survived']]
    .groupby(['AgeBand'], as_index=False)
    .mean()
    .sort_values(by='AgeBand', ascending=True)
)
AgeBand Survived
Cabin_Not_NA = df1[df1['Cabin'].notna()]
Cabin_Not_NA.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked AgeBand
(48.168,
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.86 E46 S
64.084]
(48.168,
11 12 1 1 Bonnell, Miss. Elizabeth female 58.0 0 0 113783 26.55 C103 S
64.084]
(32.252,
21 22 1 2 Beesley, Mr. Lawrence male 34.0 0 0 248698 13.00 D56 S
48.168]
(16.336,
23 24 1 1 Sloper, Mr. William Thompson male 28.0 0 0 113788 35.50 A6 S
32.252]
Cabin_NaN = df1[df1['Cabin'].isnull()]
Cabin_NaN.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked AgeBand
(16.336,
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.25 NaN S
32.252]
STON/O2. (16.336,
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 7.92 NaN S
3101282 32.252]
(32.252,
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.05 NaN S
48.168]
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin AgeBand Embarked_Q Embarked_S
Braund,
(16.336,
0 1 0 3 Mr. Owen male 22.0 1 0 A/5 21171 7.25 NaN False True
32.252]
Harris
Heikkinen,
STON/O2. (16.336,
2 3 1 3 Miss. female 26.0 0 0 7.92 NaN False True
3101282 32.252]
Laina
Allen, Mr.
(32.252,
4 5 0 3 William male 35.0 0 0 373450 8.05 NaN False True
48.168]
Henry
McCarthy,
(48.168,
6 7 0 1 Mr. male 54.0 0 0 17463 51.86 E46 False True
64.084]
Timothy J
Johnson,
Mrs.
Oscar W (16.336,
8 9 1 3 female 27.0 0 2 347742 11.13 NaN False True
(Elisabeth 32.252]
Vilhelmina
Berg)
df1 = df.copy()
def Grade(Percentage):
    """Map a numeric amount to a coarse band label.

    Args:
        Percentage: Number to classify.

    Returns:
        'High' for values >= 500, 'Medium' for >= 300, 'Average' for >= 200,
        'Low' for >= 100, 'VeryLow' for >= 50, and 'Free' for anything else.
        A value that compares False against every threshold (for example
        NaN) also ends up as 'Free'.
    """
    # Thresholds are listed from highest to lowest; the first one met wins.
    bands = (
        (500, 'High'),
        (300, 'Medium'),
        (200, 'Average'),
        (100, 'Low'),
        (50, 'VeryLow'),
    )
    for floor, label in bands:
        if Percentage >= floor:
            return label
    return 'Free'
# Grade only reads the fare, so map it over that single column rather than
# materialising every row with DataFrame.apply(axis=1).
df1['Fare_Range'] = df1['Fare'].apply(Grade)
df1.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Fare_Range
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.25 NaN S Free
STON/O2.
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 7.92 NaN S Free
3101282
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.05 NaN S Free
wine = pd.read_csv('WineQT.csv')
wine.head(3)
0 7.4 0.70 0.00 1.9 0.08 11.0 34.0 1.0 3.51 0.56 9.4 5 0
1 7.8 0.88 0.00 2.6 0.10 25.0 67.0 1.0 3.20 0.68 9.8 5 1
2 7.8 0.76 0.04 2.3 0.09 15.0 54.0 1.0 3.26 0.65 9.8 5 2
['citric acid',
'total sulfur dioxide',
'density',
'pH',
'sulphates',
'alcohol',
'quality',
'Id']
0 7.4 0.70 0.00 1.9 0.08 11.0 34.0 1.0 3.51 0.56 9.4 5 0 no
1 7.8 0.88 0.00 2.6 0.10 25.0 67.0 1.0 3.20 0.68 9.8 5 1 no
2 7.8 0.76 0.04 2.3 0.09 15.0 54.0 1.0 3.26 0.65 9.8 5 2 no
3 11.2 0.28 0.56 1.9 0.07 17.0 60.0 1.0 3.16 0.58 9.8 6 3 no
4 7.4 0.70 0.00 1.9 0.08 11.0 34.0 1.0 3.51 0.56 9.4 5 4 no
0 7.4 0.70 0.00 1.9 0.08 11.0 34.0 1.0 3.51 0.56 9.4 5 0 no
1 7.8 0.88 0.00 2.6 0.10 25.0 67.0 1.0 3.20 0.68 9.8 5 1 no
2 7.8 0.76 0.04 2.3 0.09 15.0 54.0 1.0 3.26 0.65 9.8 5 2 no
3 11.2 0.28 0.56 1.9 0.07 17.0 60.0 1.0 3.16 0.58 9.8 6 3 no
4 7.4 0.70 0.00 1.9 0.08 11.0 34.0 1.0 3.51 0.56 9.4 5 4 no
# Print Row
# NOTE(review): the variable is named row_30, but iloc[75] selects the row at
# integer position 75 — confirm which row was intended.
row_30 = wine.iloc[75]
print(row_30)
fixed acidity 7.8
volatile acidity 0.41
citric acid 0.68
residual sugar 1.7
chlorides 0.47
free sulfur dioxide 18.0
total sulfur dioxide 69.0
density 1.0
pH 3.08
sulphates 1.31
alcohol 9.3
quality 5
Id 106
good_quality no
Name: 75, dtype: object
# Print which of the two 'chlorides' messages applies.
# NOTE(review): any_negative_yield is set outside the visible lines;
# presumably it is a boolean from a check like (wine['chlorides'] < 0).any()
# — confirm.
if any_negative_yield:
    print("The 'chlorides' column contains negative values.")
else:
    print("The 'chlorides' column does not contain negative values.")
geo = pd.read_csv('gapminder_full.csv')
geo.head(3)
reg_medal=geo.groupby(['country','continent']).size().reset_index().head(10)
reg_medal
country continent 0
0 Afghanistan Asia 12
1 Albania Europe 12
2 Algeria Africa 12
3 Angola Africa 12
4 Argentina Americas 12
5 Australia Oceania 12
6 Austria Europe 12
7 Bahrain Asia 12
8 Bangladesh Asia 12
9 Belgium Europe 12
df2=geo.groupby('country')['continent'].nunique().reset_index()
df2.head()
country continent
0 Afghanistan 1
1 Albania 1
2 Algeria 1
3 Angola 1
4 Argentina 1
geo.groupby('country')['continent'].count().nlargest(20).reset_index().head(10)
country continent
0 Afghanistan 12
1 Albania 12
2 Algeria 12
3 Angola 12
4 Argentina 12
5 Australia 12
6 Austria 12
7 Bahrain 12
8 Bangladesh 12
9 Belgium 12
1296 Sao Tome and Principe 1952 60011 Africa 46.47 879.58
1297 Sao Tome and Principe 1957 61325 Africa 48.95 860.74
1298 Sao Tome and Principe 1962 65345 Africa 51.89 1071.55
1299 Sao Tome and Principe 1967 70787 Africa 54.42 1384.84
1300 Sao Tome and Principe 1972 76595 Africa 56.48 1532.99
1301 Sao Tome and Principe 1977 86796 Africa 58.55 1737.56
1302 Sao Tome and Principe 1982 98593 Africa 60.35 1890.22
24 China 11497920623
58 India 8413568878
59 Indonesia 1779874000
14 Brazil 1467745520
66 Japan 1341105696
97 Pakistan 1124200629
8 Bangladesh 1089064744
47 Germany 930564520
94 Nigeria 884496214
print(f"\033[031m\033[1m")
print("Unique continent Names :", geo['continent'].nunique())
geo['continent'].value_counts().nlargest(10).to_frame().style.background_gradient(cmap='copper')
continent
Africa 624
Asia 396
Europe 360
Americas 300
Oceania 24
df1 = geo[geo['continent']=='Asia']
df1.groupby('country')['life_exp'].max().sort_values(ascending=False).head(5).reset_index()
country life_exp
0 Japan 82.60
2 Israel 80.75
3 Singapore 79.97
geo.groupby('country')['gdp_cap'].mean().sort_values(ascending=False).index[0:5]
geo.groupby('country')['gdp_cap'].mean().sort_values(ascending=False).head(10)
country
Kuwait 65332.91
Switzerland 27074.33
Norway 26747.31
United States 26261.15
Canada 22410.75
Netherlands 21748.85
Denmark 21671.82
Germany 20556.68
Iceland 20531.42
Austria 20411.92
Name: gdp_cap, dtype: float64
data = geo[geo['continent']=='Africa']
data[data['gdp_cap'] == data['gdp_cap'].max()]['country']
905 Libya
Name: country, dtype: object
data = geo[geo['continent']=='Africa']
data[data['gdp_cap'] == data['gdp_cap'].min()]['country']
# Create a LazyClassifier
# NOTE(review): LazyClassifier, accuracy_score and the train/test splits are
# not defined in the visible lines; they must come from earlier cells.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
print("Models performance:")
models
# If you want to evaluate a specific model (e.g., the best performing one)
if not models.empty:
    # The first index entry of `models` is treated as the best model.
    best_model = models.index[0]
    print(f"\nBest model: {best_model}")
    if best_model in predictions.columns:
        best_model_predictions = predictions[best_model]
        accuracy = accuracy_score(y_test, best_model_predictions)
        print(f"Accuracy of the best model: {accuracy}")
    else:
        print(f"Warning: Predictions for {best_model} not found in the predictions DataFrame.")
        print("Available models in predictions:")
        print(predictions.columns)
else:
    print("No models were successfully trained.")
plt.tight_layout()
plt.show()
# Reshape the selected score columns into long form (one row per
# model/metric pair) and draw them as a horizontal grouped bar chart.
# NOTE(review): `models` and `metrics` come from earlier cells; `metrics` is
# presumably a list of column names in `models` — confirm.
summary_df = models[metrics].copy()
summary_df['Model'] = models.index
summary_df = summary_df.melt(
    id_vars=['Model'],
    var_name='Metric',
    value_name='Score',
)
plt.figure(figsize=(15, 30))
barplot = sns.barplot(
    x='Score',
    y='Model',
    hue='Metric',
    data=summary_df,
    orient='h',
    palette='viridis',
)