Aiml Lab04&5 - Output
Aiml Lab04&5 - Output
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
x=pd.read_csv("test.csv")
y=pd.read_csv("train.csv")
z=pd.read_csv("gender_submission.csv")
In [3]:
y.head()
Out[3]: PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin E
Braund,
A/5
0 1 0 3 Mr. Owen male 22.0 1 0 7.2500 NaN
21171
Harris
Cumings,
Mrs. John
Bradley
1 2 1 1 female 38.0 1 0 PC 17599 71.2833 C85
(Florence
Briggs
Th...
Heikkinen,
STON/O2.
2 3 1 3 Miss. female 26.0 0 0 7.9250 NaN
3101282
Laina
Futrelle,
Mrs.
Jacques
3 4 1 1 female 35.0 1 0 113803 53.1000 C123
Heath
(Lily May
Peel)
Allen, Mr.
4 5 0 3 William male 35.0 0 0 373450 8.0500 NaN
Henry
In [4]:
x.head()
Out[4]: PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
Kelly, Mr.
0 892 3 male 34.5 0 0 330911 7.8292 NaN Q
James
Wilkes,
Mrs.
1 893 3 James female 47.0 1 0 363272 7.0000 NaN S
(Ellen
Needs)
Myles,
Mr.
2 894 2 male 62.0 0 0 240276 9.6875 NaN Q
Thomas
Francis
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 1/18
2/26/24, 10:20 AM Titanic
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
Albert
Hirvonen,
Mrs.
4 896 3 Alexander female 22.0 1 1 3101298 12.2875 NaN S
(Helga E
Lindqvist)
In [5]:
y.describe()
In [6]:
y.columns
In [7]:
y.dtypes
y.isnull().sum()
Out[7]: PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
In [8]:
y.info
In [9]:
y.Survived.value_counts()
Out[9]: 0 549
1 342
Name: Survived, dtype: int64
In [10]:
plt=y.Survived.value_counts().plot(kind='bar')
plt.set_xlabel('Survived or not ')
plt.set_ylabel('Passenger Count ')
In [11]:
plt= y.Pclass.value_counts().sort_index().plot(kind='bar',title='')
plt.set_xlabel('Pclass')
plt.set_ylabel('Survival probabiltiy')
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 3/18
2/26/24, 10:20 AM Titanic
In [12]:
y[['Pclass','Survived']].groupby('Pclass').count()
y[['Pclass','Survived']].groupby('Pclass').sum()
plt=y[['Pclass','Survived']].groupby('Pclass').mean().Survived.plot(kind='bar')
In [13]:
plt.set_xlabel('Pclass')
plt.set_ylabel('survival Probability')
In [14]:
plt=y.Sex.value_counts().sort_index().plot(kind='bar')
plt.set_xlabel('Sex')
plt.set_ylabel('Passenger Count')
plt=y[['Embarked','Survived']].groupby('Embarked').mean().Survived.plot(kind='bar')
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 4/18
2/26/24, 10:20 AM Titanic
In [15]:
plt=y[['Sex','Survived']].groupby('Sex').mean().Survived.plot(kind='bar')
plt.set_xlabel('Sex')
plt.set_ylabel('Passenger Count')
In [16]:
plt=y.Embarked.value_counts().sort_index().plot(kind='bar')
plt.set_xlabel('Embarked')
plt.set_ylabel('Passenger Count')
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 5/18
2/26/24, 10:20 AM Titanic
In [17]:
plt=y.SibSp.value_counts().sort_index().plot(kind='bar')
plt.set_xlabel('SibSp')
plt.set_ylabel('Passenger Count')
In [18]:
plt=y[['SibSp','Survived']].groupby('SibSp').mean().Survived.plot(kind='bar')
plt.set_xlabel('SibSp')
plt.set_ylabel('Survival Probability')
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 6/18
2/26/24, 10:20 AM Titanic
In [19]:
plt=y.Parch.value_counts().sort_index().plot(kind='bar')
plt.set_xlabel('Parch')
plt.set_ylabel('Passenger Count')
In [20]:
plt=y[['Parch','Survived']].groupby('Parch').mean().Survived.plot(kind='bar')
plt.set_xlabel('Parch')
plt.set_ylabel('Survival Probability')
In [21]:
sns.factorplot('Pclass',col='Embarked',data=y,kind='count')
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:3714: UserWarning:
The `factorplot` function has been renamed to `catplot`. The original name will be re
moved in a future release. Please update your code. Note that the default `kind` in `
factorplot` (`'point'`) has changed `'strip'` in `catplot`.
warnings.warn(msg)
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning:
Pass the following variable as a keyword arg: x. From version 0.12, the only valid po
sitional argument will be `data`, and passing other arguments without an explicit key
word will result in an error or misinterpretation.
warnings.warn(
Out[21]: <seaborn.axisgrid.FacetGrid at 0x1eeddcb92e0>
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 7/18
2/26/24, 10:20 AM Titanic
In [22]:
sns.factorplot('Sex',col='Pclass',data=y,kind='count')
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:3714: UserWarning:
The `factorplot` function has been renamed to `catplot`. The original name will be re
moved in a future release. Please update your code. Note that the default `kind` in `
factorplot` (`'point'`) has changed `'strip'` in `catplot`.
warnings.warn(msg)
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning:
Pass the following variable as a keyword arg: x. From version 0.12, the only valid po
sitional argument will be `data`, and passing other arguments without an explicit key
word will result in an error or misinterpretation.
warnings.warn(
Out[22]: <seaborn.axisgrid.FacetGrid at 0x1eede0e84f0>
In [23]:
sns.factorplot('Sex',col='Embarked',data=y,kind='count')
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:3714: UserWarning:
The `factorplot` function has been renamed to `catplot`. The original name will be re
moved in a future release. Please update your code. Note that the default `kind` in `
factorplot` (`'point'`) has changed `'strip'` in `catplot`.
warnings.warn(msg)
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning:
Pass the following variable as a keyword arg: x. From version 0.12, the only valid po
sitional argument will be `data`, and passing other arguments without an explicit key
word will result in an error or misinterpretation.
warnings.warn(
Out[23]: <seaborn.axisgrid.FacetGrid at 0x1eeddb86be0>
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 8/18
2/26/24, 10:20 AM Titanic
In [24]:
y.drop(822,axis=0,inplace=True)
In [25]:
y['Familysize']=y['SibSp']+y['Parch']+1
y.head()
Out[25]: PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin E
Braund,
A/5
0 1 0 3 Mr. Owen male 22.0 1 0 7.2500 NaN
21171
Harris
Cumings,
Mrs. John
Bradley
1 2 1 1 female 38.0 1 0 PC 17599 71.2833 C85
(Florence
Briggs
Th...
Heikkinen,
STON/O2.
2 3 1 3 Miss. female 26.0 0 0 7.9250 NaN
3101282
Laina
Futrelle,
Mrs.
Jacques
3 4 1 1 female 35.0 1 0 113803 53.1000 C123
Heath
(Lily May
Peel)
Allen, Mr.
4 5 0 3 William male 35.0 0 0 373450 8.0500 NaN
Henry
In [26]:
y=y.drop(columns=['Ticket','PassengerId','Cabin'])
y.head()
y['Sex']=y['Sex'].map({'male':0,'female':1})
y['Embarked']=y['Embarked'].map({'C':0,'Q':1,'S':2})
y.head()
Out[26]: Survived Pclass Name Sex Age SibSp Parch Fare Embarked Familysize
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 9/18
2/26/24, 10:20 AM Titanic
Survived Pclass Name Sex Age SibSp Parch Fare Embarked Familysize
Briggs Th...
Heikkinen, Miss.
2 1 3 1 26.0 0 0 7.9250 2.0 1
Laina
Futrelle, Mrs.
3 1 1 Jacques Heath (Lily 1 35.0 1 0 53.1000 2.0 2
May Peel)
In [27]:
y['Title']=y.Name.str.extract('([A-Za-z]+)\.',expand=False)
In [28]:
y=y.drop(columns="Name")
y.Title.unique()
y.head()
Out[28]: Survived Pclass Sex Age SibSp Parch Fare Embarked Familysize Title
In [29]:
y.Title.value_counts().plot(kind='bar')
Out[29]: <AxesSubplot:>
In [30]:
y['Title']=y['Title'].replace(['Dr','Rev','Col', 'Major', 'Countess', 'Sir', 'Johnkh
In [31]:
y['Title']=y['Title'].replace('Ms', 'Miss')
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 10/18
2/26/24, 10:20 AM Titanic
In [32]:
y['Title']=y['Title'].replace('Mlle', 'Miss')
In [33]:
y['Title']=y['Title'].replace('Mme', 'Mrs')
In [34]:
y['Title']=y['Title'].replace('Master', 'Mr')
y.head()
Out[34]: Survived Pclass Sex Age SibSp Parch Fare Embarked Familysize Title
In [35]:
plt=y.Title.value_counts().sort_index().plot(kind='bar')
plt.set_xlabel('Title')
plt.set_ylabel('Passenger Count')
In [36]:
plt=y[['Title','Survived']].groupby('Title').mean().Survived.plot(kind='bar')
plt.set_xlabel('Title')
plt.set_ylabel('Survival Probability')
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 11/18
2/26/24, 10:20 AM Titanic
In [37]:
y['Title']=y['Title'].map({'Master':0,'Miss':1,'Mr':2,'Mrs':3,'Others':4})
corr_matrix=y.corr()
In [38]:
import matplotlib.pyplot as plt
plt.figure(figsize=(9,8))
sns.heatmap(data=corr_matrix, cmap='BrBG', annot=True, linewidths=0.2)
Out[38]: <AxesSubplot:>
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 12/18
2/26/24, 10:20 AM Titanic
In [39]:
y.isnull().sum()
y['Embarked']=y['Embarked'].fillna(2)
y.head()
Out[39]: Survived Pclass Sex Age SibSp Parch Fare Embarked Familysize Title
In [40]:
age_median_train=y.Age.median()
y.Age=y.Age.fillna(age_median_train)
print(age_median_train)
28.0
In [41]:
y.isnull().sum()
y.head()
Out[41]: Survived Pclass Sex Age SibSp Parch Fare Embarked Familysize Title
In [42]:
from sklearn.utils import shuffle
y=shuffle(y)
y.head()
Out[42]: Survived Pclass Sex Age SibSp Parch Fare Embarked Familysize Title
In [43]:
x_train=y.drop(columns='Survived')
y_train=y[['Survived']]
x_train.shape
Out[43]: (890, 9)
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 13/18
2/26/24, 10:20 AM Titanic
In [44]:
y=y.drop(columns='Sex')
x_train.head()
Out[44]: Pclass Sex Age SibSp Parch Fare Embarked Familysize Title
In [ ]:
In [45]:
from sklearn.linear_model import LogisticRegression
y_train.head()
Out[45]: Survived
761 0
555 0
351 0
438 0
172 1
In [46]:
y_train.isnull()
Out[46]: Survived
761 False
555 False
351 False
438 False
172 False
... ...
129 False
117 False
692 False
419 False
360 False
In [47]:
from sklearn.model_selection import train_test_split
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 14/18
2/26/24, 10:20 AM Titanic
In [48]:
x_training, x_valid, y_training, y_valid= train_test_split(x_train, y_train, test_si
logreg_clf=LogisticRegression()
In [49]:
logreg_clf.fit(x_training, y_training)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:63: DataConver
sionWarning: A column-vector y was passed when a 1d array was expected. Please change
the shape of y to (n_samples, ), for example using ravel().
return f(*args, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:763: Con
vergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
Out[49]: LogisticRegression()
In [50]:
prediction=logreg_clf.predict(x_valid)
In [51]:
from sklearn.metrics import accuracy_score
accuracy_score(y_valid,prediction)
Out[51]: 0.7640449438202247
In [52]:
from sklearn.metrics import confusion_matrix
confusion=confusion_matrix(y_valid,prediction,labels=[1,0])
In [53]:
print(confusion)
[[42 22]
[20 94]]
In [54]:
from sklearn.metrics import classification_report
report=classification_report(y_valid,prediction)
print(report)
In [55]:
x['Familysize']=x['SibSp']+x['Parch']+1
x.head()
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 15/18
2/26/24, 10:20 AM Titanic
Out[55]: PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked F
Kelly, Mr.
0 892 3 male 34.5 0 0 330911 7.8292 NaN Q
James
Wilkes,
Mrs.
1 893 3 James female 47.0 1 0 363272 7.0000 NaN S
(Ellen
Needs)
Myles,
Mr.
2 894 2 male 62.0 0 0 240276 9.6875 NaN Q
Thomas
Francis
Wirz, Mr.
3 895 3 male 27.0 0 0 315154 8.6625 NaN S
Albert
Hirvonen,
Mrs.
4 896 3 Alexander female 22.0 1 1 3101298 12.2875 NaN S
(Helga E
Lindqvist)
In [56]:
x=x.drop(columns=['Ticket','PassengerId','Cabin'])
x.head()
x['Sex']=x['Sex'].map({'male':0,'female':1})
x['Embarked']=x['Embarked'].map({'C':0,'Q':1,'S':2})
x.head()
Out[56]: Pclass Name Sex Age SibSp Parch Fare Embarked Familysize
In [57]:
x['Title']=x.Name.str.extract('([A-Za-z]+)\.',expand=False)
In [58]:
x=x.drop(columns="Name")
x.Title.unique()
x.head()
Out[58]: Pclass Sex Age SibSp Parch Fare Embarked Familysize Title
0 3 0 34.5 0 0 7.8292 1 1 Mr
2 2 0 62.0 0 0 9.6875 1 1 Mr
3 3 0 27.0 0 0 8.6625 2 1 Mr
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 16/18
2/26/24, 10:20 AM Titanic
In [59]:
x['Title']=x['Title'].replace(['Dr','Rev','Col', 'Major', 'Countess', 'Sir', 'Johnkh
x['Title']=x['Title'].replace('Ms', 'Miss')
x['Title']=x['Title'].replace('Mlle', 'Miss')
x['Title']=x['Title'].replace('Mme', 'Mrs')
x['Title']=x['Title'].replace('Master', 'Mr')
x.head()
Out[59]: Pclass Sex Age SibSp Parch Fare Embarked Familysize Title
0 3 0 34.5 0 0 7.8292 1 1 Mr
2 2 0 62.0 0 0 9.6875 1 1 Mr
3 3 0 27.0 0 0 8.6625 2 1 Mr
In [60]:
x['Title']=x['Title'].map({'Master':0,'Miss':1,'Mr':2,'Mrs':3,'Others':4})
corr_matrix=x.corr()
In [61]:
x.isnull().sum()
x['Embarked']=x['Embarked'].fillna(2)
x.head()
Out[61]: Pclass Sex Age SibSp Parch Fare Embarked Familysize Title
In [62]:
z_train=x
w_train=z[['Survived']]
z_train.shape
Out[62]: (418, 9)
In [63]:
age_median_train=x.Age.median()
x.Age=x.Age.fillna(age_median_train)
print(age_median_train)
27.0
In [64]:
T_median_train=y.Title.median()
x.Title=x.Title.fillna(T_median_train)
print(T_median_train)
2.0
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 17/18
2/26/24, 10:20 AM Titanic
In [65]:
f_median_train=x.Fare.median()
x.Fare=x.Fare.fillna(f_median_train)
print(f_median_train)
14.4542
In [66]:
prediction1=logreg_clf.predict(z_train)
In [68]:
accuracy_score(w_train,prediction1)
Out[68]: 0.9425837320574163
In [69]:
confusion=confusion_matrix(w_train,prediction1,labels=[1,0])
In [70]:
from sklearn.metrics import classification_report
report=classification_report(w_train,prediction1)
print(report)
In [ ]:
localhost:8889/nbconvert/html/Titanic.ipynb?download=false 18/18