My Code
# Imports — single block at the top of the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show all columns when displaying wide DataFrames.
# (Fixed: was `pd.pandas.set_option`, an accidental alias; numpy was imported twice.)
pd.set_option('display.max_columns', None)
# Path to the raw data — Colab-Drive specific; adjust for your environment.
DATA_PATH = "/content/drive/MyDrive/Project Work/Kidney_data.csv"

# Reading Dataset:
dataset = pd.read_csv(DATA_PATH)

# Top 5 records:
dataset.head()
age - age
bp - blood pressure
sg - specific gravity
al - albumin
su - sugar
pc - pus cell
ba - bacteria
bu - blood urea
sc - serum creatinine
sod - sodium
pot - potassium
hemo - hemoglobin
htn - hypertension
dm - diabetes mellitus
appet - appetite
pe - pedal edema
ane - anemia
class - classification (target label)
Attribute Information:
Albumin(nominal) al - (0,1,2,3,4,5)
Sugar(nominal) su - (0,1,2,3,4,5)
Bacteria(nominal) ba - (present,notpresent)
# Shape of dataset: (n_rows, n_columns)
dataset.shape
# Description: summary statistics for the numeric columns
dataset.describe()
# All column names:
dataset.columns
# Per-column dtypes:
dataset.dtypes
1. rbc
dataset['rbc'].value_counts()
2. pc
# Pus cell ('pc') category counts, then binary encoding: normal -> 0, abnormal -> 1.
dataset['pc'].value_counts()
pc_encoding = {'normal': 0, 'abnormal': 1}
dataset['pc'] = dataset['pc'].replace(pc_encoding)
3. pcc
dataset['pcc'].value_counts()
4. ba
dataset['ba'].value_counts()
5. htn
dataset['htn'].value_counts()
6. dm
dataset['dm'].value_counts()
7. cad
dataset['cad'].value_counts()
8. appet
# Appetite ('appet') encoding: good -> 1, poor -> 0; a stray 'no' entry is
# treated as missing (NaN).
dataset['appet'].unique()
appet_encoding = {'good': 1, 'poor': 0, 'no': np.nan}
dataset['appet'] = dataset['appet'].replace(appet_encoding)
9. pe
dataset['pe'].value_counts()
10. ane
# Anemia ('ane') encoding: yes -> 1, no -> 0.
dataset['ane'].value_counts()
ane_encoding = {'yes': 1, 'no': 0}
dataset['ane'] = dataset['ane'].replace(ane_encoding)
11. classification
# Target label counts before cleaning.
dataset['classification'].value_counts()
# The raw file contains a stray 'ckd\t' label; fold it into 'ckd'.
label_fix = {'ckd\t': 'ckd'}
dataset['classification'] = dataset['classification'].replace(label_fix)
dataset.head()
# Datatypes after encoding:
# (Fixed: the dtypes cell was copy-pasted twice in a row; kept one copy.)
dataset.dtypes

# Description: summary statistics after encoding
dataset.describe()
There are outliers present in our dataset, so we fill the NaN values with the median.
# Column names (for reference when building the feature list below):
dataset.columns
# Full list of predictor columns ('classification' is the target).
features = ['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
'appet', 'pe', 'ane']
# Number of columns containing at least one NaN.
dataset.isnull().any().sum()
Heatmap
# Correlation heatmap across the numeric features.
plt.figure(figsize=(24, 14))
# numeric_only=True: on pandas >= 2.0, corr() raises if object columns remain;
# older pandas silently dropped them, so this keeps the old behavior explicit.
sns.heatmap(dataset.corr(numeric_only=True), annot=True, cmap='YlGnBu')
plt.show()
1. We clearly see that the 'pcv' and 'hemo' features have about 85% multicollinearity
2. So we remove one of the two features, i.e. 'pcv'
dataset.head()
# Target feature: class balance of 'classification'.
# NOTE(review): the removal of 'pcv' announced in the markdown is not visible
# in this view — confirm the column is actually dropped before modelling.
sns.countplot(x='classification', data=dataset)
X.head()
# Feature Importance:
from sklearn.ensemble import ExtraTreesClassifier
# (Removed the duplicate `import matplotlib.pyplot as plt` — already imported
# at the top of the notebook.)

# Seeded so the importance ranking is reproducible across runs; renamed from
# `model`, which is rebound to a plain list later in the notebook.
et_model = ExtraTreesClassifier(random_state=42)
et_model.fit(X, y)

plt.figure(figsize=(8, 6))
ranked_features = pd.Series(et_model.feature_importances_, index=X.columns)
ranked_features.nlargest(24).plot(kind='barh')
plt.show()

# Names of the 8 most important features.
ranked_features.nlargest(8).index
X.tail()
y.head()
# NOTE(review): X_train/X_test are used here but no train_test_split call is
# visible in this view — confirm the split cell exists and runs before this.
print(X_train.shape)
print(X_test.shape)
# Initializing empty lists to collect each model's name and accuracy
acc = []
# WARNING: this rebinds the name 'model' (previously a fitted classifier) to a
# plain list — re-running earlier cells after this point will misbehave.
model = []
# RandomForestClassifier:
from sklearn.ensemble import RandomForestClassifier

# Seeded for reproducible fits.
RandomForest = RandomForestClassifier(random_state=42)
RandomForest = RandomForest.fit(X_train, y_train)

# Predictions on the held-out test set:
y_pred_rf = RandomForest.predict(X_test)

# Performance:
accuracy_rf = accuracy_score(y_test, y_pred_rf)
# Reuse the value computed above instead of recomputing it.
print('Accuracy:', accuracy_rf)
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# NOTE: this is TRAINING accuracy (scored on X_train), not test accuracy.
rf_score = RandomForest.score(X_train, y_train)
# Build a one-row metrics table for Random Forest from its classification report.
report = classification_report(y_test, y_pred_rf, output_dict=True)
df = pd.DataFrame(report).transpose()
df = df.drop(['0', '1', 'accuracy', 'weighted avg'], axis=0)
df = df.drop('support', axis=1)
df.rename(index={"macro avg": "Random Forest"}, inplace=True)
# NOTE(review): rf_score is the TRAINING accuracy — confirm this is intended
# here rather than the test accuracy (accuracy_rf).
df['accuracy'] = round((rf_score * 100), 2)

# Collect the test accuracy for the comparison chart. (Fixed: the original
# recomputed it via `metrics.accuracy_score`, but `metrics` is never imported
# in this notebook view; accuracy_rf already holds the value.)
acc.append(accuracy_rf)
model.append('RF')
# Confusion Matrix
print(confusion_matrix(y_test, y_pred_rf))
# Use a distinct name: the original reused 'df', silently clobbering the
# metrics table built above, and built a 'df1' frame that was never used.
rf_results = pd.DataFrame({'y_Actual': y_test, 'y_Predicted': y_pred_rf})
rf_confusion_matrix = pd.crosstab(rf_results['y_Predicted'], rf_results['y_Actual'],
                                  rownames=['Predicted'], colnames=['Actual'])
sns.heatmap(rf_confusion_matrix, annot=True)
# Predictions:
# NOTE(review): 'svm' is used here but its training cell is not visible in
# this view — confirm the SVM model is fitted before this point.
y_pred_svm = svm.predict(X_test)
# Performance:
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print('Accuracy:', accuracy_score(y_test,y_pred_svm))
print(confusion_matrix(y_test,y_pred_svm))
print(classification_report(y_test,y_pred_svm))
# NOTE: this is TRAINING accuracy (scored on X_train), not test accuracy.
svm_score= svm.score(X_train,y_train)
# Build the SVM metrics row from ITS OWN classification report.
report1 = classification_report(y_test, y_pred_svm, output_dict=True)
# BUG FIX: the original transposed `report` (the Random Forest report) here,
# so the "Support Vector Machine" table silently showed Random Forest metrics.
df1 = pd.DataFrame(report1).transpose()
df1 = df1.drop(['0', '1', 'accuracy', 'weighted avg'], axis=0)
df1 = df1.drop('support', axis=1)
df1.rename(index={"macro avg": "Support Vector Machine"}, inplace=True)
# NOTE(review): svm_score is the TRAINING accuracy — confirm this is intended
# here rather than the test accuracy (accuracy_svm).
df1['accuracy'] = round((svm_score * 100), 2)

# Collect the test accuracy for the comparison chart (reuses accuracy_svm
# instead of recomputing via the unimported `metrics` name).
acc.append(accuracy_svm)
model.append('SVM')
# Confusion Matrix
print(confusion_matrix(y_test, y_pred_svm))
# Distinct names so the metrics tables ('df'/'df1') built earlier are not
# clobbered, and no unused intermediate frame is created.
svm_results = pd.DataFrame({'y_Actual': y_test, 'y_Predicted': y_pred_svm})
svm_confusion_matrix = pd.crosstab(svm_results['y_Predicted'], svm_results['y_Actual'],
                                   rownames=['Predicted'], colnames=['Actual'])
sns.heatmap(svm_confusion_matrix, annot=True)
Accuracy Comparison
# Horizontal bar chart comparing test accuracy of the collected models.
plt.figure(figsize=[10, 5], dpi=100)
plt.title('Accuracy Comparison')
plt.xlabel('Accuracy')
plt.ylabel('Algorithm')
sns.barplot(x=acc, y=model, palette='dark')
# Show the figure explicitly (was missing, so the cell echoed the Axes repr).
plt.show()
# NOTE(review): 'metrics_data' is not defined anywhere in this view — the cell
# that assembles it appears to be missing; restore it before this cell.
metrics_df = pd.DataFrame(metrics_data)
# Plotting
sns.set(style="whitegrid")
plt.figure(figsize=(8, 6))
# NOTE(review): a figure is created but nothing is drawn on it before
# tight_layout/show — the actual plotting call seems to have been lost.
plt.tight_layout()
plt.show()