0% found this document useful (0 votes)
5 views

Week1 Code Corrected

The document loads diabetes-related data from an Excel file, computes the feature means, counts the null (zero-valued) entries and the outliers, and replaces the nulls with the column means before further exploration.

Uploaded by

aravindsv368
Copyright
© © All Rights Reserved
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
5 views

Week1 Code Corrected

The document loads diabetes-related data from an Excel file, computes the feature means, counts the null (zero-valued) entries and the outliers, and replaces the nulls with the column means before further exploration.

Uploaded by

aravindsv368
Copyright
© © All Rights Reserved
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 2

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    mean_squared_error,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
    roc_curve,
    auc,
)
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Load the diabetes dataset; the relative path is resolved against the
# current working directory.
data = pd.read_excel('health care diabetes.xlsx')

# Bare expressions: they only display something in an interactive/notebook
# session and have no visible effect when run as a plain script.
data.head()
data.describe()

# Print the mean of each feature of interest, one value per line.
for feature in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                'Pregnancies', 'BMI'):
    print(data[feature].mean())

# Count, per feature, the rows whose value is 0. This script uses 0 as the
# marker for a missing ("null") measurement.
for feature in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                'Pregnancies', 'BMI'):
    # Summing the boolean mask counts the rows where the comparison is True.
    zero_rows = (data[feature] == 0).sum()
    print(f'{feature}-', zero_rows)

# Percentage of zero-valued ("null") entries in each feature of interest.
selected_columns = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'Pregnancies',
    'BMI',
]
# The mean of a boolean column is the fraction of True values, so scaling
# it by 100 gives the percentage of zeros per column.
null_percentage = data[selected_columns].eq(0).mean().mul(100)

# Show the per-column percentages.
print("Percentage of Null Values for Each Column:")
print(null_percentage)

# Mean imputation: in each feature, overwrite every 0 with that feature's
# mean. The mean is taken before the column is modified and is computed over
# all rows, zeros included.
# NOTE(review): including the zeros pulls the imputed value down, and a 0 in
# 'Pregnancies' may be a genuine value rather than a missing one - confirm
# that this is the intended treatment.
for feature in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                'Pregnancies', 'BMI'):
    feature_mean = data[feature].mean()
    data[feature] = data[feature].replace([0], [feature_mean])

# Bare expression: displays the summary statistics only in an
# interactive/notebook session.
data.describe()

# Re-measure the share of zero-valued entries per feature of interest so it
# can be compared with the figures printed before the replacement step.
null_percentage_treated = data[selected_columns].eq(0).mean().mul(100)

# Show the per-column percentages after treatment.
print("Percentage of Null Values for Each Column after the null value treatment:")
print(null_percentage_treated)

# Keep only the features of interest for plotting.
columns = data.loc[:, selected_columns]

# Draw one box per feature on a single 12x8 inch figure so that outliers
# can be inspected visually.
plt.figure(figsize=(12, 8))
sns.boxplot(data=columns)
plt.title("Boxplots for Numeric Columns")
plt.show()

# Finding the Outlier Count in the selected Columns:


def find_outliers_iqr(data, column_name, multiplier=1.5):
    """Count the rows of ``data`` whose ``column_name`` value is an IQR outlier.

    A value is an outlier when it lies strictly below
    ``Q1 - multiplier * IQR`` or strictly above ``Q3 + multiplier * IQR``,
    where Q1 and Q3 are the 0.25 and 0.75 quantiles of the column and
    ``IQR = Q3 - Q1``.

    Args:
        data: pandas DataFrame that contains the column.
        column_name: Label of the numeric column to inspect.
        multiplier: Width of the fences in IQR units. Defaults to 1.5, the
            value this function previously hard-coded.

    Returns:
        The number of outlying rows as a plain ``int``.
    """
    values = data[column_name]

    # Quartiles and the interquartile range of the column.
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # Fences beyond which a value counts as an outlier.
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    # Count on the boolean mask directly instead of building a filtered
    # DataFrame only to take its length. NaN compares False on both sides,
    # so missing values are never counted as outliers.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return int(is_outlier.sum())

# Report the IQR-based outlier count for every feature of interest.
for column_name in selected_columns:
    outlier_count = find_outliers_iqr(data, column_name)
    print(f"Number of outliers in the '{column_name}' column: {outlier_count}")

You might also like