0% found this document useful (0 votes)
23 views · 2 pages

Week1 Code Corrected

The document loads diabetes-related data from an Excel file, analyzes it by finding means, null values, and outliers, and replaces nulls with means before further exploration.

Uploaded by

aravindsv368
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
23 views · 2 pages

Week1 Code Corrected

The document loads diabetes-related data from an Excel file, analyzes it by finding means, null values, and outliers, and replaces nulls with means before further exploration.

Uploaded by

aravindsv368
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1 / 2

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    precision_score,
    recall_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Load the diabetes dataset from the Excel workbook into a DataFrame.
data = pd.read_excel('health care diabetes.xlsx')

# Preview the first rows and the summary statistics. As bare expressions
# these produce visible output only in an interactive session.
data.head()

data.describe()

# Identifying the mean of the features (printed one per line, in this order)
for feature in ('Glucose', 'BloodPressure', 'SkinThickness',
                'Insulin', 'Pregnancies', 'BMI'):
    print(data[feature].mean())

# Columns in which a value of 0 is counted as a null (missing) entry.
selected_columns = ['Glucose', 'BloodPressure',
                    'SkinThickness', 'Insulin', 'Pregnancies', 'BMI']

# Finding the number of rows that hold a null (zero) value in each column
for feature in selected_columns:
    zero_rows = int((data[feature] == 0).sum())
    print(feature + '-', zero_rows)

# Finding the null value percentage: the mean of the boolean "is zero" mask
# is the fraction of zero rows per column, scaled here to a percentage.
null_percentage = (data[selected_columns] == 0).mean() * 100

# Displaying the null value percentage for each selected column
print("Percentage of Null Values for Each Column:")
print(null_percentage)

# Replacing the null (zero) values with the column mean.
# The mean is computed on the column as it currently stands, i.e. with the
# zero entries still included.
# NOTE(review): 'Pregnancies' is part of selected_columns, so a count of 0
# is also overwritten with the mean — confirm this is intended.
for feature in selected_columns:
    feature_mean = data[feature].mean()
    data[feature] = data[feature].replace([0], [feature_mean])

data.describe()

# Checking the null value percentage of the treated columns
null_percentage_treated = (data[selected_columns] == 0).mean() * 100

# Displaying the null value percentage for each selected column
print("Percentage of Null Values for Each Column after the null value treatment:")
print(null_percentage_treated)

# Sub-frame holding only the columns under inspection.
columns = data[selected_columns]

# Display boxplots for numeric columns to visualize outliers:
# one box per column, all drawn on the same axes.
plt.figure(figsize=(12, 8))
sns.boxplot(data=columns)
plt.gca().set_title("Boxplots for Numeric Columns")
plt.show()

# Finding the Outlier Count in the selected Columns:


def find_outliers_iqr(data, column_name):
    """Count the rows of *data* whose *column_name* value is an IQR outlier.

    A value counts as an outlier when it lies strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of the column and ``IQR = Q3 - Q1``.

    Args:
        data: pandas DataFrame containing the column to inspect.
        column_name: Label of the column in *data* to check.

    Returns:
        The number of outlier rows, as an ``int``.
    """
    values = data[column_name]

    # Calculate the first quartile (Q1) and third quartile (Q3)
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)

    # Calculate the interquartile range (IQR)
    iqr = q3 - q1

    # Define the lower and upper bounds for outliers
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Flag outliers with a boolean mask; summing it counts the True entries
    # without materialising a filtered copy of the frame.
    is_outlier = (values < lower_bound) | (values > upper_bound)

    return int(is_outlier.sum())

# Calculate and print the number of outliers for each column of interest
for column_name in selected_columns:
    outlier_count = find_outliers_iqr(data, column_name)
    print(f"Number of outliers in the '{column_name}' column: {outlier_count}")

You might also like