AIDS Dataset: Exploratory Data Analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load the dataset from "aids.csv" (path is relative to the working directory).
df = pd.read_csv("aids.csv")
# Preview the first rows of the frame.
df.head()
... str2 strat symptom treat offtrt cd40 cd420 cd80 cd820
cid
0 ... 0 1 0 1 0 422 477 566 324
0
1 ... 1 3 0 1 0 162 218 392 564
1
2 ... 1 3 0 1 1 326 274 2063 1893
0
3 ... 1 3 0 1 0 287 394 1590 966
0
4 ... 1 3 0 0 0 504 353 870 782
0
[5 rows x 25 columns]
# Preview the last rows of the frame.
df.tail()
[5 rows x 25 columns]
# (rows, columns) of the loaded frame.
df.shape
(2139, 25)
# Column labels of the frame.
df.columns
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()
# Number of missing values in each column.
df.isnull().sum()
Unnamed: 0 0
time 0
trt 0
age 0
wtkg 0
hemo 0
homo 0
drugs 0
karnof 0
oprior 0
z30 0
zprior 0
preanti 0
race 0
gender 0
str2 0
strat 0
symptom 0
treat 0
offtrt 0
cd40 0
cd420 0
cd80 0
cd820 0
cid 0
dtype: int64
# Print per-column dtype and non-null count plus overall memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 25 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 2139 non-null int64
1 time 2139 non-null int64
2 trt 2139 non-null int64
3 age 2139 non-null int64
4 wtkg 2139 non-null float64
5 hemo 2139 non-null int64
6 homo 2139 non-null int64
7 drugs 2139 non-null int64
8 karnof 2139 non-null int64
9 oprior 2139 non-null int64
10 z30 2139 non-null int64
11 zprior 2139 non-null int64
12 preanti 2139 non-null int64
13 race 2139 non-null int64
14 gender 2139 non-null int64
15 str2 2139 non-null int64
16 strat 2139 non-null int64
17 symptom 2139 non-null int64
18 treat 2139 non-null int64
19 offtrt 2139 non-null int64
20 cd40 2139 non-null int64
21 cd420 2139 non-null int64
22 cd80 2139 non-null int64
23 cd820 2139 non-null int64
24 cid 2139 non-null int64
dtypes: float64(1), int64(24)
memory usage: 417.9 KB
# Descriptive statistics for the columns of the frame.
df.describe()
# Gather the labels of every int64/float64 column and print them.
numeric_dtypes = ['int64', 'float64']
numerical_columns = df.select_dtypes(include=numeric_dtypes).columns
print("\nNumerical type columns:")
print(numerical_columns)
# Start of a helper that takes a DataFrame `df`; the visible text only
# initialises four empty lists named for categorical, non-categorical,
# discrete and continuous features.
# NOTE(review): the remainder of the body (whatever fills these lists and the
# return statement) and the original indentation are missing from this export.
# Restore them from the original notebook before running — as shown here the
# function is not valid Python.
def classify_features(df):
categorical_features = []
non_categorical_features = []
discrete_features = []
continuous_features = []
Categorical Features: []
Non-Categorical Features: []
Discrete Features: ['trt', 'hemo', 'homo', 'drugs', 'karnof',
'oprior', 'z30', 'zprior', 'race', 'gender', 'str2', 'strat',
'symptom', 'treat', 'offtrt']
Continuous Features: ['time', 'age', 'wtkg', 'preanti', 'cd40',
'cd420', 'cd80', 'cd820']
# Print the distinct values found in each discrete column.
# NOTE(review): `discrete` is not defined in the visible part of this file;
# presumably it is the discrete-feature list built earlier — confirm.
for col in discrete:
    distinct_values = df[col].unique()
    print(col, ':')
    print(distinct_values)
    print()
trt :
[2 3 0 1]
hemo :
[0 1]
homo :
[0 1]
drugs :
[0 1]
karnof :
[100 90 80 70]
oprior :
[0 1]
z30 :
[0 1]
zprior :
[1]
race :
[0 1]
gender :
[0 1]
str2 :
[0 1]
strat :
[1 3 2]
symptom :
[0 1]
treat :
[1 0]
offtrt :
[0 1]
# Print how often each value occurs in every discrete column.
# NOTE(review): `discrete` is not defined in the visible part of this file;
# presumably it is the discrete-feature list built earlier — confirm.
for col in discrete:
    frequencies = df[col].value_counts()
    print(col, ':')
    print(frequencies)
    print()
trt :
3 561
0 532
2 524
1 522
Name: trt, dtype: int64
hemo :
0 1959
1 180
Name: hemo, dtype: int64
homo :
1 1414
0 725
Name: homo, dtype: int64
drugs :
0 1858
1 281
Name: drugs, dtype: int64
karnof :
100 1263
90 787
80 80
70 9
Name: karnof, dtype: int64
oprior :
0 2092
1 47
Name: oprior, dtype: int64
z30 :
1 1177
0 962
Name: z30, dtype: int64
zprior :
1 2139
Name: zprior, dtype: int64
race :
0 1522
1 617
Name: race, dtype: int64
gender :
1 1771
0 368
Name: gender, dtype: int64
str2 :
1 1253
0 886
Name: str2, dtype: int64
strat :
1 886
3 843
2 410
Name: strat, dtype: int64
symptom :
0 1769
1 370
Name: symptom, dtype: int64
treat :
1 1607
0 532
Name: treat, dtype: int64
offtrt :
0 1363
1 776
Name: offtrt, dtype: int64
# Draw one count plot (bar chart of value frequencies) per discrete column.
# NOTE(review): `discrete` is not defined in the visible part of this file.
for col in discrete:
    plt.figure(figsize=(15, 6))
    # The column is named via `x=`: current seaborn treats the first
    # positional argument as `data`, so passing the Series positionally
    # together with `data=df` fails with a duplicate-argument error.
    sns.countplot(x=col, data=df, palette='hls')
    plt.show()
# Draw one pie chart per discrete column showing the share of each value.
# NOTE(review): `discrete` is not defined in the visible part of this file.

# Text styling is identical for every chart, so it is built once up front.
hfont = {'fontname': 'serif', 'weight': 'bold'}
wedge_text_style = {
    'fontsize': 15,
    'color': 'black',
    'weight': 'bold',
    'family': 'serif',
}
for col in discrete:
    counts = df[col].value_counts()
    plt.figure(figsize=(20, 10))
    plt.pie(
        counts,
        labels=counts.index,
        autopct='%1.1f%%',
        textprops=wedge_text_style,
    )
    plt.title(col, size=20, **hfont)
    plt.show()
# Draw a 20-bin histogram with a KDE overlay for every continuous column.
# NOTE(review): `continuous` is not defined in the visible part of this file.
for col in continuous:
    plt.figure(figsize=(15, 6))
    # `palette` is omitted: seaborn only applies it together with `hue`, so
    # without a hue variable it had no effect beyond triggering a warning.
    sns.histplot(df[col], bins=20, kde=True)
    plt.xticks(rotation=90)
    plt.show()
# Draw a density-scaled histogram with a KDE curve for every continuous column.
# NOTE(review): `continuous` is not defined in the visible part of this file.
for col in continuous:
    plt.figure(figsize=(15, 6))
    # `sns.distplot` is deprecated; this is the `histplot` equivalent given in
    # seaborn's migration notes (density scale, KDE extended by cut=3).
    sns.histplot(df[col], bins=20, kde=True, stat='density',
                 kde_kws={'cut': 3})
    plt.xticks(rotation=90)
    plt.show()
# Draw a horizontal box plot for every continuous column.
# NOTE(review): `continuous` is not defined in the visible part of this file.
for col in continuous:
    plt.figure(figsize=(15, 6))
    # The column is named via `x=`: current seaborn treats the first
    # positional argument as `data`, which would clash with `data=df`.
    sns.boxplot(x=col, data=df, palette='hls')
    plt.xticks(rotation=90)
    plt.show()
# Draw a horizontal violin plot for every continuous column.
# NOTE(review): `continuous` is not defined in the visible part of this file.
for col in continuous:
    plt.figure(figsize=(15, 6))
    # The column is named via `x=`: current seaborn treats the first
    # positional argument as `data`, which would clash with `data=df`.
    sns.violinplot(x=col, data=df, palette='hls')
    plt.xticks(rotation=90)
    plt.show()
# Draw a scatter plot for every ordered pair of distinct continuous columns
# (each pair therefore appears twice, once per axis orientation).
# NOTE(review): `continuous` is not defined in the visible part of this file.
for x_col in continuous:
    for y_col in continuous:
        if x_col == y_col:
            continue
        plt.figure(figsize=(15, 6))
        # `ci` and `palette` are omitted: `ci` never affected scatterplot and
        # is not an accepted parameter in recent seaborn releases, and
        # `palette` is ignored when no `hue` variable is given.
        sns.scatterplot(x=x_col, y=y_col, data=df)
        plt.xticks(rotation=90)
        plt.show()
# Draw a line plot for every ordered pair of distinct continuous columns
# (each pair therefore appears twice, once per axis orientation).
# NOTE(review): `continuous` is not defined in the visible part of this file.
for x_col in continuous:
    for y_col in continuous:
        if x_col == y_col:
            continue
        plt.figure(figsize=(15, 6))
        # ci=None asks seaborn not to draw a confidence band around the line.
        sns.lineplot(data=df, x=x_col, y=y_col, ci=None, palette='hls')
        plt.xticks(rotation=90)
        plt.show()
# Pairwise correlation between the columns of the frame.
correlation_matrix = df.corr()
# Bare expression: displays the matrix when run as the last line of a
# notebook cell; has no effect in a plain script.
correlation_matrix
# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(20,10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()
# Collect every pair of columns whose absolute correlation exceeds the
# threshold. Only the lower triangle (j < i) is scanned, so each pair is
# recorded once and the diagonal is skipped.
threshold = 0.75
correlation_pairs = set()
corr_columns = correlation_matrix.columns
for i, colname_i in enumerate(corr_columns):
    for j, colname_j in enumerate(corr_columns[:i]):
        if abs(correlation_matrix.iloc[i, j]) > threshold:
            correlation_pairs.add((colname_i, colname_j))
# Recompute the correlation matrix and plot it as an annotated heatmap, then
# list the columns that remain in the frame.
# NOTE(review): the step that actually drops the highly correlated columns is
# not present in the visible part of this file — confirm it ran before this.
correlation_matrix_after_drop = df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix_after_drop, annot=True,
            cmap='coolwarm', fmt=".2f")
# The title literal was split across two lines, leaving an unterminated
# string; it is rejoined here into a single literal.
plt.title('Feature Correlation Matrix After Dropping Highly Correlated Features')
plt.show()
print("Remaining Features:")
print(df.columns)
Remaining Features:
Index(['time', 'trt', 'age', 'wtkg', 'hemo', 'homo', 'drugs',
'karnof',
'oprior', 'zprior', 'preanti', 'race', 'gender', 'symptom',
'offtrt',
'cd40', 'cd420', 'cd80'],
dtype='object')
# NOTE(review): the plotting code that these plt.show() calls were meant to
# display is missing from this export — restore it from the original notebook.
plt.show()
# Column names selected for a later scatter plot; not used by any code
# visible in this file.
scatter_features = ['age', 'cd40', 'wtkg']
plt.show()
Thanks !!!