Analysis Process: Data Analysis and Visualisation Using Python
#---------------------------------------------------------------------------------------------
# Load the cleaned student-depression dataset. The first CSV column becomes the
# row index and the first row supplies the column names.
df = pd.read_csv(
    r"cleaned_student_depression_dataset.csv",
    index_col=0,
    header=0,
)
[37]: df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 27857 entries, 0 to 27901
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 27857 non-null int64
1 Gender 27857 non-null object
2 Age 27857 non-null int64
3 City 27857 non-null object
4 Profession 27857 non-null object
5 Academic Pressure 27857 non-null int64
6 Work Pressure 27857 non-null int64
7 CGPA 27857 non-null float64
8 Study Satisfaction 27857 non-null int64
9 Job Satisfaction 27857 non-null int64
10 Sleep Duration 27857 non-null object
11 Dietary Habits 27857 non-null object
12 Degree 27857 non-null object
13 Have you ever had suicidal thoughts ? 27857 non-null object
14 Work/Study Hours 27857 non-null int64
15 Financial Stress 27857 non-null int64
1
16 Family History of Mental Illness 27857 non-null object
17 Depression 27857 non-null bool
dtypes: bool(1), float64(1), int64(8), object(8)
memory usage: 3.9+ MB
[38]: df.head()
2
[39]: id Age Academic Pressure Work Pressure \
count 27857.000000 27857.000000 27857.000000 27857.000000
mean 70443.316725 25.820835 3.141580 0.000431
std 40648.631003 4.906158 1.381802 0.044027
min 1.000000 18.000000 0.000000 0.000000
25% 35039.000000 21.000000 2.000000 0.000000
50% 70694.000000 25.000000 3.000000 0.000000
75% 105827.000000 30.000000 4.000000 0.000000
max 140699.000000 59.000000 5.000000 5.000000
Financial Stress
count 27857.000000
mean 3.140467
std 1.437145
min 1.000000
25% 2.000000
50% 3.000000
75% 4.000000
max 5.000000
# Cell [40] - Inference 1: depressed vs. non-depressed individuals (bar + pie).
#---------------------------------------------------------------------------------------------
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))
#---------------------------------------------------------------------------------------------
# value_counts() orders categories by frequency, not by value, so labels and
# colours are derived from the boolean category instead of being assumed by
# position. This keeps 'Depressed' attached to the True count whichever class
# happens to be larger.
depression_counts = df['Depression'].value_counts()
labels1 = ['Depressed' if flag else 'Non-depressed' for flag in depression_counts.index]
colors = ['#e87d5d' if flag else '#62d997' for flag in depression_counts.index]
#---------------------------------------------------------------------------------------------
axes[0].bar(labels1, depression_counts, width=0.4, color=colors)
axes[0].set_xticks(labels1, labels1, rotation=0, ha='center')
axes[0].tick_params(axis='x', labelsize=10)
axes[0].set_title('Distribution of depression\n in individuals', size=15)
axes[0].set_ylabel('Individuals (count)', size=12)
#---------------------------------------------------------------------------------------------
# One offset per wedge, so the pie still renders if only one category is present.
explode = (0.05,) * len(depression_counts)
axes[1].pie(depression_counts, labels=labels1,
            autopct='%1.0f%%', colors=colors, explode=explode,
            shadow=True, startangle=30)
axes[1].set_title('Distribution of depression\n in individuals', size=15)
plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1, wspace=0.1, hspace=0.4)
#---------------------------------------------------------------------------------------------
plt.show()
[41]: # Inference 2
#---------------------------------------------------------------------------------------------
fig,axes = plt.subplots(nrows = 1, ncols = 2, figsize=(10, 6))
colors = ['#62add9','#d57bdb']
#---------------------------------------------------------------------------------------------
labels1 = df[df['Depression'] == True]['Gender'].value_counts().index
axes[0].bar(labels1, df[df['Depression'] == True]['Gender'].value_counts(),␣
↪width=0.4, color = colors)
axes[0].set_xticks(labels1,labels1,
rotation=0, ha='center')
axes[0].tick_params(axis='x', labelsize=10)
axes[0].set_title('Distribution of gender\n in depressed individuals', size =␣
↪15)
4
axes[1].pie(df[df['Depression'] == True]['Gender'].value_counts(),␣
↪labels=labels1,
#---------------------------------------------------------------------------------------------
plt.show()
[42]: # Inference 3
#---------------------------------------------------------------------------------------------
fig,axes = plt.subplots(nrows = 1, ncols = 2, figsize=(10, 6))
colors = ['#0390fc','#f032b7']
#---------------------------------------------------------------------------------------------
labels1 = df[df['Family History of Mental Illness'] == 'Yes']['Depression'].
↪value_counts().index
5
axes[0].tick_params(axis='x', labelsize=10)
axes[0].set_title('Distribution of individuals whose\n family had history of␣
↪mental illness', size = 15)
#---------------------------------------------------------------------------------------------
plt.show()
[43]: # Inference 4
#---------------------------------------------------------------------------------------------
fig,axes = plt.subplots(nrows = 2, ncols = 2, figsize=(15, 10))
colors = (0.2,0.4,0.2,0.6)
#---------------------------------------------------------------------------------------------
labels1 = df['Sleep Duration'].value_counts().index
6
axes[0,0].bar(labels1, df['Sleep Duration'].value_counts(), width=0.4, color =␣
↪colors)
explode = explode )
axes[1,1].set_title('Sleep duration of all individuals',size = 15)
plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1, wspace=0.17,␣
↪hspace=0.53)
#---------------------------------------------------------------------------------------------
plt.show()
7
[44]: # Inference 5
#---------------------------------------------------------------------------------------------
fig,axes = plt.subplots(nrows = 2, ncols = 2, figsize=(15, 10))
colors = (0.2,0.4,0.2,0.6)
#---------------------------------------------------------------------------------------------
labels1 = df['Dietary Habits'].value_counts().index
axes[0,0].bar(labels1, df['Dietary Habits'].value_counts(), width=0.4,color =␣
↪colors)
8
axes[0,1].set_ylabel('Individuals (count)', size = 12)
axes[0,1].set_xlabel('Dietary Habits', size = 12)
#---------------------------------------------------------------------------------------------
labels3 = df[df['Depression'] == False]['Dietary Habits'].value_counts().index
axes[1,0].bar(labels3, df[df['Depression'] == False]['Dietary Habits'].
↪value_counts(), width=0.4, color = colors)
explode = explode)
axes[1,1].set_title('Dietary habits of all individuals', size = 15)
#---------------------------------------------------------------------------------------------
plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1, wspace=0.17,␣
↪hspace=0.4)
plt.show()
9
[45]: # Inference 6
#---------------------------------------------------------------------------------------------
labels = ['age of all \nindividuals', 'age of depressed \nindividuals',
'age of non-depressed\n individuals']
colors = ['#995757', '#b0ae54', '#4bad6f']
axes.tick_params(axis='x', labelrotation=25)
axes.set_yticks(range(16,64,2))
axes.set_yticklabels(range(16,64,2))
plt.show()
10
[46]: # Inference 7
#---------------------------------------------------------------------------------------------
labels = ['Academic Pressure \nof all individuals', 'Academic Pressure of␣
↪\ndepressed individuals',
11
fig, axes = plt.subplots(nrows=1,ncols=1,figsize=(5, 7))
axes.set_title('Academic pressure of individuals in different categories', size␣
↪= 15)
patch_artist=True,
tick_labels=labels)
axes.tick_params(axis='x', labelrotation=25)
axes.set_yticks(np.arange(0,5.5,0.5))
axes.set_yticklabels(np.arange(0,5.5,0.5))
plt.show()
12
[47]: # Inference 8
#---------------------------------------------------------------------------------------------
labels = ['CGPA \nof all individuals', 'CGPA of \ndepressed individuals',
'CGPA of \nnon-depressed individuals']
colors = ['peachpuff', '#32a8a2', '#32a852']
13
axes.set_ylabel('CGPA', size = 12)
bplot = axes.boxplot([df['CGPA'],df[df['Depression'] == True]['CGPA'],
df[df['Depression'] == False]['CGPA']], widths=0.80,
patch_artist=True,
tick_labels=labels)
axes.tick_params(axis='x', labelrotation=25)
axes.set_yticks(np.arange(0,10.5,0.5))
axes.set_yticklabels(np.arange(0,10.5,0.5))
plt.show()
14
# Cell [48] - Inference 9: box plot of work/study hours for every degree.
#---------------------------------------------------------------------------------------------
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(20, 10))
labels = df['Degree'].unique()
# One Series of hours per degree, in the same order as `labels`.
collection = [df.loc[df['Degree'] == degree, 'Work/Study Hours'] for degree in labels]
bplot = axes.boxplot(collection, widths=0.80, patch_artist=True, tick_labels=labels)
# Positions within `labels` of the boxes that get the accent colour.
to_highlight = [1, 3, 6, 10, 17, 27]
for position in to_highlight:
    bplot['boxes'][position].set_facecolor('#cf9d7c')
axes.tick_params(axis='x', labelrotation=90)
axes.set_title('Study hours of different courses', size=15)
axes.set_xlabel('Degrees', size=12)
axes.set_ylabel('Study hours', size=12)
hour_ticks = np.arange(0, 14, 1)
axes.set_yticks(hour_ticks)
axes.set_yticklabels(hour_ticks)
plt.show()
[49]: # Inference 10
#---------------------------------------------------------------------------------------------
fig, axes = plt.subplots(nrows=1,ncols=1,figsize=(20, 10))
labels = df['City'].unique()
collection = []
16
for x in df['City'].unique() :
collection.append(df[df['City']==x]['Academic Pressure'])
to_highlight = [0,4,8,10,13,14,17,22,24,28]
for x in to_highlight:
bplot['boxes'][x].set_facecolor('#cf9d7c')
axes.tick_params(axis='x', labelrotation=90)
axes.set_title('Academic pressure of individuals in different cities', size =␣
↪15)
plt.show()
17
[50]: # Inference 11
#---------------------------------------------------------------------------------------------
labels = ['Male','Female']
colors = ['#707ccc', '#cc708d']
patch_artist=True,
tick_labels=labels)
axes[0].tick_params(axis='x', labelrotation=25)
axes[0].set_yticks(np.arange(0,5.5,0.5))
axes[0].set_yticklabels(np.arange(0,5.5,0.5))
axes[0].set_aspect(0.5)
#---------------------------------------------------------------------------------------------
axes[1].set_title('Academic pressure of depressed \nindividuals of both␣
↪genders', size = 15)
df[df['Depression']==True][df['Gender'] ==␣
↪'Female']['Academic Pressure']], widths=0.5,
patch_artist=True,
tick_labels=labels)
axes[1].tick_params(axis='x', labelrotation=25)
axes[1].set_yticks(np.arange(0,5.5,0.5))
18
axes[1].set_yticklabels(np.arange(0,5.5,0.5))
axes[1].set_aspect(0.5)
#---------------------------------------------------------------------------------------------
axes[2].set_title('Academic pressure of non-depressed\n individuals of both␣
↪genders', size = 15)
df[df['Depression']==False][df['Gender'] ==␣
↪'Female']['Academic Pressure']], widths=0.5,
patch_artist=True,
tick_labels=labels)
axes[2].tick_params(axis='x', labelrotation=25)
axes[2].set_yticks(np.arange(0,5.5,0.5))
axes[2].set_yticklabels(np.arange(0,5.5,0.5))
axes[2].set_aspect(0.5)
#---------------------------------------------------------------------------------------------
plt.show()
19
[51]: # Inference 12
#---------------------------------------------------------------------------------------------
labels = ['Male','Female']
colors = ['#707ccc', '#cc708d']
axes[0].set_yticks(np.arange(0,10.5,0.5))
axes[0].set_yticklabels(np.arange(0,10.5,0.5))
axes[0].set_aspect(0.35)
#---------------------------------------------------------------------------------------------
axes[1].set_title('CGPA of depressed\n individuals of both genders', size = 15)
axes[1].set_xlabel('Gender', size = 12)
axes[1].set_ylabel('CGPA', size = 12)
bplot1 = axes[1].boxplot([df[df['Depression']==True][df['Gender'] ==␣
↪'Male']['CGPA'],
df[df['Depression']==True][df['Gender'] ==␣
↪'Female']['CGPA']], widths=0.5,
patch_artist=True,
tick_labels=labels)
axes[1].set_yticks(np.arange(0,10.5,0.5))
axes[1].set_yticklabels(np.arange(0,10.5,0.5))
axes[1].set_aspect(0.35)
#---------------------------------------------------------------------------------------------
axes[2].set_title('CGPA of non-depressed\n individuals of both genders', size =␣
↪15)
20
axes[2].set_xlabel('Gender', size = 12)
axes[2].set_ylabel('CGPA', size = 12)
bplot2 = axes[2].boxplot([df[df['Depression']==False][df['Gender'] ==␣
↪'Male']['CGPA'],
df[df['Depression']==False][df['Gender'] ==␣
↪'Female']['CGPA']], widths=0.5,
patch_artist=True,
tick_labels=labels)
axes[2].set_yticks(np.arange(0,10.5,0.5))
axes[2].set_yticklabels(np.arange(0,10.5,0.5))
axes[2].set_aspect(0.35)
#---------------------------------------------------------------------------------------------
plt.show()
[52]: # Inference 13
#---------------------------------------------------------------------------------------------
fig, axes = plt.subplots(nrows=1,ncols=1,figsize=(6,7.5))
labels = df['Sleep Duration'].unique()
21
collection = []
colors = ['#48db5e','#0390fc','#d93261','#07f57e','#17e3d5']
for x in df['Sleep Duration'].unique() :
collection.append(df[df['Sleep Duration']==x]['Age'])
axes.tick_params(axis='x', labelrotation=45)
axes.set_title('How age affects sleeping hours', size = 15)
axes.set_xlabel('Sleeping hours', size = 12)
axes.set_ylabel('Age', size = 12)
axes.set_yticks(np.arange(14,60,2))
axes.set_yticklabels(np.arange(14,60,2))
plt.show()
22
[53]: # Inference 14
#---------------------------------------------------------------------------------------------
fig,axes = plt.subplots(nrows = 1, ncols = 2, figsize=(10, 6))
23
#---------------------------------------------------------------------------------------------
labels1 = df['Have you ever had suicidal thoughts ?'].value_counts().index
colors = ['#e87d5d','#62add9']
rotation=0, ha='center')
# Cosmetics for the first panel: tick label size, title and y-axis label.
axes[0].tick_params(axis='x', labelsize=10)
axes[0].set_title('Individuals with/without\n suicidal thoughts', size = 15)
axes[0].set_ylabel('Individuals (count)',size = 12)
#---------------------------------------------------------------------------------------------
explode = (0.05,0.05)
axes[1].pie(df['Have you ever had suicidal thoughts ?'].value_counts(),␣
↪autopct='%1.3f%%', shadow=True, startangle = 30,
#---------------------------------------------------------------------------------------------
plt.show()
24
[54]: # Inference 15
#---------------------------------------------------------------------------------------------
fig, axes = plt.subplots(nrows=1,ncols=1,figsize=(6,7.5))
labels = sorted(df['Financial Stress'].unique())
collection = []
colors = ['#48db5e','#0390fc','#b4db48','#07f57e','#17e3d5']
plt.show()
25
[55]: # Inference 16
#---------------------------------------------------------------------------------------------
fig, axes = plt.subplots(nrows=1,ncols=1,figsize=(6,4))
26
p = np.poly1d(z)
plt.show()
[56]: # Inference 17
#---------------------------------------------------------------------------------------------
fig, axes = plt.subplots(nrows=1,ncols=1,figsize=(6,4))
27
axes.scatter(x,y,s =170, c = '#42424270')
axes.plot(x,p(x),'r--')
axes.set_xticks(np.arange(0,13,1))
axes.set_yticks(np.arange(0,10.5,1))
axes.set_title('Relation between work/study hours and CGPA', size = 15)
axes.set_ylabel('CGPA', size = 12)
axes.set_xlabel('Work/study hours', size = 12)
plt.show()
# Cell [57] - Inference 18: seaborn pair plot over the numeric columns.
#---------------------------------------------------------------------------------------------
pairplot_columns = [
    'Age',
    'Academic Pressure',
    'CGPA',
    'Study Satisfaction',
    'Work/Study Hours',
    'Financial Stress',
]
g = sns.pairplot(df[pairplot_columns])
28
# Cell [58] - Inference 19: kernel density estimate of the Academic Pressure column.
#---------------------------------------------------------------------------------------------
ax = sns.kdeplot(data=df, x='Academic Pressure')
ax.set_title('Density distribution for Academic Pressure')
ax.set_xlabel('Academic Pressure (on a scale of 0 to 5)')
plt.show()
29
[59]: df[df['Academic Pressure'] < 3]['Depression'].value_counts()
[59]: Depression
False 6475
True 2497
Name: count, dtype: int64
[60]: Depression
True 9335
False 2103
Name: count, dtype: int64
# Cell [61] - Inference 20: kernel density estimate of the Study Satisfaction column.
#---------------------------------------------------------------------------------------------
ax = sns.kdeplot(data=df, x='Study Satisfaction')
ax.set_title('Density distribution for Study Satisfaction')
ax.set_xlabel('Study Satisfaction (on a scale of 0 to 5)')
plt.show()
30
[62]: df[df['Study Satisfaction'] < 4]['Depression'].value_counts()
[62]: Depression
True 10969
False 6123
Name: count, dtype: int64
[63]: Depression
False 5421
True 5344
Name: count, dtype: int64
# Cell [64] - Inference 21: kernel density estimate of the Financial Stress column.
#---------------------------------------------------------------------------------------------
sns.kdeplot(data=df, x='Financial Stress')
plt.title('Density distribution for Financial Stress')
# The describe() summary of this column reports min 1 and max 5, so the axis
# label states a 1-5 scale rather than 0-5.
plt.xlabel('Financial Stress (on a scale of 1 to 5)')
plt.show()
31
[65]: df[df['Financial Stress'] > 4]['Depression'].value_counts()
[65]: Depression
True 5448
False 1257
Name: count, dtype: int64
# Cell [66] - Inference 22: descriptive statistics for the numeric columns.
#---------------------------------------------------------------------------------------------
matrix = df[['Age', 'Academic Pressure', 'CGPA', 'Study Satisfaction',
             'Work/Study Hours', 'Financial Stress']]
stat_columns = ['mean', 'mode', 'median', 'standard deviation',
                'confidence interval at 95%', 'standard error']
rows = []
for x in matrix.columns:
    column = matrix[x]
    mean = column.mean()
    standard_error = column.sem()
    # Half-width of the two-sided 95% t interval around the mean. count()
    # ignores missing values, matching what sem() uses.
    interval = standard_error * stats.t.ppf((1 + 0.95) / 2, column.count() - 1)
    rows.append({
        'mean': mean,
        # mode() can return several values; the first one is reported.
        'mode': column.mode()[0],
        'median': column.median(),
        'standard deviation': column.std(),
        # Stored as (lower, upper) bounds so the column is actually populated;
        # previously the half-width was computed and then discarded.
        'confidence interval at 95%': (mean - interval, mean + interval),
        'standard error': standard_error,
    })
values = pd.DataFrame(rows, index=matrix.columns, columns=stat_columns)
# Display the whole table: head() defaults to five rows and would hide the
# sixth column's statistics.
values
# Cell [67] - Inference 23: correlation coefficients of the numeric columns,
# written as text at their (column, row) grid positions.
# NOTE(review): no image/heatmap call is visible in this cell; confirm whether
# one was lost when the notebook was exported.
#---------------------------------------------------------------------------------------------
corr_columns = ['Age', 'Academic Pressure', 'CGPA', 'Study Satisfaction',
                'Work/Study Hours', 'Financial Stress']
corr_matrix = df[corr_columns].corr()
tick_marks = np.arange(len(corr_matrix.columns))
plt.xticks(tick_marks, corr_matrix.columns, rotation=80)
plt.yticks(tick_marks, corr_matrix.index)
# Walk the matrix row by row; x is the column position and y the row position.
for (i, j), coefficient in np.ndenumerate(corr_matrix.to_numpy()):
    plt.text(j, i, round(coefficient, 3), ha="center", va="center", color="black")
plt.show()
33
[68]: #for summary -- refer thesis/documentation
34