Cleaning Process Data Analysis and Visualisation Using Python
Cleaning Process Data Analysis and Visualisation Using Python
#reads the csv file, using the first row as column headers and the first column as the index
df = pd.read_csv(r"uncleaned_student_depression_dataset.csv",index_col=0,header = 0 )
[2]: print(df.shape)
(27902, 17)
[3]: #moves the 'id' index back into a regular column and creates a new default integer index
df.reset_index(inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27902 entries, 0 to 27901
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 27902 non-null int64
1 Gender 27902 non-null object
2 Age 27902 non-null int64
3 City 27902 non-null object
4 Profession 27902 non-null object
5 Academic Pressure 27902 non-null int64
6 Work Pressure 27902 non-null int64
7 CGPA 27902 non-null float64
8 Study Satisfaction 27902 non-null int64
9 Job Satisfaction 27902 non-null int64
10 Sleep Duration 27902 non-null object
11 Dietary Habits 27902 non-null object
12 Degree 27902 non-null object
13 Have you ever had suicidal thoughts ? 27902 non-null object
14 Work/Study Hours 27902 non-null int64
1
15 Financial Stress 27902 non-null object
16 Family History of Mental Illness 27902 non-null object
17 Depression 27902 non-null int64
dtypes: float64(1), int64(8), object(9)
memory usage: 3.8+ MB
2
4 5 Yes 1
[5]: df.describe()
Depression
count 27902.000000
mean 0.585514
std 0.492642
min 0.000000
25% 0.000000
50% 1.000000
75% 1.000000
max 1.000000
[6]: df['Gender'].nunique()
[6]: 2
[7]: df['Gender'].unique()
[8]: df['Gender'].value_counts()
[8]: Gender
Male 15548
Female 12354
Name: count, dtype: int64
3
[9]: df['Age'].nunique()
[9]: 34
[10]: df['Age'].unique()
[10]: array([19, 33, 24, 31, 28, 25, 29, 30, 27, 20, 23, 18, 21, 22, 34, 32, 26,
39, 35, 42, 36, 58, 49, 38, 51, 44, 43, 46, 59, 54, 48, 56, 37, 41])
[11]: df['Age'].value_counts()
[11]: Age
24 2258
20 2237
28 2133
29 1950
33 1893
25 1784
21 1726
23 1645
18 1587
19 1561
34 1468
27 1462
31 1427
32 1262
22 1160
26 1155
30 1145
35 10
38 8
36 7
42 4
48 3
39 3
43 2
46 2
37 2
49 1
51 1
44 1
59 1
54 1
58 1
56 1
41 1
Name: count, dtype: int64
4
[12]: df['City'].nunique()
[12]: 51
[13]: df['City'].unique()
[14]: df['City'].value_counts()
[14]: City
Kalyan 1570
Srinagar 1372
Hyderabad 1340
Vasai-Virar 1290
Lucknow 1155
Thane 1139
Ludhiana 1111
Agra 1094
Surat 1078
Kolkata 1066
Jaipur 1036
Patna 1007
Visakhapatnam 969
Pune 968
Ahmedabad 951
Bhopal 934
Chennai 885
Meerut 825
Rajkot 816
Delhi 770
Bangalore 767
Ghaziabad 745
Mumbai 699
Vadodara 694
Varanasi 685
Nagpur 651
Indore 643
Kanpur 609
5
Nashik 547
Faridabad 461
Harsha 2
Saanvi 2
Bhavna 2
City 2
ME 1
M.Com 1
Nalyan 1
Nandini 1
Mihir 1
Nalini 1
Kibara 1
Rashi 1
'Less than 5 Kalyan' 1
Reyansh 1
Harsh 1
Gaurav 1
Vaanya 1
Mira 1
3 1
M.Tech 1
Khaziabad 1
Name: count, dtype: int64
[16]: df['Profession'].nunique()
[16]: 14
[17]: df['Profession'].unique()
6
[18]: df['Profession'].value_counts()
[18]: Profession
Student 27847
Architect 8
Teacher 6
'Digital Marketer' 3
'Content Writer' 2
Chef 2
Doctor 2
Pharmacist 2
'Civil Engineer' 1
'UX/UI Designer' 1
'Educational Consultant' 1
Manager 1
Lawyer 1
Entrepreneur 1
Name: count, dtype: int64
[19]: 6
[22]: 3
7
[24]: Work Pressure
0 27875
5 2
2 1
Name: count, dtype: int64
[25]: df['CGPA'].nunique()
[25]: 332
[26]: df['CGPA'].unique()
8
6.29 , 5.25 , 9.69 , 9.9 , 6.39 , 8.09 , 5.83 ,
5.47 , 6.56 , 8.71 , 9.94 , 6.69 , 5.52 , 7.3 ,
7.02 , 6.33 , 8.07 , 8.37 , 8. , 7.79 , 8.65 ,
6.28 , 7.35 , 8.69 , 7.12 , 7.32 , 7.13 , 5.97 ,
5.09 , 6.91 , 6.76 , 6.52 , 7.45 , 8.56 , 6.5 ,
8.63 , 8.27 , 8.49 , 6.59 , 9.29 , 5.3 , 7.06 ,
5.38 , 6.65 , 9.16 , 8.01 , 8.25 , 8.02 , 8.47 ,
7.34 , 8.88 , 7.14 , 8.42 , 5.17 , 9.1 , 7.49 ,
9.85 , 7.42 , 9.31 , 6.35 , 7. , 5.39 , 5.61 ,
9.78 , 9.25 , 5.69 , 9.47 , 8.16 , 7.23 , 6.46 ,
0. , 8.26 , 6.32 , 6.77 , 8.85 , 5.03 , 7.65 ,
5.78 , 6.24 , 5.35 , 6.06 , 7.78 , 6.64 , 7.0625,
6.98 , 6.44 , 6.09 ])
[27]: df['CGPA'].value_counts()
[27]: CGPA
8.04 821
9.96 425
5.74 410
8.95 370
9.21 342
…
7.65 1
6.77 1
8.26 1
7.23 1
6.09 1
Name: count, Length: 332, dtype: int64
[28]: 6
9
[31]: df['Job Satisfaction'].nunique()
[31]: 5
[34]: 6
[35]: array(["'6-7 hours'", "'5-6 hours'", "'Less than 5 hours'", "'7-8 hours'",
"'More than 8 hours'", 'Others'], dtype=object)
[36]: #drops every row whose 'Sleep Duration' value is 'Others', as it cannot be mapped to an actual duration
df.drop( df[df['Sleep Duration'] == 'Others'].index, inplace=True)
df['Sleep Duration'].value_counts()
[37]: 4
10
[39]: df['Dietary Habits'].value_counts()
[40]: df['Degree'].nunique()
[40]: 28
[41]: df['Degree'].unique()
[42]: df['Degree'].value_counts()
[42]: Degree
'Class 12' 6075
B.Ed 1863
B.Com 1505
B.Arch 1476
BCA 1431
MSc 1187
B.Tech 1152
MCA 1043
M.Tech 1019
BHM 924
BSc 887
M.Ed 818
B.Pharm 810
M.Com 734
MBBS 696
BBA 696
LLB 670
BE 610
BA 595
M.Pharm 582
MD 571
MBA 560
MA 544
PhD 520
LLM 481
11
MHM 191
ME 185
Others 35
Name: count, dtype: int64
[43]: 2
[46]: 13
12
[49]: 6
[52]: #drops every row whose 'Financial Stress' value is '?', as the real value cannot be recovered
df.drop(df[df['Financial Stress'] == '?'].index, inplace = True)
#changes the data type of 'Financial Stress' column from 'object' to 'int'
df['Financial Stress'] = df['Financial Stress'].astype(int)
[53]: dtype('int64')
[54]: 2
[57]: df['Depression'].nunique()
[57]: 2
[58]: df['Depression'].unique()
13
[58]: array([1, 0])
[59]: df['Depression'].value_counts()
[59]: Depression
1 16313
0 11544
Name: count, dtype: int64
[60]: #changes the data type of 'Depression' column from 'int' to 'bool'
df['Depression'] = df['Depression'].astype(bool)
[61]: df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 27857 entries, 0 to 27901
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 27857 non-null int64
1 Gender 27857 non-null object
2 Age 27857 non-null int64
3 City 27857 non-null object
4 Profession 27857 non-null object
5 Academic Pressure 27857 non-null int64
6 Work Pressure 27857 non-null int64
7 CGPA 27857 non-null float64
8 Study Satisfaction 27857 non-null int64
9 Job Satisfaction 27857 non-null int64
10 Sleep Duration 27857 non-null object
11 Dietary Habits 27857 non-null object
12 Degree 27857 non-null object
13 Have you ever had suicidal thoughts ? 27857 non-null object
14 Work/Study Hours 27857 non-null int64
15 Financial Stress 27857 non-null int64
16 Family History of Mental Illness 27857 non-null object
17 Depression 27857 non-null bool
dtypes: bool(1), float64(1), int64(8), object(8)
memory usage: 3.9+ MB
[62]: df.head()
14
4 30 Female 28 Varanasi Student 3
[ ]: #exports the cleaned dataset as a csv file with headers and the index column
df.to_csv('cleaned_student_depression_dataset.csv')
15