index
November 14, 2023
1 Data Manipulation using Python Pandas
——Asad Mujeeb——
[ ]: import pandas as pd
[2]: # Load the student-performance dataset from a path relative to the notebook.
# The earlier machine-specific absolute path (C:\Users\...\Desktop\code) only
# resolved on one computer; keep StudentsPerformance.csv next to the notebook
# so that Restart Kernel -> Run All works anywhere.
df = pd.read_csv("StudentsPerformance.csv")
print(df.head())
gender race/ethnicity parental level of education lunch
0 female group B bachelor's degree standard \
1 female group C some college standard
2 female group B master's degree standard
3 male group A associate's degree free/reduced
4 male group C some college standard
test preparation course math score reading score writing score
0 none 72 72 74
1 completed 69 90 88
2 none 90 95 93
3 none 47 57 44
4 none 76 78 75
[ ]:
[4]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 gender 1000 non-null object
1 race/ethnicity 1000 non-null object
2 parental level of education 1000 non-null object
3 lunch 1000 non-null object
4 test preparation course 1000 non-null object
1
5 math score 1000 non-null int64
6 reading score 1000 non-null int64
7 writing score 1000 non-null int64
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
[5]: df.describe()
[5]: math score reading score writing score
count 1000.00000 1000.000000 1000.000000
mean 66.08900 69.169000 68.054000
std 15.16308 14.600192 15.195657
min 0.00000 17.000000 10.000000
25% 57.00000 59.000000 57.750000
50% 66.00000 70.000000 69.000000
75% 77.00000 79.000000 79.000000
max 100.00000 100.000000 100.000000
[6]: df.shape
[6]: (1000, 8)
[7]: df.columns
[7]: Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
'test preparation course', 'math score', 'reading score',
'writing score'],
dtype='object')
[8]: df.values
[8]: array([['female', 'group B', "bachelor's degree", …, 72, 72, 74],
['female', 'group C', 'some college', …, 69, 90, 88],
['female', 'group B', "master's degree", …, 90, 95, 93],
…,
['female', 'group C', 'high school', …, 59, 71, 65],
['female', 'group D', 'some college', …, 68, 78, 77],
['female', 'group D', 'some college', …, 77, 86, 86]],
dtype=object)
[10]: df.index
[10]: RangeIndex(start=0, stop=1000, step=1)
[15]: df.sort_values("lunch", ascending=False).head()
[15]: gender race/ethnicity parental level of education lunch
0 female group B bachelor's degree standard \
2
611 female group C some college standard
581 female group E some high school standard
583 female group D associate's degree standard
584 female group D some college standard
test preparation course math score reading score writing score
0 none 72 72 74
611 none 58 59 66
581 none 77 79 80
583 completed 73 75 80
584 none 69 77 77
[17]: df.sort_values(["lunch", "writing score"]).head()
[17]: gender race/ethnicity parental level of education lunch
59 female group C some high school free/reduced \
596 male group B high school free/reduced
327 male group A some college free/reduced
980 female group B high school free/reduced
211 male group C some college free/reduced
test preparation course math score reading score writing score
59 none 0 17 10
596 none 30 24 15
327 none 28 23 19
980 none 8 24 23
211 none 35 28 27
[18]: df.sort_values(["lunch","gender"], ascending= [True, False]).head()
[18]: gender race/ethnicity parental level of education lunch
3 male group A associate's degree free/reduced \
7 male group B some college free/reduced
8 male group D high school free/reduced
18 male group C master's degree free/reduced
24 male group D bachelor's degree free/reduced
test preparation course math score reading score writing score
3 none 47 57 44
7 none 40 43 39
8 completed 64 64 67
18 completed 46 42 46
24 completed 74 71 80
[20]: df["parental level of education"].head()
3
[20]: 0 bachelor's degree
1 some college
2 master's degree
3 associate's degree
4 some college
Name: parental level of education, dtype: object
[21]: df[["math score", "reading score"]].head()
[21]: math score reading score
0 72 72
1 69 90
2 90 95
3 47 57
4 76 78
[26]: cols_to_subset = ["math score", "reading score"]
df[cols_to_subset].head()
[26]: math score reading score
0 72 72
1 69 90
2 90 95
3 47 57
4 76 78
[27]: df["math score"] > 76
[27]: 0 False
1 False
2 True
3 False
4 False
…
995 True
996 False
997 False
998 False
999 True
Name: math score, Length: 1000, dtype: bool
[30]: df[df["reading score"] < 30].head()
[30]: gender race/ethnicity parental level of education lunch
59 female group C some high school free/reduced \
76 male group E some high school standard
211 male group C some college free/reduced
327 male group A some college free/reduced
4
596 male group B high school free/reduced
test preparation course math score reading score writing score
59 none 0 17 10
76 none 30 26 22
211 none 35 28 27
327 none 28 23 19
596 none 30 24 15
[33]: df[df["parental level of education"] == "high school"].head()
[33]: gender race/ethnicity parental level of education lunch
8 male group D high school free/reduced \
9 female group B high school free/reduced
12 female group B high school standard
16 male group C high school standard
20 male group D high school standard
test preparation course math score reading score writing score
8 completed 64 64 67
9 none 38 60 50
12 none 65 81 73
16 none 88 89 86
20 none 66 69 63
[35]: df[df["race/ethnicity"] < "group C"].head()
[35]: gender race/ethnicity parental level of education lunch
0 female group B bachelor's degree standard \
2 female group B master's degree standard
3 male group A associate's degree free/reduced
5 female group B associate's degree standard
6 female group B some college standard
test preparation course math score reading score writing score
0 none 72 72 74
2 none 90 95 93
3 none 47 57 44
5 none 71 83 78
6 completed 88 95 92
[46]: is_lunch = df["lunch"].isin(["standard"])
df[is_lunch].head()
[46]: gender race/ethnicity parental level of education lunch
0 male group B bachelor's degree standard \
1 male group C some college standard
5
2 male group B master's degree standard
4 male group C some college standard
5 male group B associate's degree standard
test preparation course math score reading score writing score
0 none 72 72 74
1 completed 69 90 88
2 none 90 95 93
4 none 76 78 75
5 none 71 83 78
[47]: df["percentage"] = (df["math score"] + df["reading score"] + df["writing␣
↪score"]) / 300
# NOTE: despite the column name, this stores a 0-1 fraction rather than a 0-100
# percentage: the three scores are summed and divided by 300, so a row scoring
# 72 + 72 + 74 is stored as 0.726667. Multiply by 100 when reporting a percent.
[48]: df.head()
[48]: gender race/ethnicity parental level of education lunch
0 male group B bachelor's degree standard \
1 male group C some college standard
2 male group B master's degree standard
3 male group A associate's degree free/reduced
4 male group C some college standard
test preparation course math score reading score writing score
0 none 72 72 74 \
1 completed 69 90 88
2 none 90 95 93
3 none 47 57 44
4 none 76 78 75
percentage
0 0.726667
1 0.823333
2 0.926667
3 0.493333
4 0.763333
[3]: df["reading score"].mean()
[3]: 69.169
[5]: df["reading score"].min()
[5]: 17
[6]: df["reading score"].max()
[6]: 100
6
[8]: def Func(column, q=0.3):
    """Return the ``q``-th quantile of ``column``.

    Parameters
    ----------
    column : pandas.Series
        Numeric column to summarise; this is what ``Series.agg`` /
        ``DataFrame.agg`` pass in when the function is used as an aggregator.
    q : float, default 0.3
        Quantile to compute, between 0 and 1. The default keeps the original
        behaviour (30th percentile) when the function is handed to ``agg``
        without extra arguments.

    Returns
    -------
    float
        The value below which a fraction ``q`` of the column's values fall.
    """
    return column.quantile(q)
df["reading score"].agg(Func)
[8]: 62.0
[9]: df[['reading score', "writing score"]].agg(Func)
[9]: reading score 62.0
writing score 60.0
dtype: float64
[10]: def func1(column, q=0.3):
    """Return the ``q``-th quantile of ``column`` (30th percentile by default).

    The function name is kept as-is because ``agg`` uses it as the row label
    of the aggregated result.

    Parameters
    ----------
    column : pandas.Series
        Numeric column supplied by ``agg``.
    q : float, default 0.3
        Quantile to compute, between 0 and 1.

    Returns
    -------
    float
        The requested quantile of ``column``.
    """
    return column.quantile(q)
def func2(column, q=0.4):
    """Return the ``q``-th quantile of ``column`` (40th percentile by default).

    The function name is kept as-is because ``agg`` uses it as the row label
    of the aggregated result.

    Parameters
    ----------
    column : pandas.Series
        Numeric column supplied by ``agg``.
    q : float, default 0.4
        Quantile to compute, between 0 and 1.

    Returns
    -------
    float
        The requested quantile of ``column``.
    """
    return column.quantile(q)
df["reading score"].agg([func1, func2])
[10]: func1 62.0
func2 66.0
Name: reading score, dtype: float64
[11]: # cumulative sum
df["math score"].cumsum()
[11]: 0 72
1 141
2 231
3 278
4 354
…
995 65823
996 65885
997 65944
998 66012
999 66089
Name: math score, Length: 1000, dtype: int64
[13]: df.drop_duplicates(subset =["gender", "race/ethnicity"])
[13]: gender race/ethnicity parental level of education lunch
0 female group B bachelor's degree standard \
1 female group C some college standard
3 male group A associate's degree free/reduced
4 male group C some college standard
7 male group B some college free/reduced
7
8 male group D high school free/reduced
14 female group A master's degree standard
29 female group D master's degree standard
32 female group E master's degree free/reduced
34 male group E some college standard
test preparation course math score reading score writing score
0 none 72 72 74
1 completed 69 90 88
3 none 47 57 44
4 none 76 78 75
7 none 40 43 39
8 completed 64 64 67
14 none 50 53 58
29 none 62 70 75
32 none 56 72 65
34 none 97 87 82
[14]: df["gender"].value_counts()
[14]: gender
female 518
male 482
Name: count, dtype: int64
[15]: df["gender"].value_counts(sort = True)
[15]: gender
female 518
male 482
Name: count, dtype: int64
[16]: df["reading score"].value_counts(normalize=True)
[16]: reading score
72 0.034
74 0.033
64 0.032
67 0.030
73 0.030
…
28 0.001
26 0.001
17 0.001
32 0.001
40 0.001
Name: proportion, Length: 72, dtype: float64
8
[25]: # Highest math score among "group B" students, selected with one .loc lookup
# (row mask + column label) instead of two chained [] selections.
df.loc[df["race/ethnicity"] == "group B", "math score"].max()
[25]: 97
[26]: df.groupby("gender")["math score"].mean()
[26]: gender
female 63.633205
male 68.728216
Name: math score, dtype: float64
[28]: # Min / max / total writing score per race/ethnicity group. Aggregations are
# named by string: passing the Python builtins min, max and sum to agg() is
# deprecated in pandas >= 2.1 (FutureWarning); the string names give the same
# result and column labels.
df.groupby("race/ethnicity")["writing score"].agg(["min", "max", "sum"])
[28]: min max sum
race/ethnicity
group A 19 97 5578
group B 15 96 12464
group C 10 100 21637
group D 32 100 18378
group E 22 100 9997
[29]: df.groupby(["gender", "race/ethnicity"])["math score"].mean()
[29]: gender race/ethnicity
female group A 58.527778
group B 61.403846
group C 62.033333
group D 65.248062
group E 70.811594
male group A 63.735849
group B 65.930233
group C 67.611511
group D 69.413534
group E 76.746479
Name: math score, dtype: float64
[5]: df.pivot_table(values = "math score", index = "race/ethnicity")
[5]: math score
race/ethnicity
group A 61.629213
group B 63.452632
group C 64.463950
group D 67.362595
group E 73.821429
[8]: import numpy as np
9
# Median reading score per race/ethnicity group. The aggregation is named by
# string: pandas >= 2.1 emits a FutureWarning when a NumPy callable such as
# np.median is passed as aggfunc, and "median" yields the same table.
df.pivot_table(values="reading score", index="race/ethnicity", aggfunc="median")
[8]: reading score
race/ethnicity
group A 64
group B 67
group C 71
group D 71
group E 74
[9]: # Mean and median writing score per race/ethnicity group. String aggregation
# names replace np.mean / np.median, which pandas >= 2.1 flags with a
# FutureWarning; the resulting column labels ("mean", "median") are unchanged.
df.pivot_table(
    values="writing score",
    index="race/ethnicity",
    aggfunc=["mean", "median"],
)
[9]: mean median
writing score writing score
race/ethnicity
group A 62.674157 62
group B 65.600000 67
group C 67.827586 68
group D 70.145038 72
group E 71.407143 72
[11]: # Median writing score with gender as rows and race/ethnicity as columns.
# "median" replaces np.median, which pandas >= 2.1 flags with a FutureWarning.
df.pivot_table(
    values="writing score",
    index="gender",
    columns="race/ethnicity",
    aggfunc="median",
)
[11]: race/ethnicity group A group B group C group D group E
gender
female 67.0 71.5 74.0 76.0 76.0
male 60.0 60.0 62.0 67.0 70.0
[13]: # Mean writing score by gender x race/ethnicity. margins=True appends the
# "All" row and column; fill_value=0 fills cells for combinations with no
# students. "mean" replaces np.mean, which pandas >= 2.1 flags with a
# FutureWarning.
df.pivot_table(
    values="writing score",
    index="gender",
    columns="race/ethnicity",
    fill_value=0,
    margins=True,
    aggfunc="mean",
)
[13]: race/ethnicity group A group B group C group D group E
gender
female 67.861111 70.048077 71.777778 75.023256 75.536232 \
male 59.150943 60.220930 62.712230 65.413534 67.394366
All 62.674157 65.600000 67.827586 70.145038 71.407143
race/ethnicity All
gender
female 72.467181
male 63.311203
All 68.054000
[ ]:
10