Student Performance Analysis
Student Performance Analysis
# Load data
# pandas is imported here because this cell appears before the notebook's main
# import cell further down; without it `pd` is undefined at this point.
import pandas as pd

df = pd.read_csv("StudentsPerformance.csv")
# Preview: first five rows, then the last ten rows
df.head()
df.tail(10)
990 completed 86 81 75
991 completed 65 82 78
992 none 55 76 76
993 none 62 72 74
994 none 63 63 62
995 completed 88 99 95
996 none 62 55 55
997 completed 59 71 65
998 completed 68 78 77
999 none 77 86 86
# Dataset dimensions as a (rows, columns) tuple
df.shape
(1000, 8)
# Data type of each column
df.dtypes
gender object
race/ethnicity object
parental level of education object
lunch object
test preparation course object
math score int64
reading score int64
writing score int64
dtype: object
# Summary statistics (count, mean, std, min, quartiles, max) for the math score
df['math score'].describe()
count 1000.00000
mean 66.08900
std 15.16308
min 0.00000
25% 57.00000
50% 66.00000
75% 77.00000
max 100.00000
Name: math score, dtype: float64
# For a text column, describe() reports the count, the number of unique values,
# the most frequent value (top) and how often it occurs (freq)
df['lunch'].describe()
count 1000
unique 2
top standard
freq 645
Name: lunch, dtype: object
# Summary statistics for the DataFrame (pandas summarises numeric columns by default)
df.describe()
# Position-based selection: the row at integer position 100
df.iloc[100]
gender male
race/ethnicity group B
parental level of education some college
lunch standard
test preparation course none
math score 79
reading score 67
writing score 67
Name: 100, dtype: object
# Label-based selection: all rows of the "lunch" column
df.loc[:,"lunch"]
0 standard
1 standard
2 standard
3 free/reduced
4 standard
...
995 standard
996 free/reduced
997 free/reduced
998 standard
999 free/reduced
Name: lunch, Length: 1000, dtype: object
# Bracket-style column selection, limited to the first 10 values
df["lunch"].head(10)
0 standard
1 standard
2 standard
3 free/reduced
4 standard
5 standard
6 standard
7 free/reduced
8 free/reduced
9 free/reduced
Name: lunch, dtype: object
# Boolean mask: keep only the rows whose math score is exactly 99
df[df["math score"]==99]
263 none 99 93 90
306 completed 99 87 81
# Interquartile range of the math score, used for Tukey's outlier rule:
# a value is an outlier when it lies more than 1.5 * IQR outside the quartiles.
Q1 = df['math score'].quantile(0.25)
Q3 = df['math score'].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
# Rows whose math score falls outside the [lower_fence, upper_fence] interval
math_outliers = df[~df['math score'].between(lower_fence, upper_fence)]
math_outliers
# Element-wise missing-value mask (True where a value is missing)
df.isnull()
# Number of missing values in each column
df.isnull().sum()
gender 0
race/ethnicity 0
parental level of education 0
test preparation course 0
math score 0
reading score 0
writing score 0
dtype: int64
# Number of non-missing values in each column
df.count()
gender 1000
race/ethnicity 1000
parental level of education 1000
test preparation course 1000
math score 1000
reading score 1000
writing score 1000
dtype: int64
34 none 97 87 82
104 completed 98 86 90
121 completed 91 89 92
171 none 94 88 78
233 none 92 87 78
263 none 99 93 90
286 completed 97 82 88
306 completed 99 87 81
469 none 91 74 76
501 completed 94 87 92
503 completed 95 89 92
521 none 91 86 84
539 completed 97 92 86
562 completed 96 90 92
571 none 91 96 92
612 completed 94 90 91
618 none 95 81 84
689 none 93 90 83
710 completed 93 84 90
717 completed 96 96 99
719 completed 91 73 80
736 none 92 79 84
779 completed 94 85 82
784 completed 91 81 79
815 completed 94 86 87
846 completed 91 85 85
855 none 97 97 96
864 none 97 93 91
919 completed 91 96 91
934 completed 98 87 90
950 none 94 73 71
979 none 91 95 94
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
# Silence every warning for the rest of the session
# NOTE: this also hides deprecation warnings raised by the plotting libraries
warnings.filterwarnings('ignore')
# Default figure size as [width, height] for plots that do not set their own
plt.rcParams["figure.figsize"] = [10, 5]
# Load data
df = pd.read_csv("StudentsPerformance.csv")
# Show the first five rows
df.head()
# Color palettes: preview each palette as a strip of swatches, one figure per palette
for palette_name, heading in (
    ("colorblind", "Color Palette: Colorblind"),
    ("Reds", "Color Palette: Reds"),
):
    sns.palplot(sns.color_palette(palette_name))
    plt.title(heading)
    plt.show()
# Distribution of writing scores: density histogram with a KDE curve on top.
# `sns.distplot` is deprecated; `sns.histplot` with `kde=True` and
# `stat="density"` is its documented replacement. `cut=3` keeps the KDE curve
# extending past the data range the way distplot drew it.
plt.figure(figsize=(8, 8))
sns.histplot(df['writing score'], kde=True, stat="density", kde_kws=dict(cut=3))
plt.title("Distribution of Writing Scores")
plt.show()
# Scatter of math score against writing score, coloured by gender
_, scatter_ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(
    data=df,
    x="math score",
    y="writing score",
    hue="gender",
    ax=scatter_ax,
)
scatter_ax.set_title("Math vs Writing Scores by Gender")
plt.show()
# 5. Bar Plot: Average Writing Score by Gender and Lunch Type
_, bar_ax = plt.subplots(figsize=(8, 8))
sns.barplot(data=df, x="gender", y="writing score", hue="lunch", ax=bar_ax)
bar_ax.set_title("Average Writing Score by Gender & Lunch")
plt.show()
# 6. Relplot – math vs reading
# relplot is figure-level and creates its own figure; with no faceting
# variables the returned grid exposes its single Axes as `.ax`.
grid = sns.relplot(
    data=df,
    kind="scatter",
    x="math score",
    y="reading score",
    hue="gender",
    style="gender",
)
grid.ax.set_title("Math vs Reading by Gender")
plt.show()
# 8. Lineplot – reading vs writing by gender
_, line_ax = plt.subplots(figsize=(7, 7))
sns.lineplot(
    data=df,
    x="reading score",
    y="writing score",
    hue="gender",
    ax=line_ax,
)
line_ax.set_title("Reading vs Writing by Gender (Lineplot)")
plt.show()
# 10. Barplot – math score by test prep and gender
_, prep_ax = plt.subplots(figsize=(7, 7))
sns.barplot(
    data=df,
    x="test preparation course",
    y="math score",
    hue="gender",
    ax=prep_ax,
)
prep_ax.set_title("Math Score by Test Prep and Gender")
plt.show()
# 11. Boxplot – reading score by parental education
_, edu_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=df,
    x="parental level of education",
    y="reading score",
    ax=edu_ax,
)
edu_ax.set_title("Reading Score by Parental Education")
# Tilt the category labels on the x axis by 45 degrees
plt.xticks(rotation=45)
plt.show()
# 12. Violin plot – writing score by lunch
_, violin_ax = plt.subplots(figsize=(6, 6))
sns.violinplot(data=df, x="lunch", y="writing score", ax=violin_ax)
violin_ax.set_title("Writing Score by Lunch Type")
plt.show()
# 13. Boxplot – writing score by gender
# No explicit figure is created here, so the default figure size applies;
# seaborn returns the Axes it drew on.
gender_ax = sns.boxplot(data=df, x="gender", y="writing score")
gender_ax.set_title("Writing Score by Gender")
plt.show()
# Boxplot of math score for each race/ethnicity group
_, group_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x="race/ethnicity", y="math score", ax=group_ax)
group_ax.set_title("Math Score by Race/Ethnicity Group")
plt.show()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
# ✂️ Train-test split
from sklearn.model_selection import train_test_split
# NOTE(review): X (features) and y (target) are not defined anywhere in the
# visible part of this file — confirm where they are built before this cell.
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# Keep a reference to the estimator and fit it on the training split: the
# evaluation step calls `model.predict`, so the instance must not be discarded.
model = LogisticRegression()
model.fit(X_train, y_train)
# ✅ Evaluate
from sklearn.metrics import accuracy_score, confusion_matrix
# Predict on the held-out split and report the share of correct predictions
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc * 100:.2f}%")
# Per-class breakdown: rows are true labels, columns are predicted labels
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
Accuracy: 93.50%
Confusion Matrix:
[[ 20 7]
[ 6 167]]