9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
import pandas as pd
import numpy as np
# Read the census CSV from the Colab session storage, then show (rows, columns).
census_csv = "/content/census (1).csv"
df323 = pd.read_csv(census_csv)
df323.shape
(48842, 15)
# Preview the first rows of the loaded frame.
df323.head()
educational- marital-
age workclass fnlwgt education occupation relationship
num status
Never- Machine-
0 25 Private 226802 11th 7 Own-child
married op-inspct
Married-
Farming-
1 38 Private 89814 HS-grad 9 civ- Husband
fishing
spouse
Married-
Assoc- Protective-
2 28 Local-gov 336951 12 civ- Husband
acdm serv
spouse
Married-
Some- Machine-
3 44 Private 160323 10 civ- Husband
college op-inspct
# Summary statistics for every column; include='all' also covers the non-numeric ones.
df323.describe(include='all')
educational- mar
age workclass fnlwgt education
num s
count 48842.000000 48842 4.884200e+04 48842 48842.000000
unique NaN 9 NaN 16 NaN
M
top NaN Private NaN HS-grad NaN
s
freq NaN 33906 NaN 15784 NaN
mean 38.643585 NaN 1.896641e+05 NaN 10.078089
std 13.710510 NaN 1.056040e+05 NaN 2.570973
min 17.000000 NaN 1.228500e+04 NaN 1.000000
25% 28.000000 NaN 1.175505e+05 NaN 9.000000
50% 37.000000 NaN 1.781445e+05 NaN 10.000000
Q1
# Q1: list the dtype pandas assigned to each column on load.
df323.dtypes
age int64
workclass object
fnlwgt int64
education object
educational-num int64
marital-status object
occupation object
relationship object
race object
gender object
capital-gain int64
capital-loss int64
hours-per-week int64
native-country object
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 1/9
9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
income object
dtype: object
Q2
# Convert the listed text columns to pandas categoricals, one column at a time.
cols = [
    'education', 'workclass', 'marital-status', 'occupation',
    'relationship', 'race', 'gender',
]
for col_name in cols:
    df323[col_name] = df323[col_name].astype('category')

# Hold 'educational-num' as float64 instead of int64.
df323['educational-num'] = df323['educational-num'].astype('float64')

# Show the dtypes after the conversions.
df323.dtypes
age int64
workclass category
fnlwgt int64
education category
educational-num float64
marital-status category
occupation category
relationship category
race category
gender category
capital-gain int64
capital-loss int64
hours-per-week int64
native-country object
income object
dtype: object
Q3
# Blank out capital-gain outliers (Tukey's 1.5 * IQR fences) as NaN.
#
# `upper` and `lower` are boolean masks aligned with the column, so the rows
# that get blanked are exactly the ones whose *value* lies outside the fences.
# (np.where(...) yields row positions; feeding those to Series.replace() as
# values to replace does not select the outlying rows.)
gain = df323['capital-gain']
Q1, Q3 = np.percentile(gain, [25, 75])
IQR = Q3 - Q1
upper = gain > (Q3 + 1.5 * IQR)
lower = gain < (Q1 - 1.5 * IQR)

# NOTE(review): if Q1 == Q3 for this column, IQR is 0 and every value that
# differs from the quartile value is treated as an outlier -- confirm that
# this is the intended rule for 'capital-gain'.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.x.
df323['capital-gain'] = gain.mask(upper | lower, np.nan)
Q4
# Build df3 as df323 without the 'fnlwgt' column; df323 itself is not modified.
df3 = df323.drop(columns=['fnlwgt'])
Q5
# Q5: dtypes of the frame left after dropping 'fnlwgt'.
df3.dtypes
age int64
workclass category
education category
educational-num float64
marital-status category
occupation category
relationship category
race category
gender category
capital-gain float64
capital-loss int64
hours-per-week int64
native-country object
income object
dtype: object
# Count the missing (NaN) entries in each column of df3.
df3.isnull().sum()
age 0
workclass 0
education 0
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 2/9
9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
educational-num 0
marital-status 0
occupation 0
relationship 0
race 0
gender 0
capital-gain 342
capital-loss 0
hours-per-week 0
native-country 0
income 0
dtype: int64
Q6
# Summary statistics for every column of df3, numeric and non-numeric.
df3.describe(include='all')
educational- marital-
age workclass education occupat
num status
count 48842.000000 48842 48842 48842.000000 48842 48
unique NaN 9 16 NaN 7
Married-
top NaN Private HS-grad NaN civ-
spec
spouse
freq NaN 33906 15784 NaN 22379 6
mean 38.643585 NaN NaN 10.078089 NaN
std 13.710510 NaN NaN 2.570973 NaN
min 17.000000 NaN NaN 1.000000 NaN
25% 28.000000 NaN NaN 9.000000 NaN
50% 37.000000 NaN NaN 10.000000 NaN
Q7
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Frequency table: number of rows in each income class.
# The table is built from df323, the frame this notebook loads; the earlier
# reference to `df388` pointed at a name that is not defined anywhere in the
# notebook as shown.
my_tab = pd.crosstab(index=df323["income"], columns="count")
my_tab
col_0 count
income
<=50K 37155
>50K 11687
def _mean_by_gender_income(value_col):
    """Return the mean of *value_col* for every (gender, income) pair as a flat frame."""
    keys = ['gender', 'income']
    return df323[keys + [value_col]].groupby(keys).mean().reset_index()


# One summary frame per measure: hours worked, capital gain, capital loss.
GC_DF = _mean_by_gender_income('hours-per-week')
GC_DF1 = _mean_by_gender_income('capital-gain')
GC_DF2 = _mean_by_gender_income('capital-loss')
Q8
# Bar chart of capital gain per gender, drawn from df3, with a descriptive title.
gain_ax = sns.barplot(data=df3, x='gender', y='capital-gain')
gain_ax.set_title('Average capital gain among males and females')
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 3/9
9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
Text(0.5, 1.0, 'Average capital gain among males and females')
# Mean weekly working hours for every (income, race) pair, flattened to columns.
race_keys = ['income', 'race']
hours_by_group = df323[race_keys + ['hours-per-week']].groupby(race_keys).mean()
work_DF = hours_by_group.reset_index()
work_DF
income race hours-per-week
0 <=50K Amer-Indian-Eskimo 39.816867
1 <=50K Asian-Pac-Islander 38.012613
2 <=50K Black 37.824958
3 <=50K Other 38.488764
4 <=50K White 38.994736
5 >50K Amer-Indian-Eskimo 43.709091
6 >50K Asian-Pac-Islander 44.965770
7 >50K Black 44.222615
8 >50K Other 44.280000
Q10
# Grouped bar chart: income class on the x axis, one bar per race, on a 13x7 figure.
fig1, ax1 = plt.subplots(figsize=(13, 7))
sns.barplot(data=work_DF, x='income', y='hours-per-week', hue='race', ax=ax1)
ax1.set_title("Average working hours across income levels and race")
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 4/9
9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
Text(0.5, 1.0, 'Average working hours across income levels and race')
Q11
from IPython.display import IFrame

# Embed an external HTML page in the notebook output.
# The address had a foreign host ('') spliced into it, the same
# corruption visible in this export's page-footer links; the scheme and host
# are restored here.
# NOTE(review): 'your-html-file-url.com' still looks like a template
# placeholder -- replace it with the real location of the HTML file.
url = 'https://your-html-file-url.com'
IFrame(url, width=700, height=500)
Q12
# Keep only the columns used for modelling: three numeric measures plus
# gender, income and race.
model_columns = [
    'hours-per-week', 'capital-gain', 'capital-loss',
    'gender', 'income', 'race',
]
df = df323[model_columns]
df
hours-per- capital- capital-
gender income race
week gain loss
0 40 0.0 0 Male <=50K Black
1 50 0.0 0 Male <=50K White
2 40 0.0 0 Male >50K White
3 40 7688.0 0 Male >50K Black
4 30 0.0 0 Female <=50K White
... ... ... ... ... ... ...
48837 38 0.0 0 Female <=50K White
48838 40 0.0 0 Male >50K White
48839 40 0.0 0 Female <=50K White
48840 20 0.0 0 Male <=50K White
48841 40 15024.0 0 Female >50K White
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 5/9
9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
Q13
# Indicator-encode 'gender'; drop_first removes the first level so one column remains.
gender_values = df['gender']
df_gender = pd.get_dummies(data=gender_values, drop_first=True)
df_gender.head()
Male
0 1
1 1
2 1
3 1
4 0
# Indicator-encode 'race'; drop_first removes the first level as the baseline.
race_values = df['race']
df_race = pd.get_dummies(data=race_values, drop_first=True)
df_race.head()
Asian-Pac-Islander Black Other White
0 0 1 0 0
1 0 0 0 1
2 0 0 0 1
3 0 1 0 0
4 0 0 0 1
# Indicator-encode 'income'; with the first level dropped, only the '>50K' column is left.
income_values = df['income']
df_income = pd.get_dummies(data=income_values, drop_first=True)
df_income.head()
>50K
0 0
1 0
2 1
3 1
4 0
Q14
# Assemble the modelling table: numeric measures side by side with the
# gender, race and income indicator columns.
numeric_part = df323[['hours-per-week', 'capital-gain', 'capital-loss']]
data_final = pd.concat(
    [numeric_part, df_gender, df_race, df_income],
    axis='columns',
)
data_final
hours- Asian-
capital- capital-
per- Male Pac- Black Other White >50
gain loss
week Islander
0 40 0.0 0 1 0 1 0 0
1 50 0.0 0 1 0 0 0 1
2 40 0.0 0 1 0 0 0 1
3 40 7688.0 0 1 0 1 0 0
4 30 0.0 0 0 0 0 0 1
... ... ... ... ... ... ... ... ...
48837 38 0.0 0 0 0 0 0 1
48838 40 0.0 0 1 0 0 0 1
48839 40 0.0 0 0 0 0 0 1
Q15
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 6/9
9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
# Q15: dtypes of the assembled modelling table.
data_final.dtypes
hours-per-week int64
capital-gain float64
capital-loss int64
Male uint8
Asian-Pac-Islander uint8
Black uint8
Other uint8
White uint8
>50K uint8
dtype: object
Q16
# Cast every indicator column to the platform integer type, column by column.
cols = ['Male', 'Asian-Pac-Islander', 'Black', 'Other', 'White', '>50K']
for indicator in cols:
    data_final[indicator] = data_final[indicator].astype('int')
data_final
hours- Asian-
capital- capital-
per- Male Pac- Black Other White >50
gain loss
week Islander
0 40 0.0 0 1 0 1 0 0
1 50 0.0 0 1 0 0 0 1
2 40 0.0 0 1 0 0 0 1
3 40 7688.0 0 1 0 1 0 0
4 30 0.0 0 0 0 0 0 1
... ... ... ... ... ... ... ... ...
48837 38 0.0 0 0 0 0 0 1
48838 40 0.0 0 1 0 0 0 1
48839 40 0.0 0 0 0 0 0 1
Q17
from pandas import read_csv
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
Q18
# Features are every column except the '>50K' indicator, which is the target.
x = data_final.drop('>50K', axis=1)
y = data_final['>50K']

# 80/20 split with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)

# 'capital-gain' holds NaN after the outlier step, and the estimators fitted
# later reject NaN input ("Input X contains NaN"). Fill the gaps with
# per-column medians computed on the training split only, so no information
# from the test rows is used.
train_medians = x_train.median()
x_train = x_train.fillna(train_medians)
x_test = x_test.fillna(train_medians)
Q19
# NOTE(review): these two lines appear again at the start of the cell that
# continues after the page break -- probably a print artifact, not a second cell.
score=[]
clf1=LogisticRegression()
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 7/9
9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
def _fit_and_score(model):
    """Fit *model* on the training split; return (test predictions, test accuracy)."""
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    return predictions, accuracy_score(y_test, predictions)


# NOTE(review): the recorded run of this cell stopped with
# "ValueError: Input X contains NaN" -- x_train / x_test must be NaN-free
# before this cell is executed.

# Test accuracy of each classifier, as a percentage, in the order fitted.
score = []

# Logistic regression.
clf1 = LogisticRegression()
pred1, s1 = _fit_and_score(clf1)
score.append(s1 * 100)
print(s1)

# k-nearest neighbours.
knn = KNeighborsClassifier()
pred2, s2 = _fit_and_score(knn)
score.append(s2 * 100)
print(s2)

# Decision tree.
dtc = DecisionTreeClassifier()
pred3, s3 = _fit_and_score(dtc)
score.append(s3 * 100)
print(s3)

# Linear discriminant analysis (fitted once; the back-to-back duplicate
# fit() call was redundant).
clf = LinearDiscriminantAnalysis()
pred4, s4 = _fit_and_score(clf)
score.append(s4 * 100)
print(s4)
0.7783805916675197
-------------------------------------------------------------------------
--
ValueError Traceback (most recent call
last)
<ipython-input-79-488fff80522e> in <cell line: 4>()
2
3 clf1=LogisticRegression()
----> 4 clf1.fit(x_train,y_train)
5 pred1=clf1.predict(x_test)
6 s1=accuracy_score(y_test,pred1)
4 frames
/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in
_assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name)
159 "#estimators-that-handle-nan-values"
160 )
--> 161 raise ValueError(msg_err)
162
163
ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN
natively. For supervised learning, you might want to consider
sklearn.ensemble.HistGradientBoostingClassifier and Regressor which
accept missing values encoded as NaNs natively. Alternatively, it is
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 8/9
9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 9/9