Samana Tatheer-Assign 7-20U00323.Ipynb - Colaboratory
Samana Tatheer-Assign 7-20U00323.Ipynb - Colaboratory
ipynb - Colaboratory
import pandas as pd
import numpy as np
# Dimensions of df323 as a (rows, columns) tuple.
df323.shape
(48842, 15)
# Preview the first rows of df323 to eyeball column names and values.
df323.head()
educational- marital-
age workclass fnlwgt education occupation relationship
num status
Never- Machine-
0 25 Private 226802 11th 7 Own-child
married op-inspct
Married-
Farming-
1 38 Private 89814 HS-grad 9 civ- Husband
fishing
spouse
Married-
Assoc- Protective-
2 28 Local-gov 336951 12 civ- Husband
acdm serv
spouse
Married-
Some- Machine-
3 44 Private 160323 10 civ- Husband
ll i t
# Summary statistics for every column; include='all' covers the
# non-numeric columns as well as the numeric ones.
df323.describe(include='all')
educational- mar
age workclass fnlwgt education
num s
M
top NaN Private NaN HS-grad NaN
s
Q1
# Inspect the dtype pandas assigned to each column of df323.
df323.dtypes
age int64
workclass object
fnlwgt int64
education object
educational-num int64
marital-status object
occupation object
relationship object
race object
gender object
capital-gain int64
capital-loss int64
hours-per-week int64
native-country object
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 1/9
9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
income object
dtype: object
Q2
# Q2: convert the listed string columns to pandas categoricals and make
# educational-num a float column, then re-check the dtypes.
cols = [
    'education',
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
]
for col in cols:
    df323[col] = df323[col].astype('category')
df323['educational-num'] = df323['educational-num'].astype('float64')
df323.dtypes
age int64
workclass category
fnlwgt int64
education category
educational-num float64
marital-status category
occupation category
relationship category
race category
gender category
capital-gain int64
capital-loss int64
hours-per-week int64
native-country object
income object
dtype: object
Q3
# Q3: locate capital-gain outliers with the 1.5 * IQR rule.
capital_gain = df323['capital-gain']
Q1 = np.percentile(capital_gain, 25)
Q3 = np.percentile(capital_gain, 75)
IQR = Q3 - Q1

# Fences beyond which a value counts as an outlier.
upper_fence = Q3 + 1.5 * IQR
lower_fence = Q1 - 1.5 * IQR

# np.where with a single condition returns a tuple of index arrays.
upper = np.where(capital_gain > upper_fence)
lower = np.where(capital_gain < lower_fence)
Q4
# Q4: work on a copy of the data without the fnlwgt column.
df3 = df323.drop(columns=['fnlwgt'])
Q5
# Q5: inspect the dtype of each column in df3.
df3.dtypes
age int64
workclass category
education category
educational-num float64
marital-status category
occupation category
relationship category
race category
gender category
capital-gain float64
capital-loss int64
hours-per-week int64
native-country object
income object
dtype: object
# Count the missing values in each column of df3.
df3.isna().sum()
age 0
workclass 0
education 0
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 2/9
9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
educational-num 0
marital-status 0
occupation 0
relationship 0
race 0
gender 0
capital-gain 342
capital-loss 0
hours-per-week 0
native-country 0
income 0
dtype: int64
Q6
# Q6: summary statistics for every column of df3; include='all' covers
# the non-numeric columns as well as the numeric ones.
df3.describe(include='all')
educational- marital-
age workclass education occupat
num status
Married-
top NaN Private HS-grad NaN civ-
spec
spouse
Q7
# Q7: frequency table of the income classes.
# The original read from `df388`, a name never defined in this notebook;
# the working frame is df323, and the printed counts (37155 + 11687)
# add up to its 48842 rows.
my_tab = pd.crosstab(index=df323["income"], columns="count")
my_tab
col_0 count
income
<=50K 37155
>50K 11687
# Mean hours worked, capital gain and capital loss for every
# (gender, income) combination, each as a flat DataFrame.
GC_DF = (
    df323.groupby(['gender', 'income'])[['hours-per-week']]
    .mean()
    .reset_index()
)
GC_DF1 = (
    df323.groupby(['gender', 'income'])[['capital-gain']]
    .mean()
    .reset_index()
)
GC_DF2 = (
    df323.groupby(['gender', 'income'])[['capital-loss']]
    .mean()
    .reset_index()
)
Q8
# Q8: bar chart of capital gain by gender (seaborn's barplot aggregates
# with the mean by default).
gain_ax = sns.barplot(data=df3, x='gender', y='capital-gain')
gain_ax.set_title('Average capital gain among males and females')
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 3/9
9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
# Mean weekly working hours for every (income, race) combination.
work_DF = (
    df323.groupby(['income', 'race'])[['hours-per-week']]
    .mean()
    .reset_index()
)
work_DF
Q10
# Q10: grouped bar chart of the per-group mean working hours in work_DF,
# one bar per race within each income level.
fig1, ax1 = plt.subplots(figsize=(13, 7))
sns.barplot(
    data=work_DF,
    x='income',
    y='hours-per-week',
    hue='race',
    ax=ax1,
)
ax1.set_title("Average working hours across income levels and race")
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 4/9
9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
Text(0.5, 1.0, 'Average working hours across income levels and race')
Q11
Q12
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 5/9
9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
Q13
# Q13: one-hot encode gender. drop_first=True drops the first category
# level, so only the remaining indicator column(s) are kept.
# NOTE(review): `df` is not defined in the visible cells (only df323 and
# df3 are) — confirm which frame this is meant to read.
df_gender = pd.get_dummies(df['gender'],drop_first=True)
df_gender.head()
Male
0 1
1 1
2 1
3 1
4 0
# One-hot encode race; drop_first=True drops the first category level to
# avoid a redundant indicator column.
# NOTE(review): `df` is not defined in the visible cells (only df323 and
# df3 are) — confirm which frame this is meant to read.
df_race = pd.get_dummies(df['race'],drop_first=True)
df_race.head()
0 0 1 0 0
1 0 0 0 1
2 0 0 0 1
3 0 1 0 0
4 0 0 0 1
# One-hot encode income; drop_first=True drops the first category level,
# leaving the indicator(s) for the remaining level(s).
# NOTE(review): `df` is not defined in the visible cells (only df323 and
# df3 are) — confirm which frame this is meant to read.
df_income = pd.get_dummies(df['income'],drop_first=True)
df_income.head()
>50K
0 0
1 0
2 1
3 1
4 0
Q14
hours- Asian-
capital- capital-
per- Male Pac- Black Other White >50
gain loss
week Islander
0 40 0.0 0 1 0 1 0 0
1 50 0.0 0 1 0 0 0 1
2 40 0.0 0 1 0 0 0 1
3 40 7688.0 0 1 0 1 0 0
4 30 0.0 0 0 0 0 0 1
48837 38 0.0 0 0 0 0 0 1
48838 40 0.0 0 1 0 0 0 1
48839 40 0.0 0 0 0 0 0 1
Q15
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 6/9
9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
# Inspect the dtype of each column in the assembled modelling frame.
data_final.dtypes
hours-per-week int64
capital-gain float64
capital-loss int64
Male uint8
Asian-Pac-Islander uint8
Black uint8
Other uint8
White uint8
>50K uint8
dtype: object
Q16
# Q16: cast the one-hot indicator columns to plain integers, then
# display the resulting frame.
cols = [
    'Male',
    'Asian-Pac-Islander',
    'Black',
    'Other',
    'White',
    '>50K',
]
data_final[cols] = data_final[cols].astype(int)
data_final
hours- Asian-
capital- capital-
per- Male Pac- Black Other White >50
gain loss
week Islander
0 40 0.0 0 1 0 1 0 0
1 50 0.0 0 1 0 0 0 1
2 40 0.0 0 1 0 0 0 1
3 40 7688.0 0 1 0 1 0 0
4 30 0.0 0 0 0 0 0 1
48837 38 0.0 0 0 0 0 0 1
48838 40 0.0 0 1 0 0 0 1
48839 40 0.0 0 0 0 0 0 1
Q17
Q18
# Separate the predictors from the target indicator column '>50K'.
x = data_final.drop('>50K', axis=1)
# The earlier null count shows 342 missing capital-gain values, and
# scikit-learn estimators such as LogisticRegression raise ValueError on
# NaN input. Fill any missing entries with the column median so the
# models below can be fitted.
# NOTE(review): median imputation is an assumption — confirm it matches
# the intended treatment of these missing values.
x = x.fillna(x.median())
y = data_final['>50K']
Q19
# Q19: list that collects accuracy percentages, plus the
# logistic-regression estimator.
# NOTE(review): these two statements are repeated verbatim a few lines
# further down; this copy looks like a page-break artefact of the export.
score=[]
clf1=LogisticRegression()
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 7/9
9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
# Fit four classifiers on the training split and record each one's
# accuracy on the test split. `score` collects the accuracies as
# percentages; the raw fraction is printed for each model.
score = []

# Logistic regression
clf1 = LogisticRegression()
clf1.fit(x_train, y_train)
pred1 = clf1.predict(x_test)
s1 = accuracy_score(y_test, pred1)
score.append(s1 * 100)
print(s1)

# k-nearest neighbours
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
pred2 = knn.predict(x_test)
s2 = accuracy_score(y_test, pred2)
score.append(s2 * 100)
print(s2)

# Decision tree
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
pred3 = dtc.predict(x_test)
s3 = accuracy_score(y_test, pred3)
score.append(s3 * 100)
print(s3)

# Linear discriminant analysis (the original fitted this model twice in
# a row; the second, redundant fit is removed).
clf = LinearDiscriminantAnalysis()
clf.fit(x_train, y_train)
pred4 = clf.predict(x_test)
s4 = accuracy_score(y_test, pred4)
score.append(s4 * 100)
print(s4)
0.7783805916675197
-------------------------------------------------------------------------
--
ValueError Traceback (most recent call
last)
<ipython-input-79-488fff80522e> in <cell line: 4>()
2
3 clf1=LogisticRegression()
----> 4 clf1.fit(x_train,y_train)
5 pred1=clf1.predict(x_test)
6 s1=accuracy_score(y_test,pred1)
4 frames
/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in
_assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name)
159 "#estimators-that-handle-nan-values"
160 )
--> 161 raise ValueError(msg_err)
162
163
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 8/9
9/20/23, 11:44 AM Samana Tatheer-Assign 7-20U00323.ipynb - Colaboratory
https://colab.research.google.com/drive/1ufwLzyFmN4hwEmvRAryaKPBCLTroKV0U#scrollTo=PeAC18pQQlF3&printMode=true 9/9