Ensemmmmm
Ensemmmmm
import pandas as pd
from sklearn.linear_model import LinearRegression 999 car 4576 100 - 500 DM unemployed
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline percent_of_income years_at_residence age other_credit
from sklearn.model_selection import train_test_split housing \
from sklearn.tree import DecisionTreeClassifier 0 4 4 67 none own
from sklearn import tree
from sklearn import metrics 1 2 2 22 none own
from sklearn.metrics import confusion_matrix,classification_report
2 2 3 49 none own
from IPython.display import Image
from os import system 3 2 4 45 none other
(1000, 17)
996 car 3857 < 100 DM 1 - 4 years
df.describe()
997 furniture/appliances 804 < 100 DM > 7 years
months_loan_duration amount percent_of_income \ 14 dependents 1000 non-null int64
count 1000.000000 1000.000000 1000.000000 15 phone 1000 non-null object
mean 20.903000 3271.258000 2.973000 16 default 1000 non-null object
std 12.058814 2822.736876 1.118715 dtypes: int64(7), object(10)
min 4.000000 250.000000 1.000000 memory usage: 132.9+ KB
25% 12.000000 1365.500000 2.000000
50% 18.000000 2319.500000 3.000000 df.isnull().sum()
75% 24.000000 3972.250000 4.000000
checking_balance 0
max 72.000000 18424.000000 4.000000
months_loan_duration 0
years_at_residence age existing_loans_count credit_history 0
dependents purpose 0
count 1000.000000 1000.000000 1000.000000 amount 0
1000.000000 savings_balance 0
mean 2.845000 35.546000 1.407000 employment_duration 0
1.155000 percent_of_income 0
std 1.103718 11.375469 0.577654 years_at_residence 0
0.362086 age 0
min 1.000000 19.000000 1.000000 other_credit 0
1.000000 housing 0
25% 2.000000 27.000000 1.000000 existing_loans_count 0
1.000000 job 0
50% 3.000000 33.000000 1.000000 dependents 0
1.000000 phone 0
75% 4.000000 42.000000 2.000000 default 0
1.000000 dtype: int64
max 4.000000 75.000000 4.000000
#convert the columns with an 'object' into categorical variable
2.000000
for feature in df.columns:# Loop through all columns in the dataframe
df.info() if df[feature].dtype=='object': #only apply for columns with
categorical strings
<class 'pandas.core.frame.DataFrame'> df[feature]=pd.Categorical(df[feature])#Replace strings
RangeIndex: 1000 entries, 0 to 999 with an in
Data columns (total 17 columns):
# Column Non-Null Count Dtype print(df.checking_balance.value_counts())
--- ------ -------------- ----- print(df.credit_history.value_counts())
0 checking_balance 1000 non-null object print(df.purpose.value_counts())
1 months_loan_duration 1000 non-null int64 print(df.savings_balance.value_counts())
2 credit_history 1000 non-null object print(df.employment_duration.value_counts())
3 purpose 1000 non-null object print(df.other_credit.value_counts())
4 amount 1000 non-null int64 print(df.housing.value_counts())
5 savings_balance 1000 non-null object print(df.job.value_counts())
6 employment_duration 1000 non-null object print(df.phone.value_counts())
7 percent_of_income 1000 non-null int64 print(df.default.value_counts())
8 years_at_residence 1000 non-null int64
unknown 394
9 age 1000 non-null int64
< 0 DM 274
10 other_credit 1000 non-null object
1 - 200 DM 269
11 housing 1000 non-null object
> 200 DM 63
12 existing_loans_count 1000 non-null int64
Name: checking_balance, dtype: int64
13 job 1000 non-null object
good 530 "savings_balance":{"<100 DM":1,"100-500 DM":2,"500-1000 DM":3,">
critical 293 1000 DM":4,"unknown":-1},
poor 88 "employement_duaration":{"unemployed":1,"< 1 year":2,"1-4
very good 49 years":3,">1000 DM":4,">7 years":5},
perfect 40 "phone":{"no":1,"yes":2},
Name: credit_history, dtype: int64 #"job":{"unemployed":1,"unskilled":2,"skilled":3,"management":4},
furniture/appliances 473 "default":{"no":0,"yes":1}
car 337 }
business 97
education 59 oneHotCols=["purpose","housing","other_credit","job"]
renovations 22 oneHotCols
car0 12
['purpose', 'housing', 'other_credit', 'job']
Name: purpose, dtype: int64
< 100 DM 603 creditData=df.replace(replaceStruct)
unknown 183 creditData=pd.get_dummies(creditData,columns=oneHotCols)
100 - 500 DM 103 creditData.head(10)
500 - 1000 DM 63
> 1000 DM 48 checking_balance months_loan_duration credit_history amount \
Name: savings_balance, dtype: int64 0 1 6 1 1169
1 - 4 years 339 1 2 48 3 5951
> 7 years 253 2 -1 12 1 2096
4 - 7 years 174 3 1 42 3 7882
< 1 year 172 4 1 24 2 4870
unemployed 62 5 -1 36 3 9055
Name: employment_duration, dtype: int64 6 -1 24 3 2835
none 814 7 2 36 3 6948
bank 139 8 -1 12 3 3059
store 47 9 2 30 1 5234
Name: other_credit, dtype: int64
own 713 savings_balance employment_duration percent_of_income
rent 179 years_at_residence \
other 108 0 -1 > 7 years 4
Name: housing, dtype: int64 4
skilled 630 1 < 100 DM 1 - 4 years 2
unskilled 200 2
management 148 2 < 100 DM 4 - 7 years 2
unemployed 22 3
Name: job, dtype: int64 3 < 100 DM 4 - 7 years 2
no 596 4
yes 404 4 < 100 DM 1 - 4 years 3
Name: phone, dtype: int64 4
no 700 5 -1 1 - 4 years 2
yes 300 4
Name: default, dtype: int64 6 500 - 1000 DM > 7 years 3
4
replaceStruct={ 7 < 100 DM 1 - 4 years 2
"checking_balance":{"< 0 DM":1,"1 - 200 DM":2," > 200 2
DM":3,"unknown":-1}, 8 4 4 - 7 years 2
"credit_history":{"critical":1,"poor":2,"good":3,"very 4
good":4,"unknown":-1},
9 < 100 DM unemployed 4 0 1 0 0
2 1 1 0 0
2 0 0 1
age existing_loans_count ... housing_other housing_own 3 1 0 0
housing_rent \ 4 1 0 0
0 67 2 ... 0 1 5 0 0 1
0 6 1 0 0
1 22 1 ... 0 1 7 0 0 0
0 8 0 0 1
2 49 1 ... 0 1 9 0 0 0
0
3 45 1 ... 1 0 [10 rows x 29 columns]
0
4 53 2 ... 1 0 creditData.info()
0
<class 'pandas.core.frame.DataFrame'>
5 35 1 ... 1 0
RangeIndex: 1000 entries, 0 to 999
0
Data columns (total 29 columns):
6 53 1 ... 0 1
# Column Non-Null Count Dtype
0
--- ------ -------------- -----
7 35 1 ... 0 0
0 checking_balance 1000 non-null category
1
1 months_loan_duration 1000 non-null int64
8 61 1 ... 0 1
2 credit_history 1000 non-null category
0
3 amount 1000 non-null int64
9 28 2 ... 0 1
4 savings_balance 1000 non-null category
0
5 employment_duration 1000 non-null category
other_credit_bank other_credit_none other_credit_store 6 percent_of_income 1000 non-null int64
job_management \ 7 years_at_residence 1000 non-null int64
0 0 1 0 8 age 1000 non-null int64
0 9 existing_loans_count 1000 non-null int64
1 0 1 0 10 dependents 1000 non-null int64
0 11 phone 1000 non-null category
2 0 1 0 12 default 1000 non-null category
0 13 purpose_business 1000 non-null uint8
3 0 1 0 14 purpose_car 1000 non-null uint8
0 15 purpose_car0 1000 non-null uint8
4 0 1 0 16 purpose_education 1000 non-null uint8
0 17 purpose_furniture/appliances 1000 non-null uint8
5 0 1 0 18 purpose_renovations 1000 non-null uint8
0 19 housing_other 1000 non-null uint8
6 0 1 0 20 housing_own 1000 non-null uint8
0 21 housing_rent 1000 non-null uint8
7 0 1 0 22 other_credit_bank 1000 non-null uint8
1 23 other_credit_none 1000 non-null uint8
8 0 1 0 24 other_credit_store 1000 non-null uint8
0 25 job_management 1000 non-null uint8
9 0 1 0 26 job_skilled 1000 non-null uint8
1 27 job_unemployed 1000 non-null uint8
28 job_unskilled 1000 non-null uint8
job_skilled job_unemployed job_unskilled
dtypes: category(6), int64(7), uint8(16)
memory usage: 77.4 KB
split data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.30,rando
for feature in creditData.columns: m_state=1)
if pd.api.types.is_categorical_dtype(creditData[feature]):
creditData[feature]=creditData[feature].cat.codes.astype(int) dTree=DecisionTreeClassifier(criterion='gini',random_state=1)
dTree.fit(X_train,y_train)
creditData.info()
DecisionTreeClassifier(random_state=1)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999 #scoring our decision tree
Data columns (total 29 columns):
# Column Non-Null Count Dtype dTree.score(X_train,y_train)
--- ------ -------------- ----- dTree.score(X_test,y_test)
0 checking_balance 1000 non-null int32
1 months_loan_duration 1000 non-null int64 0.6733333333333333
2 credit_history 1000 non-null int32
3 amount 1000 non-null int64
4
5
savings_balance
employment_duration
1000 non-null
1000 non-null
int32
int32 visualizing the decision tree
6 percent_of_income 1000 non-null int64 #'w' - write mode ('r' would be read); the .dot file stores the exported tree; pwd gives the exact location
7 years_at_residence 1000 non-null int64 where we are running the notebook
8 age 1000 non-null int64
9 existing_loans_count 1000 non-null int64 #pip install graphviz
10 dependents 1000 non-null int64
11 phone 1000 non-null int32 train_char_label=['No','Yes']
12 default 1000 non-null int32 Credit_Tree_File=open('credit_tree.dot','w')
13 purpose_business 1000 non-null uint8
14 purpose_car 1000 non-null uint8 dot_data=tree.export_graphviz(dTree,out_file=Credit_Tree_File,
15 purpose_car0 1000 non-null uint8 feature_names=list(X_train),class_names=list(train_char_label))
16 purpose_education 1000 non-null uint8 Credit_Tree_File.close()
17 purpose_furniture/appliances 1000 non-null uint8
# http://webgraphviz.com/
18 purpose_renovations 1000 non-null uint8
19 housing_other 1000 non-null uint8 from sklearn.tree import plot_tree
20 housing_own 1000 non-null uint8 import matplotlib.pyplot as plt
21 housing_rent 1000 non-null uint8
22 other_credit_bank 1000 non-null uint8 # Assuming you have already trained the decision tree classifier
23 other_credit_none 1000 non-null uint8 (dTree)
24 other_credit_store 1000 non-null uint8
25 job_management 1000 non-null uint8 plt.figure(figsize=(20,10)) # Adjust the figure size as needed
26 job_skilled 1000 non-null uint8 plot_tree(dTree, feature_names=X_train.columns,
27 job_unemployed 1000 non-null uint8 class_names=train_char_label, filled=True)
28 job_unskilled 1000 non-null uint8 plt.show()
dtypes: int32(6), int64(7), uint8(16)
memory usage: 93.9 KB
Imp
dTree2=DecisionTreeClassifier(criterion='gini',random_state=1,max_dept
checking_balance 0.500592
h=3)
months_loan_duration 0.162298
dTree2.fit(X_train,y_train)
credit_history 0.129883
DecisionTreeClassifier(max_depth=3, random_state=1) amount 0.000000
savings_balance 0.107056
print(dTree2.score(X_train,y_train)) employment_duration 0.000000
print(dTree2.score(X_test,y_test)) percent_of_income 0.000000
years_at_residence 0.000000
0.7542857142857143 age 0.000000
0.7366666666666667 existing_loans_count 0.000000
dependents 0.000000
train_char_label=['No','Yes'] phone 0.000000
df_Tree_File=open('df_tree2.dot','w') purpose_business 0.044749
dot_data=tree.export_graphviz(dTree2,out_file=df_Tree_File,feature_nam purpose_car 0.000000
es=list(X_train),class_names=list(train_char_label)) purpose_car0 0.000000
df_Tree_File.close() purpose_education 0.000000
plt.figure(figsize=(20,10)) # Adjust the figure size as needed purpose_furniture/appliances 0.000000
plot_tree(dTree2, feature_names=X_train.columns, purpose_renovations 0.000000
class_names=train_char_label, filled=True) housing_other 0.000000
plt.show() housing_own 0.000000
housing_rent 0.000000
other_credit_bank 0.000000
other_credit_none 0.055422
other_credit_store 0.000000
job_management 0.000000
job_skilled 0.000000
job_unemployed 0.000000 y_predict=bgcl.predict(X_test)
job_unskilled 0.000000 print(bgcl.score(X_test,y_test))
# Bagging with the default base estimator and n_estimators=50.
# The classifier has to be bound to `bgcl` (it was previously assigned to the
# misspelled name `bgel`), because the fit/predict/score calls that follow all
# go through `bgcl`; otherwise they silently reuse the earlier model.
bgcl=BaggingClassifier(n_estimators=50,random_state=1)
bgcl=bgcl.fit(X_train,y_train)
Ensemble Learning-Bagging y_predict=bgcl.predict(X_test)
print(bgcl.score(X_test,y_test))
from sklearn.ensemble import BaggingClassifier
cm=metrics.confusion_matrix(y_test,y_predict,labels=[0,1])
bgcl=BaggingClassifier(estimator=dTree,n_estimators=50,random_state=1) df_cm=pd.DataFrame(cm,index=[i for i in["No","yes"]],
#bgel=BaggingClassifier(n_estimators=50,random_state=1) columns=[i for i in["No","yes"]])
bgcl=bgcl.fit(X_train,y_train) plt.figure(figsize=(3,3))
sns.heatmap(df_cm,annot=True,fmt='g')
print(classification_report(y_test,y_predict))
0.7633333333333333 columns=[i for i in["No","yes"]])
precision recall f1-score support plt.figure(figsize=(5,3))
sns.heatmap(df_cm,annot=True,fmt='g')
0 0.81 0.88 0.84 214 print(classification_report(y_test,y_predict))
1 0.61 0.48 0.54 86
precision recall f1-score support
accuracy 0.76 300
macro avg 0.71 0.68 0.69 300 0 0.78 0.75 0.77 214
weighted avg 0.75 0.76 0.75 300 1 0.44 0.49 0.46 86
Ensemble learning-AdaBoosting
from sklearn.ensemble import AdaBoostClassifier
#n_estimator=50
abcl=AdaBoostClassifier(estimator=dTree,n_estimators=10,random_state=1 abcl=AdaBoostClassifier(n_estimators=50,random_state=1)
) abcl=abcl.fit(X_train,y_train)
#abcl=AdaBoostClassfier(n_estimators=50,random_state=1) y_predict=abcl.predict(X_test)
abcl=abcl.fit(X_train,y_train) print(abcl.score(X_test,y_test))
cm=metrics.confusion_matrix(y_test,y_predict,labels=[0,1])
C:\Users\leela\anaconda3\Lib\site-packages\sklearn\ensemble\ df_cm=pd.DataFrame(cm,index=[i for i in["No","yes"]],
_weight_boosting.py:519: FutureWarning: The SAMME.R algorithm (the columns=[i for i in["No","yes"]])
default) is deprecated and will be removed in 1.6. Use the SAMME plt.figure(figsize=(5,3))
algorithm to circumvent this warning. sns.heatmap(df_cm,annot=True,fmt='g')
warnings.warn( print(classification_report(y_test,y_predict))
y_predict=abcl.predict(X_test) C:\Users\leela\anaconda3\Lib\site-packages\sklearn\ensemble\
print(abcl.score(X_test,y_test)) _weight_boosting.py:519: FutureWarning: The SAMME.R algorithm (the
default) is deprecated and will be removed in 1.6. Use the SAMME
0.6733333333333333
algorithm to circumvent this warning.
cm=metrics.confusion_matrix(y_test,y_predict,labels=[0,1]) warnings.warn(
df_cm=pd.DataFrame(cm,index=[i for i in["No","yes"]],
0.7266666666666667 1 0.55 0.30 0.39 86
precision recall f1-score support
accuracy 0.73 300
0 0.78 0.86 0.82 214 macro avg 0.66 0.60 0.61 300
1 0.53 0.41 0.46 86 weighted avg 0.70 0.73 0.70 300
gbcl=GradientBoostingClassifier(n_estimators=50,random_state=1) y_predict=rfcl.predict(X_test)
gbcl=gbcl.fit(X_train,y_train) print(rfcl.score(X_test,y_test))
y_predict=gbcl.predict(X_test) 0.7666666666666667
print(gbcl.score(X_test,y_test))
cm=metrics.confusion_matrix(y_test,y_predict,labels=[0,1])
0.73 df_cm=pd.DataFrame(cm,index=[i for i in["No","yes"]],
columns=[i for i in["No","yes"]])
cm=metrics.confusion_matrix(y_test,y_predict,labels=[0,1]) plt.figure(figsize=(5,3))
df_cm=pd.DataFrame(cm,index=[i for i in["No","yes"]], sns.heatmap(df_cm,annot=True,fmt='g')
columns=[i for i in["No","yes"]]) print(classification_report(y_test,y_predict))
plt.figure(figsize=(5,3))
sns.heatmap(df_cm,annot=True,fmt='g') precision recall f1-score support
print(classification_report(y_test,y_predict))
0 0.80 0.89 0.85 214
precision recall f1-score support 1 0.63 0.45 0.53 86
print(dTree5.score(X_train,y_train))
print(dTree5.score(X_test,y_test))
0.8485714285714285
0.7333333333333333
# Depth-limited tree (max_depth=4): fit on the training split, then print the
# training score followed by the test score.
# The `max_depth` keyword was previously split across two lines by a hard wrap.
dTree3 = DecisionTreeClassifier(criterion='gini', random_state=1, max_depth=4)
dTree3.fit(X_train, y_train)
print(dTree3.score(X_train, y_train))
print(dTree3.score(X_test, y_test))
0.7728571428571429
0.75
# Depth-limited tree (max_depth=5): fit on the training split, then print the
# training score followed by the test score.
# The `max_depth` keyword was previously split across two lines by a hard wrap.
dTree4 = DecisionTreeClassifier(criterion='gini', random_state=1, max_depth=5)
dTree4.fit(X_train, y_train)
print(dTree4.score(X_train, y_train))
print(dTree4.score(X_test, y_test))
0.7985714285714286
0.7433333333333333
# Depth-limited tree (max_depth=6): fit on the training split, then print the
# training score followed by the test score.
# The `max_depth` keyword was previously split across two lines by a hard wrap.
dTree5 = DecisionTreeClassifier(criterion='gini', random_state=1, max_depth=6)
dTree5.fit(X_train, y_train)
print(dTree5.score(X_train, y_train))
print(dTree5.score(X_test, y_test))
0.8214285714285714
0.7466666666666667