0% found this document useful (0 votes)
22 views10 pages

Ensemmmmm

The document appears to be analyzing credit risk data from a dataset with 1000 rows and 17 columns. It loads necessary libraries, reads in the credit risk data, converts some columns to categorical variables, and provides some summary statistics and counts of the data.

Uploaded by

Manjunath kn
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
22 views10 pages

Ensemmmmm

The document appears to be analyzing credit risk data from a dataset with 1000 rows and 17 columns. It loads necessary libraries, reads in the credit risk data, converts some columns to categorical variables, and provides some summary statistics and counts of the data.

Uploaded by

Manjunath kn
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 10

import numpy as np 998 furniture/appliances 1845 < 100 DM 1 - 4 years

import pandas as pd
from sklearn.linear_model import LinearRegression 999 car 4576 100 - 500 DM unemployed
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline percent_of_income years_at_residence age other_credit
from sklearn.model_selection import train_test_split housing \
from sklearn.tree import DecisionTreeClassifier 0 4 4 67 none own
from sklearn import tree
from sklearn import metrics 1 2 2 22 none own
from sklearn.metrics import confusion_matrix,classification_report
2 2 3 49 none own
from IPython.display import Image
from os import system 3 2 4 45 none other

df=pd.read_csv("credit.csv") 4 3 4 53 none other


df
.. ... ... ... ... ...
checking_balance months_loan_duration credit_history \
0 < 0 DM 6 critical 995 3 4 31 none own
1 1 - 200 DM 48 good
2 unknown 12 critical 996 4 4 40 none own
3 < 0 DM 42 good
4 < 0 DM 24 poor 997 4 4 38 none own
.. ... ... ...
995 unknown 12 good 998 4 4 23 none other
996 < 0 DM 30 good
999 3 4 27 none own
997 unknown 12 good
998 < 0 DM 45 good
999 1 - 200 DM 45 critical existing_loans_count job dependents phone default
0 2 skilled 1 yes no
purpose amount savings_balance employment_duration 1 1 skilled 1 no yes
\ 2 1 unskilled 2 no no
0 furniture/appliances 1169 unknown > 7 years 3 1 skilled 2 no no
4 2 skilled 2 no yes
1 furniture/appliances 5951 < 100 DM 1 - 4 years
.. ... ... ... ... ...
2 education 2096 < 100 DM 4 - 7 years 995 1 unskilled 1 no no
996 1 management 1 yes no
3 furniture/appliances 7882 < 100 DM 4 - 7 years 997 1 skilled 1 no no
998 1 skilled 1 yes yes
4 car 4870 < 100 DM 1 - 4 years 999 1 skilled 1 no no

.. ... ... ... ... [1000 rows x 17 columns]

995 furniture/appliances 1736 < 100 DM 4 - 7 years df.shape

(1000, 17)
996 car 3857 < 100 DM 1 - 4 years
df.describe()
997 furniture/appliances 804 < 100 DM > 7 years
months_loan_duration amount percent_of_income \ 14 dependents 1000 non-null int64
count 1000.000000 1000.000000 1000.000000 15 phone 1000 non-null object
mean 20.903000 3271.258000 2.973000 16 default 1000 non-null object
std 12.058814 2822.736876 1.118715 dtypes: int64(7), object(10)
min 4.000000 250.000000 1.000000 memory usage: 132.9+ KB
25% 12.000000 1365.500000 2.000000
50% 18.000000 2319.500000 3.000000 df.isnull().sum()
75% 24.000000 3972.250000 4.000000
checking_balance 0
max 72.000000 18424.000000 4.000000
months_loan_duration 0
years_at_residence age existing_loans_count credit_history 0
dependents purpose 0
count 1000.000000 1000.000000 1000.000000 amount 0
1000.000000 savings_balance 0
mean 2.845000 35.546000 1.407000 employment_duration 0
1.155000 percent_of_income 0
std 1.103718 11.375469 0.577654 years_at_residence 0
0.362086 age 0
min 1.000000 19.000000 1.000000 other_credit 0
1.000000 housing 0
25% 2.000000 27.000000 1.000000 existing_loans_count 0
1.000000 job 0
50% 3.000000 33.000000 1.000000 dependents 0
1.000000 phone 0
75% 4.000000 42.000000 2.000000 default 0
1.000000 dtype: int64
max 4.000000 75.000000 4.000000
#convert the columns with an 'object' dtype into categorical variables
2.000000
for feature in df.columns:#Loop through all columns in the dataframe
df.info() if df[feature].dtype=='object': #only apply for columns with
categorial strings
<class 'pandas.core.frame.DataFrame'> df[feature]=pd.Categorical(df[feature])#Replace strings
RangeIndex: 1000 entries, 0 to 999 with an in
Data columns (total 17 columns):
# Column Non-Null Count Dtype print(df.checking_balance.value_counts())
--- ------ -------------- ----- print(df.credit_history.value_counts())
0 checking_balance 1000 non-null object print(df.purpose.value_counts())
1 months_loan_duration 1000 non-null int64 print(df.savings_balance.value_counts())
2 credit_history 1000 non-null object print(df.employment_duration.value_counts())
3 purpose 1000 non-null object print(df.other_credit.value_counts())
4 amount 1000 non-null int64 print(df.housing.value_counts())
5 savings_balance 1000 non-null object print(df.job.value_counts())
6 employment_duration 1000 non-null object print(df.phone.value_counts())
7 percent_of_income 1000 non-null int64 print(df.default.value_counts())
8 years_at_residence 1000 non-null int64
unknown 394
9 age 1000 non-null int64
< 0 DM 274
10 other_credit 1000 non-null object
1 - 200 DM 269
11 housing 1000 non-null object
> 200 DM 63
12 existing_loans_count 1000 non-null int64
Name: checking_balance, dtype: int64
13 job 1000 non-null object
good 530 "savings_balance":{"<100 DM":1,"100-500 DM":2,"500-1000 DM":3,">
critical 293 1000 DM":4,"unknown":-1},
poor 88 "employement_duaration":{"unemployed":1,"< 1 year":2,"1-4
very good 49 years":3,">1000 DM":4,">7 years":5},
perfect 40 "phone":{"no":1,"yes":2},
Name: credit_history, dtype: int64 #"job":{"unemployed":1,"unskilled":2,"skilled":3,"management":4},
furniture/appliances 473 "default":{"no":0,"yes":1}
car 337 }
business 97
education 59 oneHotCols=["purpose","housing","other_credit","job"]
renovations 22 oneHotCols
car0 12
['purpose', 'housing', 'other_credit', 'job']
Name: purpose, dtype: int64
< 100 DM 603 creditData=df.replace(replaceStruct)
unknown 183 creditData=pd.get_dummies(creditData,columns=oneHotCols)
100 - 500 DM 103 creditData.head(10)
500 - 1000 DM 63
> 1000 DM 48 checking_balance months_loan_duration credit_history amount \
Name: savings_balance, dtype: int64 0 1 6 1 1169
1 - 4 years 339 1 2 48 3 5951
> 7 years 253 2 -1 12 1 2096
4 - 7 years 174 3 1 42 3 7882
< 1 year 172 4 1 24 2 4870
unemployed 62 5 -1 36 3 9055
Name: employment_duration, dtype: int64 6 -1 24 3 2835
none 814 7 2 36 3 6948
bank 139 8 -1 12 3 3059
store 47 9 2 30 1 5234
Name: other_credit, dtype: int64
own 713 savings_balance employment_duration percent_of_income
rent 179 years_at_residence \
other 108 0 -1 > 7 years 4
Name: housing, dtype: int64 4
skilled 630 1 < 100 DM 1 - 4 years 2
unskilled 200 2
management 148 2 < 100 DM 4 - 7 years 2
unemployed 22 3
Name: job, dtype: int64 3 < 100 DM 4 - 7 years 2
no 596 4
yes 404 4 < 100 DM 1 - 4 years 3
Name: phone, dtype: int64 4
no 700 5 -1 1 - 4 years 2
yes 300 4
Name: default, dtype: int64 6 500 - 1000 DM > 7 years 3
4
replaceStruct={ 7 < 100 DM 1 - 4 years 2
"checking_balance":{"< 0 DM":1,"1 - 200 DM":2," > 200 2
DM":3,"unknown":-1}, 8 4 4 - 7 years 2
"credit_history":{"critical":1,"poor":2,"good":3,"very 4
good":4,"unknown":-1},
9 < 100 DM unemployed 4 0 1 0 0
2 1 1 0 0
2 0 0 1
age existing_loans_count ... housing_other housing_own 3 1 0 0
housing_rent \ 4 1 0 0
0 67 2 ... 0 1 5 0 0 1
0 6 1 0 0
1 22 1 ... 0 1 7 0 0 0
0 8 0 0 1
2 49 1 ... 0 1 9 0 0 0
0
3 45 1 ... 1 0 [10 rows x 29 columns]
0
4 53 2 ... 1 0 creditData.info()
0
<class 'pandas.core.frame.DataFrame'>
5 35 1 ... 1 0
RangeIndex: 1000 entries, 0 to 999
0
Data columns (total 29 columns):
6 53 1 ... 0 1
# Column Non-Null Count Dtype
0
--- ------ -------------- -----
7 35 1 ... 0 0
0 checking_balance 1000 non-null category
1
1 months_loan_duration 1000 non-null int64
8 61 1 ... 0 1
2 credit_history 1000 non-null category
0
3 amount 1000 non-null int64
9 28 2 ... 0 1
4 savings_balance 1000 non-null category
0
5 employment_duration 1000 non-null category
other_credit_bank other_credit_none other_credit_store 6 percent_of_income 1000 non-null int64
job_management \ 7 years_at_residence 1000 non-null int64
0 0 1 0 8 age 1000 non-null int64
0 9 existing_loans_count 1000 non-null int64
1 0 1 0 10 dependents 1000 non-null int64
0 11 phone 1000 non-null category
2 0 1 0 12 default 1000 non-null category
0 13 purpose_business 1000 non-null uint8
3 0 1 0 14 purpose_car 1000 non-null uint8
0 15 purpose_car0 1000 non-null uint8
4 0 1 0 16 purpose_education 1000 non-null uint8
0 17 purpose_furniture/appliances 1000 non-null uint8
5 0 1 0 18 purpose_renovations 1000 non-null uint8
0 19 housing_other 1000 non-null uint8
6 0 1 0 20 housing_own 1000 non-null uint8
0 21 housing_rent 1000 non-null uint8
7 0 1 0 22 other_credit_bank 1000 non-null uint8
1 23 other_credit_none 1000 non-null uint8
8 0 1 0 24 other_credit_store 1000 non-null uint8
0 25 job_management 1000 non-null uint8
9 0 1 0 26 job_skilled 1000 non-null uint8
1 27 job_unemployed 1000 non-null uint8
28 job_unskilled 1000 non-null uint8
job_skilled job_unemployed job_unskilled
dtypes: category(6), int64(7), uint8(16)
memory usage: 77.4 KB
split data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.30,rando
for feature in creditData.columns: m_state=1)
if pd.api.types.is_categorical_dtype(creditData[feature]):
creditData[feature]=creditData[feature].cat.codes.astype(int) dTree=DecisionTreeClassifier(criterion='gini',random_state=1)
dTree.fit(X_train,y_train)
creditData.info()
DecisionTreeClassifier(random_state=1)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999 #scoring our decision tree
Data columns (total 29 columns):
# Column Non-Null Count Dtype dTree.score(X_train,y_train)
--- ------ -------------- ----- dTree.score(X_test,y_test)
0 checking_balance 1000 non-null int32
1 months_loan_duration 1000 non-null int64 0.6733333333333333
2 credit_history 1000 non-null int32
3 amount 1000 non-null int64
4
5
savings_balance
employment_duration
1000 non-null
1000 non-null
int32
int32 visualizing the decision tree
6 percent_of_income 1000 non-null int64 #w - write, r - read; .dot - it stores the file; pwd - gives the exact location
7 years_at_residence 1000 non-null int64 where we are running the notebook
8 age 1000 non-null int64
9 existing_loans_count 1000 non-null int64 #pip install graphviz
10 dependents 1000 non-null int64
11 phone 1000 non-null int32 train_char_label=['No','Yes']
12 default 1000 non-null int32 Credit_Tree_File=open('credit_tree.dot','w')
13 purpose_business 1000 non-null uint8
14 purpose_car 1000 non-null uint8 dot_data=tree.export_graphviz(dTree,out_file=Credit_Tree_File,
15 purpose_car0 1000 non-null uint8 feature_names=list(X_train),class_names=list(train_char_label))
16 purpose_education 1000 non-null uint8 Credit_Tree_File.close()
17 purpose_furniture/appliances 1000 non-null uint8
# http://webgraphviz.com/
18 purpose_renovations 1000 non-null uint8
19 housing_other 1000 non-null uint8 from sklearn.tree import plot_tree
20 housing_own 1000 non-null uint8 import matplotlib.pyplot as plt
21 housing_rent 1000 non-null uint8
22 other_credit_bank 1000 non-null uint8 # Assuming you have already trained the decision tree classifier
23 other_credit_none 1000 non-null uint8 (dTree)
24 other_credit_store 1000 non-null uint8
25 job_management 1000 non-null uint8 plt.figure(figsize=(20,10)) # Adjust the figure size as needed
26 job_skilled 1000 non-null uint8 plot_tree(dTree, feature_names=X_train.columns,
27 job_unemployed 1000 non-null uint8 class_names=train_char_label, filled=True)
28 job_unskilled 1000 non-null uint8 plt.show()
dtypes: int32(6), int64(7), uint8(16)
memory usage: 93.9 KB

X=creditData.drop("default",axis=1) #x is everything but default


y=creditData.pop("default")
print(pd.DataFrame(dTree2.feature_importances_,columns=['Imp'],index=X
Reducing overfitting _train.columns))

Imp
dTree2=DecisionTreeClassifier(criterion='gini',random_state=1,max_dept
checking_balance 0.500592
h=3)
months_loan_duration 0.162298
dTree2.fit(X_train,y_train)
credit_history 0.129883
DecisionTreeClassifier(max_depth=3, random_state=1) amount 0.000000
savings_balance 0.107056
print(dTree2.score(X_train,y_train)) employment_duration 0.000000
print(dTree2.score(X_test,y_test)) percent_of_income 0.000000
years_at_residence 0.000000
0.7542857142857143 age 0.000000
0.7366666666666667 existing_loans_count 0.000000
dependents 0.000000
train_char_label=['No','Yes'] phone 0.000000
df_Tree_File=open('df_tree2.dot','w') purpose_business 0.044749
dot_data=tree.export_graphviz(dTree2,out_file=df_Tree_File,feature_nam purpose_car 0.000000
es=list(X_train),class_names=list(train_char_label)) purpose_car0 0.000000
df_Tree_File.close() purpose_education 0.000000
plt.figure(figsize=(20,10)) # Adjust the figure size as needed purpose_furniture/appliances 0.000000
plot_tree(dTree2, feature_names=X_train.columns, purpose_renovations 0.000000
class_names=train_char_label, filled=True) housing_other 0.000000
plt.show() housing_own 0.000000
housing_rent 0.000000
other_credit_bank 0.000000
other_credit_none 0.055422
other_credit_store 0.000000
job_management 0.000000
job_skilled 0.000000
job_unemployed 0.000000 y_predict=bgcl.predict(X_test)
job_unskilled 0.000000 print(bgcl.score(X_test,y_test))

print(dTree2.score(X_test, y_test)) 0.7633333333333333


y_predict=dTree2.predict(X_test)
cm=metrics.confusion_matrix(y_test, y_predict, labels=[0,1]) cm=metrics.confusion_matrix(y_test,y_predict,labels=[0,1])
df_cm=pd.DataFrame(cm,index=[i for i in['No','Yes']], df_cm=pd.DataFrame(cm,index=[i for i in["No","yes"]],
columns=[i for i in ['No','Yes']]) columns=[i for i in["No","yes"]])
plt.figure(figsize=(3,3)) plt.figure(figsize=(3,3))
sns.heatmap(df_cm,annot=True,fmt='g') sns.heatmap(df_cm,annot=True,fmt='g')
print(classification_report(y_test,y_predict)) print(classification_report(y_test,y_predict))

0.7366666666666667 precision recall f1-score support


precision recall f1-score support
0 0.81 0.88 0.84 214
0 0.78 0.87 0.83 214 1 0.61 0.48 0.54 86
1 0.56 0.40 0.46 86
accuracy 0.76 300
accuracy 0.74 300 macro avg 0.71 0.68 0.69 300
macro avg 0.67 0.63 0.64 300 weighted avg 0.75 0.76 0.75 300
weighted avg 0.72 0.74 0.72 300

#n_estimator=50
bgel=BaggingClassifier(n_estimators=50,random_state=1)
bgcl=bgcl.fit(X_train,y_train)
Ensemble Learning-Bagging y_predict=bgcl.predict(X_test)
print(bgcl.score(X_test,y_test))
from sklearn.ensemble import BaggingClassifier
cm=metrics.confusion_matrix(y_test,y_predict,labels=[0,1])
bgcl=BaggingClassifier(estimator=dTree,n_estimators=50,random_state=1) df_cm=pd.DataFrame(cm,index=[i for i in["No","yes"]],
#bgel=BaggingClassifier(n_estimators=50,random_state=1) columns=[i for i in["No","yes"]])
bgcl=bgcl.fit(X_train,y_train) plt.figure(figsize=(3,3))
sns.heatmap(df_cm,annot=True,fmt='g')
print(classification_report(y_test,y_predict))
0.7633333333333333 columns=[i for i in["No","yes"]])
precision recall f1-score support plt.figure(figsize=(5,3))
sns.heatmap(df_cm,annot=True,fmt='g')
0 0.81 0.88 0.84 214 print(classification_report(y_test,y_predict))
1 0.61 0.48 0.54 86
precision recall f1-score support
accuracy 0.76 300
macro avg 0.71 0.68 0.69 300 0 0.78 0.75 0.77 214
weighted avg 0.75 0.76 0.75 300 1 0.44 0.49 0.46 86

accuracy 0.67 300


macro avg 0.61 0.62 0.61 300
weighted avg 0.68 0.67 0.68 300

Ensemble learning-AdaBoosting
from sklearn.ensemble import AdaBoostClassifier
#n_estimator=50
abcl=AdaBoostClassifier(estimator=dTree,n_estimators=10,random_state=1 abcl=AdaBoostClassifier(n_estimators=50,random_state=1)
) abcl=abcl.fit(X_train,y_train)
#abcl=AdaBoostClassfier(n_estimators=50,random_state=1) y_predict=abcl.predict(X_test)
abcl=abcl.fit(X_train,y_train) print(abcl.score(X_test,y_test))
cm=metrics.confusion_matrix(y_test,y_predict,labels=[0,1])
C:\Users\leela\anaconda3\Lib\site-packages\sklearn\ensemble\ df_cm=pd.DataFrame(cm,index=[i for i in["No","yes"]],
_weight_boosting.py:519: FutureWarning: The SAMME.R algorithm (the columns=[i for i in["No","yes"]])
default) is deprecated and will be removed in 1.6. Use the SAMME plt.figure(figsize=(5,3))
algorithm to circumvent this warning. sns.heatmap(df_cm,annot=True,fmt='g')
warnings.warn( print(classification_report(y_test,y_predict))
y_predict=abcl.predict(X_test) C:\Users\leela\anaconda3\Lib\site-packages\sklearn\ensemble\
print(abcl.score(X_test,y_test)) _weight_boosting.py:519: FutureWarning: The SAMME.R algorithm (the
default) is deprecated and will be removed in 1.6. Use the SAMME
0.6733333333333333
algorithm to circumvent this warning.
cm=metrics.confusion_matrix(y_test,y_predict,labels=[0,1]) warnings.warn(
df_cm=pd.DataFrame(cm,index=[i for i in["No","yes"]],
0.7266666666666667 1 0.55 0.30 0.39 86
precision recall f1-score support
accuracy 0.73 300
0 0.78 0.86 0.82 214 macro avg 0.66 0.60 0.61 300
1 0.53 0.41 0.46 86 weighted avg 0.70 0.73 0.70 300

accuracy 0.73 300


macro avg 0.66 0.63 0.64 300
weighted avg 0.71 0.73 0.71 300

Ensemble Learning -RandomForestClassifier


from sklearn.ensemble import RandomForestClassifier

Ensemble Learning -GradientBoost rfcl=RandomForestClassifier(n_estimators=50,


random_state=1,max_features=12)
from sklearn.ensemble import GradientBoostingClassifier rfcl=rfcl.fit(X_train,y_train)

gbcl=GradientBoostingClassifier(n_estimators=50,random_state=1) y_predict=rfcl.predict(X_test)
gbcl=gbcl.fit(X_train,y_train) print(rfcl.score(X_test,y_test))

y_predict=gbcl.predict(X_test) 0.7666666666666667
print(gbcl.score(X_test,y_test))
cm=metrics.confusion_matrix(y_test,y_predict,labels=[0,1])
0.73 df_cm=pd.DataFrame(cm,index=[i for i in["No","yes"]],
columns=[i for i in["No","yes"]])
cm=metrics.confusion_matrix(y_test,y_predict,labels=[0,1]) plt.figure(figsize=(5,3))
df_cm=pd.DataFrame(cm,index=[i for i in["No","yes"]], sns.heatmap(df_cm,annot=True,fmt='g')
columns=[i for i in["No","yes"]]) print(classification_report(y_test,y_predict))
plt.figure(figsize=(5,3))
sns.heatmap(df_cm,annot=True,fmt='g') precision recall f1-score support
print(classification_report(y_test,y_predict))
0 0.80 0.89 0.85 214
precision recall f1-score support 1 0.63 0.45 0.53 86

0 0.76 0.90 0.83 214 accuracy 0.77 300


macro avg 0.72 0.67 0.69 300 dTree5=DecisionTreeClassifier(criterion='gini',random_state=1,max_dept
weighted avg 0.75 0.77 0.75 300 h=7)
dTree5.fit(X_train,y_train)

print(dTree5.score(X_train,y_train))
print(dTree5.score(X_test,y_test))

0.8485714285714285
0.7333333333333333

#bagging classifiers, in general, benefit from having complex individual
#models,
#whereas boosting classifiers, in general, benefit from having simple
#models

#try max_depth=3, 4, or ... 10, then observe the accuracy

dTree3=DecisionTreeClassifier(criterion='gini',random_state=1,max_dept
h=4)
dTree3.fit(X_train,y_train)
print(dTree3.score(X_train,y_train))
print(dTree3.score(X_test,y_test))

0.7728571428571429
0.75

dTree4=DecisionTreeClassifier(criterion='gini',random_state=1,max_dept
h=5)
dTree4.fit(X_train,y_train)

print(dTree4.score(X_train,y_train))
print(dTree4.score(X_test,y_test))

0.7985714285714286
0.7433333333333333

dTree5=DecisionTreeClassifier(criterion='gini',random_state=1,max_dept
h=6)
dTree5.fit(X_train,y_train)

print(dTree5.score(X_train,y_train))
print(dTree5.score(X_test,y_test))

0.8214285714285714
0.7466666666666667

You might also like