Decision Tree
# Detect outliers using boxplots
def detect_outliers(features=('Feature1', 'Feature2')):
    """Draw one boxplot per feature so outliers can be inspected visually.

    Parameters
    ----------
    features : sequence of str, optional
        Column names of the module-level ``data`` frame to plot.
        Defaults to ('Feature1', 'Feature2'), matching the original code.
    """
    # 2x3 grid leaves room for up to six feature plots
    fig, axs = plt.subplots(2, 3, figsize=(10, 5))
    # Pair each requested feature with the next free axis (row-major order),
    # instead of hard-coding one sns.boxplot call per feature
    for ax, feature in zip(axs.ravel(), features):
        sns.boxplot(x=data[feature], ax=ax)
    plt.show()
# Multiple linear regression with statsmodels OLS.
# Independent variables, with a constant column prepended for the intercept.
X = sm.add_constant(data[['Feature1', 'Feature2']])
# Dependent variable
y = data['HousePrice']

# Fit ordinary least squares on the design matrix
model = sm.OLS(y, X).fit()

# Show the full regression summary (coefficients, R-squared, diagnostics)
model_summary = model.summary()
print(model_summary)
Output:
   Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms  \
0       79545.45857             5.682861                   7.009188
1       79248.64245             6.002900                   6.730821
2       61287.06718             5.865890                   8.512727
3       63345.24005             7.188236                   5.586729
4       59982.19723             5.040555                   7.839388
                                             Address
0  208 Michael Ferry Apt. 674\nLaurabury, NE 3701...
1  188 Johnson Views Suite 079\nLake Kathleen, CA...
2  9127 Elizabeth Stravenue\nDanieltown, WI 06482...
3                          USS Barnett\nFPO AP 44820
4                         USNS Raymond\nFPO AE 09386
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
 6   Address                       5000 non-null   object
dtypes: float64(6), object(1)
memory usage: 273.6+ KB
None
       Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms  \
count       5000.000000          5000.000000                5000.000000
mean       68583.108984             5.977222                   6.987792
std        10657.991214             0.991456                   1.005833
min        17796.631190             2.644304                   3.236194
25%        61480.562390             5.322283                   6.299250
50%        68804.286405             5.970429                   7.002902
75%        75783.338665             6.650808                   7.665871
max       107701.748400             9.519088                  10.759588
# Hyperparameter search space for the decision tree
param_grid = {
    'max_depth': range(1, 10, 1),
    'min_samples_leaf': range(1, 20, 2),
    'min_samples_split': range(2, 20, 2),
    'criterion': ["entropy", "gini"],
}

# Base classifier; fixed seed keeps the search reproducible
tree = DecisionTreeClassifier(random_state=1)

# Exhaustive search over the grid with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    cv=5,
    verbose=True,
)
grid_search.fit(X_train, y_train)

# Report the best cross-validated accuracy and the winning estimator
print("best accuracy", grid_search.best_score_)
print(grid_search.best_estimator_)
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Visualize the tuned decision tree selected by the grid search
tree_clf = grid_search.best_estimator_

plt.figure(figsize=(18, 15))
plot_tree(
    tree_clf,
    filled=True,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
)
plt.show()
Output:
Accuracy: 0.9555555555555556
Fitting 5 folds for each of 1620 candidates, totalling 8100 fits
best accuracy 0.9714285714285715
DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_leaf=3,
                       random_state=1)