0% found this document useful (0 votes)
12 views

Decision Tree

Uploaded by

ruthwin77
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
12 views

Decision Tree

Uploaded by

ruthwin77
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 4

5.

Implementation of multiple linear regression for house price prediction using sklearn


# Imports for data handling, plotting, and statistical modeling.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

# Load the dataset (replace the path with your own dataset file path).
data = pd.read_csv("/content/sample_data/housing.csv")

# Data inspection
print(data.head(5))       # Display first 5 records
print(data.info())        # Show data type definitions for columns
print(data.describe())    # Descriptive statistics
print(f"Total rows and columns: {data.shape}")

# Check for null values in each column.
print(f"Null values:\n{data.isnull().sum()}")

# Detectoutliersusingboxplots
def detect_outliers():
    """Visualize outliers in selected feature columns using boxplots.

    Draws a 2x3 grid of axes and renders one boxplot per feature,
    then shows the figure. Relies on the module-level ``data`` frame.
    """
    # NOTE(review): 'Feature1'/'Feature2' do not appear in the dataset shown
    # in the captured output (columns such as 'Avg. Area Income') — confirm
    # against data.columns before running.
    fig, axs = plt.subplots(2, 3, figsize=(10, 5))
    sns.boxplot(x=data['Feature1'], ax=axs[0, 0])
    sns.boxplot(x=data['Feature2'], ax=axs[0, 1])
    # Repeat for the remaining features on the other axes.
    plt.show()
# Multiple Linear Regression with statsmodels OLS.
# NOTE(review): the column names below must match the loaded dataset; the
# captured output suggests columns like 'Avg. Area Income' and 'Price'
# rather than 'Feature1'/'HousePrice' — confirm before running.
X = data[['Feature1', 'Feature2']]   # Independent variables
y = data['HousePrice']               # Dependent variable

# Add a constant column so the model fits an intercept term.
X = sm.add_constant(X)

# Fit the ordinary-least-squares model.
model = sm.OLS(y, X).fit()

# Print coefficient estimates, R-squared, and diagnostics.
model_summary = model.summary()
print(model_summary)

output
Avg.AreaIncome Avg.AreaHouseAge Avg.AreaNumber of Rooms \
0 79545.45857 5.682861 7.009188
1 79248.64245 6.002900 6.730821
2 61287.06718 5.865890 8.512727
3 63345.24005 7.188236 5.586729
4 59982.19723 5.040555 7.839388

Avg.AreaNumber of Bedrooms AreaPopulation Price \


0 4.09 23086.80050 1.059034e+06
1 3.09 40173.07217 1.505891e+06
2 5.13 36882.15940 1.058988e+06
3 3.26 34310.24283 1.260617e+06
4 4.23 26354.10947 6.309435e+05

Address
0 208 Michael Ferry Apt.674\ nLaurabury, NE 3701..
1 188 JohnsonViewsSuite079\ nLake Kathleen, CA..
2 9127 ElizabethStravenue\ nDanieltown, WI 06482..
3 USSBarnett\ nFPO AP44820
4 USNSRaymond\ nFPO AE 09386
<class'pandas.core.frame.DataFrame'>
RangeIndex:5000 entries, 0 to4999
Datacolumns(total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Avg.AreaIncome 5000 non-null float64
1 Avg.AreaHouseAge 5000 non-null float64
2 Avg.AreaNumber of Rooms 5000 non-null float64
3 Avg.AreaNumber of Bedrooms 5000 non-null float64
4 AreaPopulation 5000 non-null float64
5 Price 5000 non-null float64
6 Address 5000 non-null object
dtypes:float64(6), object(1)
memory usage:273.6+KB
None
Avg.AreaIncome Avg.AreaHouseAge Avg.AreaNumber of Rooms \
count 5000.000000 5000.000000 5000.000000
mean 68583.108984 5.977222 6.987792
std 10657.991214 0.991456 1.005833
min 17796.631190 2.644304 3.236194
25% 61480.562390 5.322283 6.299250
50% 68804.286405 5.970429 7.002902
75% 75783.338665 6.650808 7.665871
max 107701.748400 9.519088 10.759588

Avg.AreaNumber of Bedrooms AreaPopulation Price


count 5000.000000 5000.000000 5.000000e+03
mean 3.981330 36163.516039 1.232073e+06
std 1.234137 9925.650114 3.531176e+05
min 2.000000 172.610686 1.593866e+04
25% 3.140000 29403.928700 9.975771e+05
50% 4.050000 36199.406690 1.232669e+06
75% 4.490000 42861.290770 1.471210e+06
max 6.500000 69621.713380 2.469066e+06
Total rowsandcolumns:(5000, 7)
Null values:
Avg.AreaIncome 0
Avg.AreaHouseAge 0
Avg.AreaNumber of Rooms 0
Avg.AreaNumber of Bedrooms 0
AreaPopulation 0
Price 0
Address 0
dtype:int64
6. Implementation of a decision tree using sklearn and its parameter tuning
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the built-in iris dataset.
iris = load_iris()
X = iris.data
y = iris.target

# 70/30 train-test split; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=99
)

# Baseline decision tree with default hyperparameters.
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy on the held-out test set.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy:{accuracy}')
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid to fine-tune the decision tree.
param_grid = {
    'max_depth': range(1, 10, 1),
    'min_samples_leaf': range(1, 20, 2),
    'min_samples_split': range(2, 20, 2),
    'criterion': ["entropy", "gini"],
}

# Decision tree classifier to be tuned.
tree = DecisionTreeClassifier(random_state=1)

# Exhaustive grid search with 5-fold cross-validation.
grid_search = GridSearchCV(estimator=tree, param_grid=param_grid,
                           cv=5, verbose=True)
grid_search.fit(X_train, y_train)

# Best cross-validated score and the corresponding fitted estimator.
print("best accuracy", grid_search.best_score_)
print(grid_search.best_estimator_)
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Retrieve the best estimator found by the grid search.
tree_clf = grid_search.best_estimator_

# Plot the fitted decision tree with feature and class labels.
plt.figure(figsize=(18, 15))
plot_tree(tree_clf, filled=True, feature_names=iris.feature_names,
          class_names=iris.target_names)
plt.show()
output:
Accuracy:0.9555555555555556
Fitting5 foldsfor eachof 1620 candidates, totalling8100 fits
best accuracy 0.9714285714285715
DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_leaf=3,
random_state=1)

You might also like