"""383project.ipynb
Our goal is to build a system to predict home prices in Placer County based on the home sales data in
the same County in 2023.
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.ticker as mticker
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier, export_graphviz, DecisionTreeRegressor
import graphviz
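"""`price_formatter` is passed to mticker.FuncFormatter in the plotting cells below, but its
definition is not included in this dump. The helper below is a minimal sketch that renders
tick values as whole-dollar amounts; the exact original formatting is an assumption."""
def price_formatter(x, pos):
    # FuncFormatter callbacks receive the tick value and its position on the axis
    return '${:,.0f}'.format(x)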
#dataset
url = 'https://drive.google.com/file/d/1Y8OEKy6qnbGYbldcojWVLo1r4WIZHqx7/view'
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
df = pd.read_csv(path)
predictors = ['bedrooms','sqft','fbathrooms','pbathrooms','ybd']
target = 'cprice'
closing_price = df['cprice']
X = df[predictors].values
y = df[target].values
# split into train/test first, then scale using statistics from the training set only
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
regr = LinearRegression()
regr.fit(X_train, y_train)
predictions = regr.predict(X_test)
print(np.around(predictions[:10]))
print(np.around(y_test[:10]))
# scatter of actual vs. predicted closing prices, with both axes formatted as prices
# (the plotting call itself is missing from the dump; sns.scatterplot is assumed here
# to match the later sections)
sns.scatterplot(x=y_test, y=predictions)
formatter = mticker.FuncFormatter(price_formatter)
plt.gca().yaxis.set_major_formatter(formatter)
plt.gca().xaxis.set_major_formatter(formatter)
plt.tick_params(axis='x', labelrotation=45)
plt.show()
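"""mean_squared_error is imported above but never used in this first cell; the short block
below is an added sketch that reports the test RMSE of the baseline linear regression so it
can be compared with the models further down."""
baseline_rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('baseline linear regression test RMSE: {:.2f}'.format(baseline_rmse))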
---
"""
closing_price = df['cprice']
listed_price = df['lprice']
model = LinearRegression()
model.fit(closing_price.values.reshape(-1,1), listed_price)
predictions_cprice_vs_lprice = model.predict(closing_price.values.reshape(-1,1))
formatter = mticker.FuncFormatter(price_formatter)
plt.gca().yaxis.set_major_formatter(formatter)
plt.gca().xaxis.set_major_formatter(formatter)
plt.tick_params(axis='x', labelrotation=45)
plt.legend()
closing_price = df['cprice']
number_bedrooms = df['bedrooms']
model = LinearRegression()
model.fit(closing_price.values.reshape(-1,1), number_bedrooms)
predictions_cprice_vs_bed = model.predict(closing_price.values.reshape(-1,1))
formatter = mticker.FuncFormatter(price_formatter)
plt.gca().yaxis.set_major_formatter(formatter)
closing_price = df['cprice']
lot_size_acres = df['lsacres']
model = LinearRegression()
model.fit(closing_price.values.reshape(-1,1), lot_size_acres)
predictions_cprice_vs_acres = model.predict(closing_price.values.reshape(-1,1))
formatter = mticker.FuncFormatter(price_formatter)
plt.gca().xaxis.set_major_formatter(formatter)
plt.gca().yaxis.set_major_formatter(formatter)
plt.tick_params(axis='x', labelrotation=45)
plt.legend()
DecisionTreeRegressor (imported above but not fit here; a tree-based sketch appears after
the forward-selection section below)
"""
"""KNN Regression"""
# try a range of odd k values and report the test RMSE for each
# (the RMSE print is an added step, assuming test RMSE is the selection criterion)
ks = np.arange(1, 30, 2)
for k in ks:
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    predictions = knn.predict(X_test)
    print('k={}: test RMSE {:.2f}'.format(k, np.sqrt(mean_squared_error(y_test, predictions))))
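"""To make the sweep above easier to read, the block below (an added sketch, not part of the
original notebook) plots test RMSE against k; the best neighborhood size is the value that
minimizes this curve."""
knn_rmses = [np.sqrt(mean_squared_error(
    y_test, KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train).predict(X_test)))
    for k in ks]
plt.plot(ks, knn_rmses, marker='o')
plt.xlabel('k (number of neighbors)')
plt.ylabel('test RMSE')
plt.show()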
"""PolynomialFeatures
"""
pf = PolynomialFeatures(degree=3, include_bias=False)
pf.fit(X)
X_poly = pf.transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)
polyLR = LinearRegression()
polyKNNR = KNeighborsRegressor()
polyLR.fit(X_train, y_train)
polyKNNR.fit(X_train, y_train)
polyLR_predictions = polyLR.predict(X_test)
polyKNNR_predictions = polyKNNR.predict(X_test)
sns.scatterplot(x=y_test, y=polyLR_predictions)
min_value = min(np.min(y_test), np.min(polyLR_predictions))
max_value = max(np.max(y_test), np.max(polyLR_predictions))
plt.plot([min_value, max_value], [min_value, max_value], linewidth=2, linestyle='dashed', color='black')
plt.show()
sns.scatterplot(x=y_test, y=polyKNNR_predictions)
min_value = min(np.min(y_test), np.min(polyKNNR_predictions))
max_value = max(np.max(y_test), np.max(polyKNNR_predictions))
plt.plot([min_value, max_value], [min_value, max_value], linewidth=2, linestyle='dashed', color='black')
plt.show()
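"""To compare the two polynomial-feature models numerically as well as visually, the block
below (an added sketch, not in the original notebook) reports the test RMSE of each using
the mean_squared_error import from above."""
polyLR_rmse = np.sqrt(mean_squared_error(y_test, polyLR_predictions))
polyKNNR_rmse = np.sqrt(mean_squared_error(y_test, polyKNNR_predictions))
print('polynomial features + linear regression test RMSE: {:.2f}'.format(polyLR_rmse))
print('polynomial features + KNN regression test RMSE: {:.2f}'.format(polyKNNR_rmse))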
"""Forward Selection"""
# keep only the numeric columns; also drop lprice, dom, and oprice because they describe
# the sale itself rather than attributes of the property
drop_columns = ['cprice', 'lnumber', 'ldate', 'omd', 'pdate', 'cdate',
                'street', 'city', 'zip', 'cooling', 'heating', 'efeatures', 'ffireplace',
                'parking', 'levels', 'pool', 'fpool', 'patio', 'style', 'subdivision',
                'premarks', 'lprice', 'dom', 'oprice']
predictors = df.drop(columns=drop_columns).columns
X = df[predictors].values
y = df['cprice'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
remaining = list(range(X_train.shape[1]))
selected = []
regr = LinearRegression()
# greedy forward selection: at each step, add the remaining feature that gives the
# lowest test RMSE when combined with the features already selected
while remaining:
    rmse_min = -1
    i_min = -1
    for i in remaining:
        # get train and test using only the current column + selected columns
        curr_features = selected + [i]
        X_curr_train = X_train[:, curr_features]
        X_curr_test = X_test[:, curr_features]
        regr.fit(X_curr_train, y_train)
        X_curr_prediction = regr.predict(X_curr_test)
        rmse = np.sqrt(mean_squared_error(y_test, X_curr_prediction))
        if rmse_min < 0 or rmse < rmse_min:
            rmse_min = rmse
            i_min = i
    remaining.remove(i_min)
    selected.append(i_min)
    print('best feature {}: {}'.format(len(selected), predictors[i_min]))
    print('num features: {}; rmse: {:.2f}'.format(len(selected), rmse_min))
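"""DecisionTreeRegressor is imported at the top and noted earlier but never fit; the block
below is a minimal added sketch (the max_depth value is an assumption) that trains a tree on
the same forward-selection train/test split so its test RMSE can be compared with the other
models."""
tree = DecisionTreeRegressor(max_depth=5, random_state=42)
tree.fit(X_train, y_train)
tree_predictions = tree.predict(X_test)
print('decision tree test RMSE: {:.2f}'.format(np.sqrt(mean_squared_error(y_test, tree_predictions))))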