6/8/23, 1:10 PM Data Analysis with Python - Jupyter Notebook
In [2]: import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
%matplotlib inline
In [3]: file_name='https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/coursera/project/
df=pd.read_csv(file_name)
In [4]: df.head()  # preview the first five rows of the loaded frame
Out[4]:
Unnamed:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront ... grade sqft_above sqft_
0
0 0 7129300520 20141013T000000 221900.0 3.0 1.00 1180 5650 1.0 0 ... 7 1180
1 1 6414100192 20141209T000000 538000.0 3.0 2.25 2570 7242 2.0 0 ... 7 2170
2 2 5631500400 20150225T000000 180000.0 2.0 1.00 770 10000 1.0 0 ... 6 770
3 3 2487200875 20141209T000000 604000.0 4.0 3.00 1960 5000 1.0 0 ... 7 1050
4 4 1954400510 20150218T000000 510000.0 3.0 2.00 1680 8080 1.0 0 ... 8 1680
5 rows × 22 columns
localhost:8888/notebooks/Data Analysis with Python.ipynb# 1/10
6/8/23, 1:10 PM Data Analysis with Python - Jupyter Notebook
In [6]: print(df.dtypes)  # list the dtype pandas inferred for every column
Unnamed: 0 int64
id int64
date object
price float64
bedrooms float64
bathrooms float64
sqft_living int64
sqft_lot int64
floors float64
waterfront int64
view int64
condition int64
grade int64
sqft_above int64
sqft_basement int64
yr_built int64
yr_renovated int64
zipcode int64
lat float64
long float64
sqft_living15 int64
sqft_lot15 int64
dtype: object
localhost:8888/notebooks/Data Analysis with Python.ipynb# 2/10
6/8/23, 1:10 PM Data Analysis with Python - Jupyter Notebook
In [7]: df.describe()  # count / mean / std / quartiles for the numeric columns
Out[7]:
Unnamed: 0 id price bedrooms bathrooms sqft_living sqft_lot floors waterfront vi
count 21613.00000 2.161300e+04 2.161300e+04 21600.000000 21603.000000 21613.000000 2.161300e+04 21613.000000 21613.000000 21613.0000
mean 10806.00000 4.580302e+09 5.400881e+05 3.372870 2.115736 2079.899736 1.510697e+04 1.494309 0.007542 0.2343
std 6239.28002 2.876566e+09 3.671272e+05 0.926657 0.768996 918.440897 4.142051e+04 0.539989 0.086517 0.7663
min 0.00000 1.000102e+06 7.500000e+04 1.000000 0.500000 290.000000 5.200000e+02 1.000000 0.000000 0.0000
25% 5403.00000 2.123049e+09 3.219500e+05 3.000000 1.750000 1427.000000 5.040000e+03 1.000000 0.000000 0.0000
50% 10806.00000 3.904930e+09 4.500000e+05 3.000000 2.250000 1910.000000 7.618000e+03 1.500000 0.000000 0.0000
75% 16209.00000 7.308900e+09 6.450000e+05 4.000000 2.500000 2550.000000 1.068800e+04 2.000000 0.000000 0.0000
max 21612.00000 9.900000e+09 7.700000e+06 33.000000 8.000000 13540.000000 1.651359e+06 3.500000 1.000000 4.0000
8 rows × 21 columns
In [8]: df.drop(['id', 'Unnamed: 0'], axis=1, inplace=True)
df.describe()
Out[8]:
price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition gr
count 2.161300e+04 21600.000000 21603.000000 21613.000000 2.161300e+04 21613.000000 21613.000000 21613.000000 21613.000000 21613.000
mean 5.400881e+05 3.372870 2.115736 2079.899736 1.510697e+04 1.494309 0.007542 0.234303 3.409430 7.656
std 3.671272e+05 0.926657 0.768996 918.440897 4.142051e+04 0.539989 0.086517 0.766318 0.650743 1.175
min 7.500000e+04 1.000000 0.500000 290.000000 5.200000e+02 1.000000 0.000000 0.000000 1.000000 1.000
25% 3.219500e+05 3.000000 1.750000 1427.000000 5.040000e+03 1.000000 0.000000 0.000000 3.000000 7.000
50% 4.500000e+05 3.000000 2.250000 1910.000000 7.618000e+03 1.500000 0.000000 0.000000 3.000000 7.000
75% 6.450000e+05 4.000000 2.500000 2550.000000 1.068800e+04 2.000000 0.000000 0.000000 4.000000 8.000
max 7.700000e+06 33.000000 8.000000 13540.000000 1.651359e+06 3.500000 1.000000 4.000000 5.000000 13.000
localhost:8888/notebooks/Data Analysis with Python.ipynb# 3/10
6/8/23, 1:10 PM Data Analysis with Python - Jupyter Notebook
In [9]: print("number of NaN values for the column bedrooms :", df['bedrooms'].isnull().sum())
print("number of NaN values for the column bathrooms :", df['bathrooms'].isnull().sum())
number of NaN values for the column bedrooms : 13
number of NaN values for the column bathrooms : 10
In [11]: mean=df['bedrooms'].mean()
df['bedrooms'].replace(np.nan,mean, inplace=True)
In [12]: mean=df['bathrooms'].mean()
df['bathrooms'].replace(np.nan,mean, inplace=True)
In [13]: print("number of NaN values for the column bedrooms :", df['bedrooms'].isnull().sum())
print("number of NaN values for the column bathrooms :", df['bathrooms'].isnull().sum())
number of NaN values for the column bedrooms : 0
number of NaN values for the column bathrooms : 0
In [14]: df['floors'].value_counts().to_frame()  # rows per distinct 'floors' value, shown as a DataFrame
Out[14]:
floors
1.0 10680
2.0 8241
1.5 1910
3.0 613
2.5 161
3.5 8
localhost:8888/notebooks/Data Analysis with Python.ipynb# 4/10
6/8/23, 1:10 PM Data Analysis with Python - Jupyter Notebook
In [15]: sns.boxplot(x='waterfront', y='price', data=df)  # price distribution for each 'waterfront' value
Out[15]: <AxesSubplot:xlabel='waterfront', ylabel='price'>
localhost:8888/notebooks/Data Analysis with Python.ipynb# 5/10
6/8/23, 1:10 PM Data Analysis with Python - Jupyter Notebook
In [16]: sns.regplot(x='sqft_above', y='price', data=df)  # scatter of price vs. sqft_above with a fitted regression line
Out[16]: <AxesSubplot:xlabel='sqft_above', ylabel='price'>
localhost:8888/notebooks/Data Analysis with Python.ipynb# 6/10
6/8/23, 1:10 PM Data Analysis with Python - Jupyter Notebook
In [17]: df.corr()['price'].sort_values()
Out[17]: zipcode -0.053203
long 0.021626
condition 0.036362
yr_built 0.054012
sqft_lot15 0.082447
sqft_lot 0.089661
yr_renovated 0.126434
floors 0.256794
waterfront 0.266369
lat 0.307003
bedrooms 0.308797
sqft_basement 0.323816
view 0.397293
bathrooms 0.525738
sqft_living15 0.585379
sqft_above 0.605567
grade 0.667434
sqft_living 0.702035
price 1.000000
Name: price, dtype: float64
In [18]: import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
X = df[['long']]
Y = df['price']
lm = LinearRegression()
lm
lm.fit(X,Y)
lm.score(X, Y)
Out[18]: 0.00046769430149007363
localhost:8888/notebooks/Data Analysis with Python.ipynb# 7/10
6/8/23, 1:10 PM Data Analysis with Python - Jupyter Notebook
In [19]: X = df[['sqft_living']]
Y = df['price']
lm = LinearRegression()
lm.fit(X, Y)
lm.score(X, Y)
Out[19]: 0.4928532179037931
In [21]: features =["floors", "waterfront","lat" ,"bedrooms" ,"sqft_basement" ,"view" ,"bathrooms","sqft_living15","sqft_above"
In [22]: X = df[features]
Y= df['price']
lm = LinearRegression()
lm.fit(X, Y)
lm.score(X, Y)
Out[22]: 0.6576435664044019
In [23]: Input=[('scale',StandardScaler()),('polynomial', PolynomialFeatures(include_bias=False)),('model',LinearRegression())]
In [24]: pipe=Pipeline(Input)
pipe
Out[24]: Pipeline(steps=[('scale', StandardScaler()),
('polynomial', PolynomialFeatures(include_bias=False)),
('model', LinearRegression())])
In [25]: pipe.fit(X,Y)  # fit all pipeline steps on the current X (df[features]) and Y (price)
Out[25]: Pipeline(steps=[('scale', StandardScaler()),
('polynomial', PolynomialFeatures(include_bias=False)),
('model', LinearRegression())])
localhost:8888/notebooks/Data Analysis with Python.ipynb# 8/10
6/8/23, 1:10 PM Data Analysis with Python - Jupyter Notebook
In [26]: pipe.score(X,Y)  # R^2 on the same data the pipeline was fitted on (in-sample)
Out[26]: 0.750441999451871
In [27]: from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
print("done")
done
In [28]: features =["floors", "waterfront","lat" ,"bedrooms" ,"sqft_basement" ,"view" ,"bathrooms","sqft_living15","sqft_above"
X = df[features ]
Y = df['price']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=1)
print("number of test samples :", x_test.shape[0])
print("number of training samples:",x_train.shape[0])
number of test samples : 3242
number of training samples: 18371
In [29]: from sklearn.linear_model import Ridge
# Ridge regression with alpha=0.1, fitted on the training split and scored (R^2)
# on the held-out test split.
RidgeModel = Ridge(alpha = 0.1)
RidgeModel.fit(x_train, y_train)
RidgeModel.score(x_test, y_test)
Out[29]: 0.6478759163939111
localhost:8888/notebooks/Data Analysis with Python.ipynb# 9/10
6/8/23, 1:10 PM Data Analysis with Python - Jupyter Notebook
In [30]: from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
# Second-order polynomial expansion followed by ridge regression (alpha=0.1).
pr = PolynomialFeatures(degree=2)
# Fit the transformer on the training split only and reuse it for the test split.
# The original called fit_transform on x_test, re-fitting the transformer on held-out data.
x_train_pr = pr.fit_transform(x_train)
x_test_pr = pr.transform(x_test)
poly = Ridge(alpha=0.1)
poly.fit(x_train_pr, y_train)
# R^2 on the held-out test split.
poly.score(x_test_pr, y_test)
Out[30]: 0.7002744282813562
In [ ]:
localhost:8888/notebooks/Data Analysis with Python.ipynb# 10/10