Data Analysis With Python - Jupyter Notebook
Data Analysis With Python - Jupyter Notebook
In [3]: file_name='https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/coursera/project/
df=pd.read_csv(file_name)
In [4]: df.head()  # preview the first rows to sanity-check the load
Out[4]:
Unnamed: 0 id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront ... grade sqft_above sqft_
0 0 7129300520 20141013T000000 221900.0 3.0 1.00 1180 5650 1.0 0 ... 7 1180
1 1 6414100192 20141209T000000 538000.0 3.0 2.25 2570 7242 2.0 0 ... 7 2170
2 2 5631500400 20150225T000000 180000.0 2.0 1.00 770 10000 1.0 0 ... 6 770
3 3 2487200875 20141209T000000 604000.0 4.0 3.00 1960 5000 1.0 0 ... 7 1050
4 4 1954400510 20150218T000000 510000.0 3.0 2.00 1680 8080 1.0 0 ... 8 1680
5 rows × 22 columns
In [6]: print(df.dtypes)  # per-column dtypes; note `date` is object (not datetime) and bedrooms/bathrooms are float64
Unnamed: 0 int64
id int64
date object
price float64
bedrooms float64
bathrooms float64
sqft_living int64
sqft_lot int64
floors float64
waterfront int64
view int64
condition int64
grade int64
sqft_above int64
sqft_basement int64
yr_built int64
yr_renovated int64
zipcode int64
lat float64
long float64
sqft_living15 int64
sqft_lot15 int64
dtype: object
In [7]: df.describe()  # summary statistics; the bedrooms/bathrooms counts fall short of the 21613 rows, i.e. those columns contain NaNs
Out[7]:
Unnamed: 0 id price bedrooms bathrooms sqft_living sqft_lot floors waterfront vi
count 21613.00000 2.161300e+04 2.161300e+04 21600.000000 21603.000000 21613.000000 2.161300e+04 21613.000000 21613.000000 21613.0000
mean 10806.00000 4.580302e+09 5.400881e+05 3.372870 2.115736 2079.899736 1.510697e+04 1.494309 0.007542 0.2343
std 6239.28002 2.876566e+09 3.671272e+05 0.926657 0.768996 918.440897 4.142051e+04 0.539989 0.086517 0.7663
min 0.00000 1.000102e+06 7.500000e+04 1.000000 0.500000 290.000000 5.200000e+02 1.000000 0.000000 0.0000
25% 5403.00000 2.123049e+09 3.219500e+05 3.000000 1.750000 1427.000000 5.040000e+03 1.000000 0.000000 0.0000
50% 10806.00000 3.904930e+09 4.500000e+05 3.000000 2.250000 1910.000000 7.618000e+03 1.500000 0.000000 0.0000
75% 16209.00000 7.308900e+09 6.450000e+05 4.000000 2.500000 2550.000000 1.068800e+04 2.000000 0.000000 0.0000
max 21612.00000 9.900000e+09 7.700000e+06 33.000000 8.000000 13540.000000 1.651359e+06 3.500000 1.000000 4.0000
8 rows × 21 columns
Out[8]:
price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition gr
count 2.161300e+04 21600.000000 21603.000000 21613.000000 2.161300e+04 21613.000000 21613.000000 21613.000000 21613.000000 21613.000
mean 5.400881e+05 3.372870 2.115736 2079.899736 1.510697e+04 1.494309 0.007542 0.234303 3.409430 7.656
std 3.671272e+05 0.926657 0.768996 918.440897 4.142051e+04 0.539989 0.086517 0.766318 0.650743 1.175
min 7.500000e+04 1.000000 0.500000 290.000000 5.200000e+02 1.000000 0.000000 0.000000 1.000000 1.000
25% 3.219500e+05 3.000000 1.750000 1427.000000 5.040000e+03 1.000000 0.000000 0.000000 3.000000 7.000
50% 4.500000e+05 3.000000 2.250000 1910.000000 7.618000e+03 1.500000 0.000000 0.000000 3.000000 7.000
75% 6.450000e+05 4.000000 2.500000 2550.000000 1.068800e+04 2.000000 0.000000 0.000000 4.000000 8.000
max 7.700000e+06 33.000000 8.000000 13540.000000 1.651359e+06 3.500000 1.000000 4.000000 5.000000 13.000
In [9]: print("number of NaN values for the column bedrooms :", df['bedrooms'].isnull().sum())
print("number of NaN values for the column bathrooms :", df['bathrooms'].isnull().sum())
In [11]: mean=df['bedrooms'].mean()
df['bedrooms'].replace(np.nan,mean, inplace=True)
In [12]: mean=df['bathrooms'].mean()
df['bathrooms'].replace(np.nan,mean, inplace=True)
In [13]: print("number of NaN values for the column bedrooms :", df['bedrooms'].isnull().sum())
print("number of NaN values for the column bathrooms :", df['bathrooms'].isnull().sum())
In [14]: df['floors'].value_counts().to_frame()  # number of rows per distinct `floors` value, shown as a one-column DataFrame
Out[14]:
floors
1.0 10680
2.0 8241
1.5 1910
3.0 613
2.5 161
3.5 8
In [17]: df.corr()['price'].sort_values()
Out[18]: 0.00046769430149007363
In [19]: X = df[['sqft_living']]
Y = df['price']
lm = LinearRegression()
lm.fit(X, Y)
lm.score(X, Y)
Out[19]: 0.4928532179037931
In [22]: X = df[features]
Y= df['price']
lm = LinearRegression()
lm.fit(X, Y)
lm.score(X, Y)
Out[22]: 0.6576435664044019
In [24]: pipe=Pipeline(Input)  # `Input` is defined outside this export — presumably the list of (name, step) pairs; TODO confirm
pipe
In [25]: pipe.fit(X,Y)  # fits every pipeline step on X, Y as left by the earlier cells
In [26]: pipe.score(X,Y)  # scored on the same X, Y used for fitting, so this is not a held-out estimate
Out[26]: 0.750441999451871
done
Out[29]: 0.6478759163939111
Out[30]: 0.7002744282813562
In [ ]: