Machine Learning Lab
Machine Learning Lab
Aim:
To predict the price of houses using linear regression
Procedure:
1. Import libraries
In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
/kaggle/input/housing-prices-dataset/Housing.csv
2. Reading data
In [2]:
# Load the housing dataset from the Kaggle input directory and display it.
csv_path = '/kaggle/input/housing-prices-dataset/Housing.csv'
data_frame = pd.read_csv(csv_path)
data_frame
Out[2]:
are bedro bathro stor mainr guestr base hotwaterh aircondit park pref furnishin
price
a oms oms ies oad oom ment eating ioning ing area gstatus
1330 74
0 4 2 3 yes no no no yes 2 yes furnished
0000 20
1225 89
1 4 4 4 yes no no no yes 3 no furnished
0000 60
1225 99 semi-
2 3 2 2 yes no yes no no 2 yes
0000 60 furnished
1221 75
3 4 2 2 yes no yes no yes 3 yes furnished
5000 00
Page | 1
are bedro bathro stor mainr guestr base hotwaterh aircondit park pref furnishin
price
a oms oms ies oad oom ment eating ioning ing area gstatus
1141 74
4 4 1 2 yes yes yes no yes 2 no furnished
0000 20
... ... ... ... ... ... ... ... ... ... ... ... ... ...
5
1820 30 unfurnish
4 2 1 1 yes no yes no no 2 no
000 00 ed
0
5
1767 24 semi-
4 3 1 1 no no no no no 0 no
150 00 furnished
1
5
1750 36 unfurnish
4 2 1 1 yes no no no no 0 no
000 20 ed
2
5
1750 29
4 3 1 1 no no no no no 0 no furnished
000 10
3
5
1750 38 unfurnish
4 3 1 2 yes no no no no 0 no
000 50 ed
4
# Encode these yes/no columns as integer category codes. In the displayed
# result 'no' becomes 0 and 'yes' becomes 1.
binary_columns = ['guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
for column in binary_columns:
    data_frame[column] = data_frame[column].astype('category').cat.codes

## We map the values of furnishingstatus to the numbers above, since this should increase the accuracy of
## the model: the more furniture a house has, the more expensive it will be.
data_frame
Out[3]:
are bedro bathro stor mainr guestr base hotwaterh aircondit park pref furnishin
price
a oms oms ies oad oom ment eating ioning ing area gstatus
1330 74
0 4 2 3 1 0 0 0 1 2 1 2
0000 20
1225 89
1 4 4 4 1 0 0 0 1 3 0 2
0000 60
1225 99
2 3 2 2 1 0 1 0 0 2 1 1
0000 60
1221 75
3 4 2 2 1 0 1 0 1 3 1 2
5000 00
1141 74
4 4 1 2 1 1 1 0 1 2 0 2
0000 20
... ... ... ... ... ... ... ... ... ... ... ... ... ...
Page | 3
are bedro bathro stor mainr guestr base hotwaterh aircondit park pref furnishin
price
a oms oms ies oad oom ment eating ioning ing area gstatus
5
1820 30
4 2 1 1 1 0 1 0 0 2 0 0
000 00
0
5
1767 24
4 3 1 1 0 0 0 0 0 0 0 1
150 00
1
5
1750 36
4 2 1 1 1 0 0 0 0 0 0 0
000 20
2
5
1750 29
4 3 1 1 0 0 0 0 0 0 0 2
000 10
3
5
1750 38
4 3 1 2 1 0 0 0 0 0 0 0
000 50
4
# Keep only the rows whose outlier label equals 1.
# NOTE(review): presumably `outliers` holds IsolationForest predictions, where 1 marks inliers — confirm.
inlier_mask = outliers == 1
x_clean = x[inlier_mask]
y_clean = y[inlier_mask]
#x = x_clean
#y = y_clean
## In this version I DON'T USE x_clean and y_clean, but I have noted the results for different
## contaminations at the end of the file (I ran it several times with different contaminations before).
/opt/conda/lib/python3.10/site-packages/sklearn/base.py:439: UserWarning: X does not have valid feature
names, but IsolationForest was fitted with feature names
warnings.warn(
4. Split data
In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)
print("Done")
Done
5. Training the model
In [9]:
# Fit a linear regression model on the training split.
model = LinearRegression()
model.fit(x_train, y_train)
print("Done")
Done
In [10]:
# Show the fitted parameters: the intercept first, then the coefficients.
c, m = model.intercept_, model.coef_
print(c)
print(m)
In [12]:
import matplotlib.pyplot as plt

# End points of the red reference line (predicted == actual).
# max value of y is around 13 million
x_point = np.array([0, 14000000])
y_point = np.array([0, 14000000])

# Actual vs. predicted prices on the training data, with the reference line on top.
plt.scatter(y_train, y_pred_train)
plt.plot(x_point, y_point, c='r')
plt.xlabel("Actual result")
plt.ylabel("Predicted result")

print("RESULT WITH TRAINED DATA")
print("Number of data train: ", len(x_train))
plt.show()
Page | 6
## I draw the red line because it makes the data easier to visualize: if a dot is near the red line, the model's prediction for that point is quite accurate.
RESULT WITH TRAINED DATA
Number of data train: 381
In [13]:
from sklearn.metrics import r2_score

# R^2 score of the predictions on the training data.
r2_score_without_test = r2_score(y_true=y_train, y_pred=y_pred_train)
print(r2_score_without_test)
0.6575703217254214
6. Test the model with tested data
In [14]:
# Predict prices for the held-out test rows with the fitted model.
y_pred_test = model.predict(x_test)
print("Done")
Done
In [15]:
import matplotlib.pyplot as plt

# Actual vs. predicted prices on the test data.
plt.scatter(y_test, y_pred_test)
plt.xlabel("Actual result")
plt.ylabel("Predicted result")

# Red reference line (predicted == actual): dots close to it are accurate predictions.
x_point = np.array([0, 14000000])
y_point = np.array([0, 14000000])
plt.plot(x_point, y_point, c='r')

print("RESULT WITH TESTED DATA")
print("Number of data test: ", len(x_test))
plt.show()
In [16]:
from sklearn.metrics import r2_score

# R^2 score of the predictions on the held-out test data.
r2_score_with_test = r2_score(y_true=y_test, y_pred=y_pred_test)
print(r2_score_with_test)
0.723501522320035
7. Result
Result with different contaminations
linkcode
Contaminations 0% 3% 5% 7%
import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you
# create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/housing-prices-dataset/Housing.csv
In [2]:
# Import necessary libraries
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
Page | 8
import matplotlib.pyplot as plt
# Load the housing data and add an integer-encoded copy of furnishingstatus as 'newFurnish'.
housing_path = r'/kaggle/input/housing-prices-dataset/Housing.csv'
df = pd.read_csv(housing_path)
furnish_encoder = LabelEncoder()
df['newFurnish'] = furnish_encoder.fit_transform(df['furnishingstatus'])
are bedro bathro stor mainr guestr base hotwaterh airconditi park prefa newFur
price
a oms oms ies oad oom ment eating oning ing rea nish
13300 74
0 4 2 3 1 0 0 0 1 2 1 0
000 20
12250 89
1 4 4 4 1 0 0 0 1 3 0 0
000 60
12250 99
2 3 2 2 1 0 1 0 0 2 1 1
000 60
12215 75
3 4 2 2 1 0 1 0 1 3 1 0
000 00
11410 74
4 4 1 2 1 1 1 0 1 2 0 0
000 20
In [3]:
# Drop rows that contain missing values (dropna does not remove values that are present but invalid)
df = df.dropna()
Page | 10
Page | 11
2. Data Project - Stock Market Analysis
Time Series data is a series of data points indexed in time order. Time series data is everywhere, so
manipulating them is important for any data analyst or data scientist.
We will discover and explore data from the stock market, particularly some technology stocks (Apple,
Amazon, Google, and Microsoft). We will learn how to use yfinance to get stock information, and visualize
different aspects of it using Seaborn and Matplotlib. We will look at a few ways of analyzing the risk of a stock,
based on its previous performance history. We will also be predicting future stock prices through a Long Short
Term Memory (LSTM) method!
We'll be answering the following questions along the way:
1.) What was the change in price of the stock over time?
2.) What was the daily return of the stock on average?
3.) What was the moving average of the various stocks?
# NOTE(review): presumably routes pandas_datareader's Yahoo calls through yfinance — confirm against the installed yfinance version.
yf.pdr_override()
# One-year window: `end` is the current time, `start` is midnight of the same
# calendar day one year earlier.
end = datetime.now()
try:
    start = datetime(end.year - 1, end.month, end.day)
except ValueError:
    # Today is Feb 29 and the previous year has no such day; fall back to
    # Feb 28 instead of crashing.
    start = datetime(end.year - 1, end.month, 28)
# Stack the frames in company_list row-wise into a single DataFrame,
# then show its last 10 rows.
df = pd.concat(company_list, axis=0)
df.tail(10)
[*********************100%***********************] 1 of 1 completed
[*********************100%***********************] 1 of 1 completed
[*********************100%***********************] 1 of 1 completed
[*********************100%***********************] 1 of 1 completed
Out[2]:
Date
Page | 13
Open High Low Close Adj Close Volume company_name
Date
05:00
2023-01-18
00:00:00- 97.250000 99.320000 95.379997 95.459999 95.459999 79570400 AMAZON
05:00
2023-01-19
00:00:00- 94.739998 95.440002 92.860001 93.680000 93.680000 69002700 AMAZON
05:00
2023-01-20
00:00:00- 93.860001 97.349998 93.199997 97.250000 97.250000 67307100 AMAZON
05:00
2023-01-23
00:00:00- 97.559998 97.779999 95.860001 97.519997 97.519997 76501100 AMAZON
05:00
2023-01-24
00:00:00- 96.930000 98.089996 96.000000 96.320000 96.320000 66929500 AMAZON
05:00
2023-01-25
00:00:00- 92.559998 97.239998 91.519997 97.180000 97.180000 94261600 AMAZON
05:00
2023-01-26
00:00:00- 98.239998 99.489998 96.919998 99.220001 99.220001 68523600 AMAZON
05:00
Date
05:00
2023-01-30
00:00:00- 101.089996 101.739998 99.010002 100.550003 100.550003 70566100 AMAZON
05:00
Reviewing the content of our data, we can see that the data is numeric and the date is the index of the data.
Notice also that weekends are missing from the records.
Quick note: Using globals() is a sloppy way of setting the DataFrame names, but it's simple. Now we have our
data, let's perform some basic data analysis and check our data.
Descriptive Statistics about the Data
.describe() generates descriptive statistics. Descriptive statistics include those that summarize the central
tendency, dispersion, and shape of a dataset’s distribution, excluding NaN values.
Analyzes both numeric and object series, as well as DataFrame column sets of mixed data types. The output will
vary depending on what is provided. Refer to the notes below for more detail.
In [3]:
# Summary statistics for the AAPL frame
AAPL.describe()
Out[3]:
Page | 15
Open High Low Close Adj Close Volume
We have only 251 records in one year because weekends and market holidays are not included in the data.
Information About the Data
.info() method prints information about a DataFrame including the index dtype and columns, non-null values,
and memory usage.
In [4]:
# General info: index dtype, columns, non-null counts and memory usage of the AAPL frame
AAPL.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 251 entries, 2022-01-31 00:00:00-05:00 to 2023-01-30 00:00:00-05:00
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 251 non-null float64
1 High 251 non-null float64
2 Low 251 non-null float64
3 Close 251 non-null float64
4 Adj Close 251 non-null float64
5 Volume 251 non-null int64
6 company_name 251 non-null object
dtypes: float64(5), int64(1), object(1)
memory usage: 23.8+ KB
Closing Price
The closing price is the last price at which the stock is traded during the regular trading day. A stock’s closing
price is the standard benchmark used by investors to track its performance over time.
In [5]:
linkcode
# Let's see a historical view of the closing price
plt.figure(figsize=(15, 10))
plt.subplots_adjust(top=1.25, bottom=1.2)

# One panel per entry of company_list on a 2x2 grid. enumerate starts at 1
# because plt.subplot positions are 1-based, hence tech_list[i - 1] for the title.
for i, company in enumerate(company_list, 1):
    plt.subplot(2, 2, i)
    company['Adj Close'].plot()
    plt.ylabel('Adj Close')
    plt.xlabel(None)
    plt.title(f"Closing Price of {tech_list[i - 1]}")

plt.tight_layout()
Volume of Sales
Volume is the amount of an asset or security that changes hands over some period of time, often over the course
of a day. For instance, the stock trading volume would refer to the number of shares of security traded between
its daily open and close. Trading volume, and changes to volume over the course of time, are important inputs
for technical traders.
In [6]:
linkcode
# Now let's plot the total volume of stock being traded each day
# Create a 15x10 figure and adjust the subplot margins.
# NOTE(review): the rest of this cell (the plotting itself) is not visible here.
plt.figure(figsize=(15, 10))
plt.subplots_adjust(top=1.25, bottom=1.2)
Now that we've seen the visualizations for the closing price and the volume traded each day, let's go
ahead and calculate the moving average for the stock.
# For every window length in ma_day, add a rolling-mean column of 'Adj Close'
# named "MA for <n> days" to each frame in company_list.
for ma in ma_day:
    column_name = f"MA for {ma} days"
    for company in company_list:
        company[column_name] = company['Adj Close'].rolling(ma).mean()
# Draw the adjusted close together with its 10/20/50-day moving averages,
# one company per panel of the existing 2x2 `axes` grid.
ma_columns = ['Adj Close', 'MA for 10 days', 'MA for 20 days', 'MA for 50 days']
panels = [
    (AAPL, axes[0, 0], 'APPLE'),
    (GOOG, axes[0, 1], 'GOOGLE'),
    (MSFT, axes[1, 0], 'MICROSOFT'),
    (AMZN, axes[1, 1], 'AMAZON'),
]
for frame, ax, title in panels:
    frame[ma_columns].plot(ax=ax)
    ax.set_title(title)

fig.tight_layout()
We see in the graph that the best values to measure the moving average are 10 and 20 days because we still
capture trends in the data without noise.
Page | 19
Page | 20