07/02/2023, 23:27 BTVN1 - Colaboratory
import numpy as np
import pandas as pd
import sklearn
import scipy
import matplotlib.pyplot as plt
import statistics
from sklearn.datasets import load_boston
boston = load_boston();
/usr/local/lib/python3.8/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function load_boston is deprecated; `l
The Boston housing prices dataset has an ethical problem. You can refer to
the documentation of this function for further details.
The scikit-learn maintainers therefore strongly discourage the use of this
dataset unless the purpose of the code is to study and educate about
ethical issues in data science and machine learning.
In this special case, you can fetch the dataset from the original
source::
import pandas as pd
import numpy as np
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
Alternative datasets include the California housing dataset (i.e.
:func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
dataset. You can load the datasets as follows::
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
for the California housing dataset and::
from sklearn.datasets import fetch_openml
housing = fetch_openml(name="house_prices", as_frame=True)
for the Ames housing dataset.
warnings.warn(msg, category=FutureWarning)
x = boston.data
y = boston.target
print("min y: ", np.min(y))
print("max y: ", np.max(y))
print("trung binh cua y: ", np.mean(y))
print("trung vi cua y: ", np.median(y))
print("mode cua y: ", statistics.mode(y))
print("phuong sai cua y: ", np.var(y))
print("do lech chuan cua y: ", np.std(y))
print("he so tuong quan cua y: ", np.cov(y))
min y: 5.0
max y: 50.0
trung binh cua y: 22.532806324110673
trung vi cua y: 21.2
mode cua y: 50.0
phuong sai cua y: 84.41955615616554
do lech chuan cua y: 9.188011545278203
he so tuong quan cua y: 84.58672359409846
#min
min = 1e9
for i in y:
if (i < min):
https://colab.research.google.com/drive/1TunkxkXexb5FlH_g8lO4LhgPqtTAmvmV#scrollTo=xV6gjZIYqnrN&printMode=true 1/4
07/02/2023, 23:27 BTVN1 - Colaboratory
min = i
print(min)
5.0
#max
max = -1e9
for i in y:
if (i > max):
max = i
print(max)
50.0
#mean
print("trung binh cua y: ", sum(y)/len(y))
trung binh cua y: 22.532806324110673
#median
y.sort()
n = len(y)
if n % 2 == 0:
median = (y[n//2 - 1] + y[n//2]) / 2
else:
median = y[n//2]
print("trung vi cua y: ", median)
trung vi cua y: 21.2
from collections import Counter
n = len(y)
data = Counter(y)
get_mode = dict(data)
mode = [k for k, v in get_mode.items() if v == np.max(list(data.values()))]
if len(mode) == n:
get_mode = "no mode found"
else:
get_mode = "mode is / are: " + ', '.join(map(str, mode))
print(get_mode)
mode is / are: 50.0
#variance
print("phuong sai cua y: ", sum((np.mean(y) - i)**2 for i in y)/len(y))
phuong sai cua y: 84.41955615616554
#standard deviation
import math
print("do lech chuan cua y: ", math.sqrt(sum((np.mean(y) - i)**2 for i in y)/len(y)))
do lech chuan cua y: 9.188011545278203
data = pd.DataFrame(boston.data)
data.columns = boston.feature_names
data.head
<bound method NDFrame.head of CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0
.. ... ... ... ... ... ... ... ... ... ...
501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0
505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0
PTRATIO B LSTAT
0 15.3 396.90 4.98
https://colab.research.google.com/drive/1TunkxkXexb5FlH_g8lO4LhgPqtTAmvmV#scrollTo=xV6gjZIYqnrN&printMode=true 2/4
07/02/2023, 23:27 BTVN1 - Colaboratory
1 17.8 396.90 9.14
2 17.8 392.83 4.03
3 18.7 394.63 2.94
4 18.7 396.90 5.33
.. ... ... ...
501 21.0 391.99 9.67
502 21.0 396.90 9.08
503 21.0 396.90 5.64
504 21.0 393.45 6.48
505 21.0 396.90 7.88
[506 rows x 13 columns]>
z = data.CRIM
#correlation coefficient
def correlation(x, y):
mean_x = sum(x)/float(len(x))
mean_y = sum(y)/float(len(y))
sub_x = [i-mean_x for i in x]
sub_y = [i-mean_y for i in y]
numerator = sum([sub_x[i]*sub_y[i] for i in range(len(sub_x))])
std_deviation_x = sum([sub_x[i]**2.0 for i in range(len(sub_x))])
std_deviation_y = sum([sub_y[i]**2.0 for i in range(len(sub_y))])
denominator = (std_deviation_x*std_deviation_y)**0.5
cor = numerator/denominator
return cor
print("he so tuong quan (y,z): ", correlation(y,z))
he so tuong quan (y,z): 0.2883473338560153
#Histogram
fig = plt.figure(figsize =(10,7))
plt.hist(z, bins=25, color='grey')
plt.title("crime rate")
plt.xlabel("cRIM")
plt.ylabel("frequency")
plt.show()
#Boxplot
plt.boxplot(z)
plt.title("crime rate")
plt.ylabel("crime")
plt.show()
https://colab.research.google.com/drive/1TunkxkXexb5FlH_g8lO4LhgPqtTAmvmV#scrollTo=xV6gjZIYqnrN&printMode=true 3/4
07/02/2023, 23:27 BTVN1 - Colaboratory
Các sản phẩm có tính phí của Colab - Huỷ hợp đồng tại đây
check 0 giây hoàn thành lúc 23:27
https://colab.research.google.com/drive/1TunkxkXexb5FlH_g8lO4LhgPqtTAmvmV#scrollTo=xV6gjZIYqnrN&printMode=true 4/4