ML LAB 12 - Jupyter Notebook
ML LAB 12 - Jupyter Notebook
ML LAB-12
In [ ]:
import numpy as np
import pandas as pd
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import norm
In [ ]:
# Load the complete wine-quality CSV into `df` and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='winequalityN.csv')
df.head(n=5)
Out[3]:
type  fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  free sulfur dioxide  total sulfur dioxide  density  pH  sulphates
0 white 7.0 0.27 0.36 20.7 0.045 45.0 170.0 1.0010 3.00 0.45
1 white 6.3 0.30 0.34 1.6 0.049 14.0 132.0 0.9940 3.30 0.49
2 white 8.1 0.28 0.40 6.9 0.050 30.0 97.0 0.9951 3.26 0.44
3 white 7.2 0.23 0.32 8.5 0.058 47.0 186.0 0.9956 3.19 0.40
4 white 7.2 0.23 0.32 8.5 0.058 47.0 186.0 0.9956 3.19 0.40
In [ ]:
# Count how many wines carry each quality score; value_counts() lists the
# most frequent score first.
dfv = df.loc[:, 'quality'].value_counts()
print(dfv)
6 2836
5 2138
7 1079
4 216
8 193
3 30
9 5
In [ ]:
# Report every column that is strongly correlated (|r| > 0.7) with at least one
# column that precedes it in the correlation matrix.
#
# The matrix is computed ONCE up front: the original called df.corr() on every
# loop iteration (and again inside the condition), redoing the same work
# O(n^2) times. Only numeric columns are used, because `df` still holds the
# string-valued `type` column and DataFrame.corr() raises on non-numeric data
# in pandas >= 2.0.
corr = df.select_dtypes(include="number").corr()
for a in range(len(corr.columns)):
    # Only the lower triangle (b < a) is scanned so each pair is checked once
    # and the diagonal (always 1.0) is skipped.
    for b in range(a):
        if abs(corr.iloc[a, b]) > 0.7:
            name = corr.columns[a]
            print(name)
In [ ]:
Out[6]:
Total Percent
pH 9 0.001385
sulphates 4 0.000616
chlorides 2 0.000308
quality 0 0.000000
alcohol 0 0.000000
density 0 0.000000
type 0 0.000000
In [ ]:
# One-hot encode the categorical columns of `new_df` (built in an earlier
# cell). drop_first=True keeps k-1 indicator columns per k-level category.
next_df = pd.get_dummies(data=new_df, drop_first=True)
next_df
Out[7]:
fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  free sulfur dioxide  density  pH  sulphates  alcohol  qu
0 7.0 0.270 0.36 20.7 0.045 45.0 1.00100 3.00 0.45 8.8
1 6.3 0.300 0.34 1.6 0.049 14.0 0.99400 3.30 0.49 9.5
2 8.1 0.280 0.40 6.9 0.050 30.0 0.99510 3.26 0.44 10.1
3 7.2 0.230 0.32 8.5 0.058 47.0 0.99560 3.19 0.40 9.9
4 7.2 0.230 0.32 8.5 0.058 47.0 0.99560 3.19 0.40 9.9
... ... ... ... ... ... ... ... ... ... ...
6492 6.2 0.600 0.08 2.0 0.090 32.0 0.99490 3.45 0.58 10.5
6493 5.9 0.550 0.10 2.2 0.062 39.0 0.99512 3.52 NaN 11.2
6494 6.3 0.510 0.13 2.3 0.076 29.0 0.99574 3.42 0.75 11.0
6495 5.9 0.645 0.12 2.0 0.075 32.0 0.99547 3.57 0.71 10.2
6496 6.0 0.310 0.47 3.6 0.067 18.0 0.99549 3.39 0.66 11.0
In [ ]:
# Add a binary "best quality" flag: 1 when the wine's quality score is 7 or
# higher, otherwise 0.
#
# NOTE(review): `next_df1` is bound to the SAME object as `next_df` (plain
# assignment, no copy), so the new column is added to `next_df` as well.
next_df1 = next_df
# The scores are taken positionally from `df` (as a plain array), so `df` and
# `next_df1` must have the same number of rows.
is_best = df.quality.to_numpy() >= 7
next_df1["best quality"] = is_best.astype("int64")
print(next_df1)
0 1 0
1 1 0
2 1 0
3 1 0
4 1 0
6492 0 0
6493 0 0
6494 0 0
6495 0 0
6496 0 0
In [ ]:
In [ ]:
# Split the encoded frame into the model inputs (the columns named by
# `features`, defined in an earlier cell) and the target (`quality`).
X, y = next_df[features], next_df['quality']
In [ ]:
In [ ]:
...
In [ ]:
import numpy as np
import pandas as pd
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
Note: you may need to restart the kernel to use updated packages.
In [ ]:
import time
import random
from scipy.stats import uniform
In [ ]:
Note: you may need to restart the kernel to use updated packages.
In [ ]:
# Load a random subset of the wine-quality CSV: each data row is skipped when
# the drawn number exceeds 0.9, i.e. roughly 10% of rows are dropped.
#
# A dedicated, seeded generator is used so that Restart & Run All always loads
# the same subset (the original drew from the unseeded global RNG, so every
# run produced different data and different downstream results). The global
# `random` state is left untouched.
_sample_rng = random.Random(42)
data = pd.read_csv(
    "winequalityN.csv",
    header=0,
    # `i > 0` guards row 0 (the header line) from ever being skipped.
    skiprows=lambda i: i > 0 and _sample_rng.random() > 0.9,
)
In [ ]:
Out[27]:
In [ ]:
# Store the quality score as floating point, then preview the first ten rows.
data["quality"] = data["quality"].astype("float64")
data.head(n=10)
Out[28]:
type  fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  free sulfur dioxide  total sulfur dioxide  density  pH  sulphates
0 white 6.3 0.30 0.34 1.60 0.049 14.0 132.0 0.9940 3.30 0.49
1 white 8.1 0.28 0.40 6.90 0.050 30.0 97.0 0.9951 3.26 0.44
2 white 7.2 0.23 0.32 8.50 0.058 47.0 186.0 0.9956 3.19 0.40
3 white 8.1 0.28 0.40 6.90 0.050 30.0 97.0 0.9951 3.26 0.44
4 white 6.2 0.32 0.16 7.00 0.045 30.0 136.0 0.9949 3.18 0.47
5 white 7.0 0.27 0.36 20.70 0.045 45.0 170.0 1.0010 3.00 0.45
6 white 6.3 0.30 0.34 1.60 0.049 14.0 132.0 0.9940 3.30 0.49
7 white 8.1 0.22 0.43 1.50 0.044 28.0 129.0 0.9938 3.22 0.45
8 white 8.1 0.27 0.41 1.45 0.033 11.0 63.0 0.9908 2.99 0.56
9 white 8.6 0.23 0.40 4.20 0.035 17.0 109.0 0.9947 3.14 0.53
In [ ]:
# Build a two-column summary: one row per column of `data`, with a boolean
# telling whether that column contains any missing value.
#
# NOTE(review): this rebinds `df`, which earlier in the notebook held the
# full dataset loaded from the CSV.
df = data.isna().any(axis=0).reset_index()
df.head(n=5)
Out[29]:
index 0
0 type False
In [ ]:
Columns list:
fixed acidity
volatile acidity
citric acid
residual sugar
chlorides
pH
sulphates
In [ ]:
Mode Values:
chlorides 0.036
pH 3.2
sulphates 0.5
In [ ]:
In [ ]:
# Pairwise scatter/histogram grid for the columns at positions 4 through 9
# of `data` (the slice end, 10, is exclusive).
sns.pairplot(data=data.iloc[:, 4:10])
Out[33]:
<seaborn.axisgrid.PairGrid at 0x21b8cbc8130>
In [ ]:
# Feature matrix: every column of `data` except the `type` label.
X = data.drop(columns="type")
X.head(n=5)
Out[34]:
fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  free sulfur dioxide  total sulfur dioxide  density  pH  sulphates  alcoh
0 6.3 0.30 0.34 1.6 0.049 14.0 132.0 0.9940 3.30 0.49 9
1 8.1 0.28 0.40 6.9 0.050 30.0 97.0 0.9951 3.26 0.44 10
2 7.2 0.23 0.32 8.5 0.058 47.0 186.0 0.9956 3.19 0.40 9
3 8.1 0.28 0.40 6.9 0.050 30.0 97.0 0.9951 3.26 0.44 10
4 6.2 0.32 0.16 7.0 0.045 30.0 136.0 0.9949 3.18 0.47 9
In [ ]:
# Target vector: the `type` column of `data`, followed by the number of rows
# per distinct value.
y = data.loc[:, "type"]
y.value_counts()
Out[35]:
white 4455
red 1450
In [ ]:
Out[36]:
1 4455
0 1450
In [ ]:
In [ ]:
y_train : (4428,)
y_test : (1477,)
In [ ]:
In [ ]: