0% found this document useful (0 votes)
15 views

ML Lab-1

The document describes analyzing a dataset using Python libraries such as Pandas and Scikit-learn. It loads a CSV dataset, fills missing Age and Salary values with the column mean, label-encodes the categorical variables, splits the data into train and test sets, and fits a linear regression model and a random forest classifier to make predictions on the test set. The linear model's performance is evaluated using mean squared error.

Uploaded by

shrinkhal03
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
15 views

ML Lab-1

The document describes analyzing a dataset using Python libraries such as Pandas and Scikit-learn. It loads a CSV dataset, fills missing Age and Salary values with the column mean, label-encodes the categorical variables, splits the data into train and test sets, and fits a linear regression model and a random forest classifier to make predictions on the test set. The linear model's performance is evaluated using mean squared error.

Uploaded by

shrinkhal03
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 5

import numpy as np

import pandas as pd

# Load the raw data; the relative path "Data.csv" is resolved against the
# current working directory. The trailing bare name displays the frame.
dataset = pd.read_csv("Data.csv")
dataset

{"summary":"{\n \"name\": \"dataset\",\n \"rows\": 10,\n


\"fields\": [\n {\n \"column\": \"Country\",\n
\"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 3,\n \"samples\": [\n
\"France\",\n \"Spain\",\n \"Germany\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Age\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 7.693792591722527,\n
\"min\": 27.0,\n \"max\": 50.0,\n \"num_unique_values\":
9,\n \"samples\": [\n 50.0,\n 27.0,\n
35.0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Salary\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 12265.579661982732,\n \"min\": 48000.0,\n
\"max\": 83000.0,\n \"num_unique_values\": 9,\n
\"samples\": [\n 83000.0,\n 48000.0,\n
52000.0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Purchased\",\n \"properties\": {\n \"dtype\":
\"category\",\n \"num_unique_values\": 2,\n \"samples\":
[\n \"Yes\",\n \"No\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n }\n ]\n}","type":"dataframe","variable_name":"dataset"}

# Preview only: show Age with missing entries replaced by the column mean.
# The result is not assigned, so `dataset` itself is left untouched here.
dataset["Age"].fillna(dataset["Age"].mean())

0 44.000000
1 27.000000
2 30.000000
3 38.000000
4 40.000000
5 35.000000
6 38.777778
7 48.000000
8 50.000000
9 37.000000
Name: Age, dtype: float64

# Persist the imputation: missing ages become the mean of the observed ages.
dataset["Age"] = dataset["Age"].fillna(dataset["Age"].mean())

dataset

{"summary":"{\n \"name\": \"dataset\",\n \"rows\": 10,\n


\"fields\": [\n {\n \"column\": \"Country\",\n
\"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 3,\n \"samples\": [\n
\"France\",\n \"Spain\",\n \"Germany\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Age\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 7.253777219533095,\n
\"min\": 27.0,\n \"max\": 50.0,\n \"num_unique_values\":
10,\n \"samples\": [\n 50.0,\n 27.0,\n
35.0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Salary\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 12265.579661982732,\n \"min\": 48000.0,\n
\"max\": 83000.0,\n \"num_unique_values\": 9,\n
\"samples\": [\n 83000.0,\n 48000.0,\n
52000.0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Purchased\",\n \"properties\": {\n \"dtype\":
\"category\",\n \"num_unique_values\": 2,\n \"samples\":
[\n \"Yes\",\n \"No\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n }\n ]\n}","type":"dataframe","variable_name":"dataset"}

# Impute missing salaries with the mean of the observed salaries.
# The assignment is kept on a single line: a line break straight after `=`
# without a continuation is a SyntaxError.
dataset["Salary"] = dataset["Salary"].fillna(dataset["Salary"].mean())

dataset

{"summary":"{\n \"name\": \"dataset\",\n \"rows\": 10,\n


\"fields\": [\n {\n \"column\": \"Country\",\n
\"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 3,\n \"samples\": [\n
\"France\",\n \"Spain\",\n \"Germany\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Age\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 7.253777219533095,\n
\"min\": 27.0,\n \"max\": 50.0,\n \"num_unique_values\":
10,\n \"samples\": [\n 50.0,\n 27.0,\n
35.0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Salary\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 11564.099405562389,\n \"min\": 48000.0,\n
\"max\": 83000.0,\n \"num_unique_values\": 10,\n
\"samples\": [\n 83000.0,\n 48000.0,\n
58000.0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Purchased\",\n \"properties\": {\n \"dtype\":
\"category\",\n \"num_unique_values\": 2,\n \"samples\":
[\n \"Yes\",\n \"No\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n }\n ]\n}","type":"dataframe","variable_name":"dataset"}
from sklearn.preprocessing import LabelEncoder

# Use one encoder per column. Refitting a single LabelEncoder on a second
# column overwrites its `classes_`, which would make it impossible to map the
# encoded Purchased values (or later predictions) back to "Yes"/"No".
le = LabelEncoder()
dataset["Purchased"] = le.fit_transform(dataset["Purchased"])

# NOTE(review): LabelEncoder is documented for target labels; the integer
# codes it assigns to Country imply an ordering between countries that models
# may pick up on. Consider one-hot encoding this feature instead.
country_le = LabelEncoder()
dataset["Country"] = country_le.fit_transform(dataset["Country"])

dataset

{"summary":"{\n \"name\": \"dataset\",\n \"rows\": 10,\n


\"fields\": [\n {\n \"column\": \"Country\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
0,\n \"min\": 0,\n \"max\": 2,\n
\"num_unique_values\": 3,\n \"samples\": [\n 0,\n
2,\n 1\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 7.253777219533095,\n \"min\": 27.0,\n \"max\":
50.0,\n \"num_unique_values\": 10,\n \"samples\": [\n
50.0,\n 27.0,\n 35.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Salary\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\":
11564.099405562389,\n \"min\": 48000.0,\n \"max\":
83000.0,\n \"num_unique_values\": 10,\n \"samples\": [\n
83000.0,\n 48000.0,\n 58000.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Purchased\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
0,\n \"min\": 0,\n \"max\": 1,\n
\"num_unique_values\": 2,\n \"samples\": [\n 1,\n
0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"dataset"}

# Feature columns only: every column except the last one (Purchased).
dataset.iloc[:,:-1]

{"summary":"{\n \"name\": \"dataset\",\n \"rows\": 10,\n


\"fields\": [\n {\n \"column\": \"Country\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
0,\n \"min\": 0,\n \"max\": 2,\n
\"num_unique_values\": 3,\n \"samples\": [\n 0,\n
2,\n 1\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 7.253777219533095,\n \"min\": 27.0,\n \"max\":
50.0,\n \"num_unique_values\": 10,\n \"samples\": [\n
50.0,\n 27.0,\n 35.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Salary\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\":
11564.099405562389,\n \"min\": 48000.0,\n \"max\":
83000.0,\n \"num_unique_values\": 10,\n \"samples\": [\n
83000.0,\n 48000.0,\n 58000.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n }\n ]\n}","type":"dataframe"}

from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing. Features are all columns except the
# last one; the target is the encoded Purchased column.
x_train, x_test, y_train, y_test = train_test_split(
    dataset.iloc[:, :-1],
    dataset["Purchased"],
    test_size=0.2,
)

y_train

2 0
8 0
4 1
6 0
5 1
0 0
1 1
9 1
Name: Purchased, dtype: int64

from sklearn.linear_model import LinearRegression


# Fit an ordinary least-squares model on the training split.
# NOTE(review): y_train holds 0/1 class labels (see the y_train output), so
# this model predicts continuous scores rather than classes. A classifier such
# as LogisticRegression may be what is intended here; confirm.
lr = LinearRegression()
lr.fit(x_train, y_train)

LinearRegression()

# Display the held-out feature rows.
x_test

{"summary":"{\n \"name\": \"x_test\",\n \"rows\": 2,\n \"fields\":


[\n {\n \"column\": \"Country\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1,\n \"min\": 0,\n
\"max\": 2,\n \"num_unique_values\": 2,\n \"samples\":
[\n 2,\n 0\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n },\n {\n
\"column\": \"Age\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 7.0710678118654755,\n \"min\":
38.0,\n \"max\": 48.0,\n \"num_unique_values\": 2,\n
\"samples\": [\n 38.0,\n 48.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Salary\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\":
12727.922061357855,\n \"min\": 61000.0,\n \"max\":
79000.0,\n \"num_unique_values\": 2,\n \"samples\": [\n
61000.0,\n 79000.0\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"x_test"}
# Score the held-out rows with the fitted linear model.
predict = lr.predict(x_test)

from sklearn.metrics import mean_squared_error

# Mean squared error between the true 0/1 labels and the model's predictions.
mean_squared_error(y_test, predict)

0.29618249533841706

from sklearn.ensemble import RandomForestClassifier


# Fit a random forest classifier on the same training split.
# NOTE(review): no random_state is passed, so the fitted forest and its
# predictions can differ from run to run. max_depth=150 is far deeper than the
# 8 training rows shown in the y_train output could ever need.
rfc = RandomForestClassifier(max_depth=150)
rfc.fit(x_train,y_train)

RandomForestClassifier(max_depth=150)

# Predicted class labels for the held-out rows.
rfc.predict(x_test)

array([0, 1])

# Display the held-out feature rows again, next to the predictions above.
x_test

{"summary":"{\n \"name\": \"x_test\",\n \"rows\": 2,\n \"fields\":


[\n {\n \"column\": \"Country\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1,\n \"min\": 0,\n
\"max\": 2,\n \"num_unique_values\": 2,\n \"samples\":
[\n 2,\n 0\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n },\n {\n
\"column\": \"Age\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 7.0710678118654755,\n \"min\":
38.0,\n \"max\": 48.0,\n \"num_unique_values\": 2,\n
\"samples\": [\n 38.0,\n 48.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Salary\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\":
12727.922061357855,\n \"min\": 61000.0,\n \"max\":
79000.0,\n \"num_unique_values\": 2,\n \"samples\": [\n
61000.0,\n 79000.0\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"x_test"}

# Display the true labels of the held-out rows.
y_test

7 1
3 0
Name: Purchased, dtype: int64

You might also like