0% found this document useful (0 votes)
9 views

# Importing Necessary Libraries

Technique for ML project

Uploaded by

aakash.t.yadav
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
9 views

# Importing Necessary Libraries

Technique for ML project

Uploaded by

aakash.t.yadav
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 21

# Importing Necessary Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Import the dataset from a hard-coded CSV path and report its (rows, columns) shape
ipl_df = pd.read_csv('/content/ipl_data.csv')
print(f"Dataset successfully Imported of Shape : {ipl_df.shape}")

Dataset successfully Imported of Shape : (76014, 15)

# First 5 rows of the data


ipl_df.head()

{"summary":"{\n \"name\": \"ipl_df\",\n \"rows\": 76014,\n


\"fields\": [\n {\n \"column\": \"mid\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
178,\n \"min\": 1,\n \"max\": 617,\n
\"num_unique_values\": 617,\n \"samples\": [\n 50,\n
582,\n 83\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"date\",\n \"properties\": {\n \"dtype\": \"object\",\n
\"num_unique_values\": 442,\n \"samples\": [\n \"2014-
05-02\",\n \"2012-05-15\",\n \"2009-05-18\"\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"venue\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
35,\n \"samples\": [\n \"Shaheed Veer Narayan Singh
International Stadium\",\n \"Buffalo Park\",\n \"Dr.
Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"bat_team\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
14,\n \"samples\": [\n \"Pune Warriors\",\n
\"Rising Pune Supergiants\",\n \"Kolkata Knight Riders\"\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"bowl_team\",\n
\"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 14,\n \"samples\": [\n \"Kochi
Tuskers Kerala\",\n \"Rising Pune Supergiants\",\n
\"Royal Challengers Bangalore\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"batsman\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
411,\n \"samples\": [\n \"A Nehra\",\n \"A
Symonds\",\n \"DJ Bravo\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"bowler\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
329,\n \"samples\": [\n \"IK Pathan\",\n \"AB
McDonald\",\n \"JM Kemp\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"runs\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 48,\n \"min\": 0,\n
\"max\": 263,\n \"num_unique_values\": 252,\n
\"samples\": [\n 106,\n 21,\n 97\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"wickets\",\n \"properties\": {\n \"dtype\": \"number\",\
n \"std\": 2,\n \"min\": 0,\n \"max\": 10,\n
\"num_unique_values\": 11,\n \"samples\": [\n 5,\n
0,\n 9\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"overs\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 5.772586873852113,\n \"min\": 0.0,\n \"max\":
19.6,\n \"num_unique_values\": 140,\n \"samples\": [\n
17.6,\n 11.1,\n 5.1\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"runs_last_5\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
14,\n \"min\": 0,\n \"max\": 113,\n
\"num_unique_values\": 102,\n \"samples\": [\n 37,\n
18,\n 74\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"wickets_last_5\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 1,\n \"min\": 0,\n
\"max\": 7,\n \"num_unique_values\": 8,\n \"samples\":
[\n 1,\n 5,\n 0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"striker\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 20,\n
\"min\": 0,\n \"max\": 175,\n \"num_unique_values\":
155,\n \"samples\": [\n 95,\n 160,\n
80\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"non-striker\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 10,\n \"min\": 0,\n
\"max\": 109,\n \"num_unique_values\": 88,\n
\"samples\": [\n 69,\n 0,\n 18\n ],\
n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"total\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 29,\n
\"min\": 67,\n \"max\": 263,\n \"num_unique_values\":
138,\n \"samples\": [\n 132,\n 115,\n
154\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"ipl_df"}

# Summary statistics of the dataset


ipl_df.describe()

{"summary":"{\n \"name\": \"ipl_df\",\n \"rows\": 8,\n \"fields\":


[\n {\n \"column\": \"mid\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 6155.813307577099,\n
\"min\": 1.0,\n \"max\": 17478.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
71.40788419727657,\n 72.0,\n 17478.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"runs\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 6150.2944315706645,\n
\"min\": 0.0,\n \"max\": 17477.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
74.54814899582308,\n 70.0,\n 17477.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"wickets\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\":
6177.954883141603,\n \"min\": 0.0,\n \"max\": 17477.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
2.6637866910797046,\n 2.0,\n 17477.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"overs\",\n \"properties\": {\
n \"dtype\": \"number\",\n \"std\": 6175.826215003261,\n
\"min\": 0.0,\n \"max\": 17477.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
9.75828803570407,\n 9.6,\n 17477.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"runs_last_5\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
6167.329477872403,\n \"min\": 0.0,\n \"max\": 17477.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
32.71310865709218,\n 34.0,\n 17477.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"wickets_last_5\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
6178.484772156044,\n \"min\": 0.0,\n \"max\": 17477.0,\n
\"num_unique_values\": 7,\n \"samples\": [\n 17477.0,\
n 1.1984322252102764,\n 2.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"striker\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\":
6165.786654907889,\n \"min\": 0.0,\n \"max\": 17477.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
24.57475539280197,\n 20.0,\n 17477.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"non-striker\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
6174.403089883773,\n \"min\": 0.0,\n \"max\": 17477.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
7.979802025519254,\n 4.0,\n 17477.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"total\",\n \"properties\": {\
n \"dtype\": \"number\",\n \"std\": 6129.802688655688,\n
\"min\": 28.746713170397086,\n \"max\": 17477.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
159.5532986210448,\n 161.0,\n 17477.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n }\n ]\n}","type":"dataframe"}

# Information about each column: non-null count and dtype, plus memory usage


ipl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17478 entries, 0 to 17477
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mid 17478 non-null int64
1 date 17478 non-null object
2 venue 17478 non-null object
3 bat_team 17478 non-null object
4 bowl_team 17478 non-null object
5 batsman 17478 non-null object
6 bowler 17478 non-null object
7 runs 17477 non-null float64
8 wickets 17477 non-null float64
9 overs 17477 non-null float64
10 runs_last_5 17477 non-null float64
11 wickets_last_5 17477 non-null float64
12 striker 17477 non-null float64
13 non-striker 17477 non-null float64
14 total 17477 non-null float64
dtypes: float64(8), int64(1), object(6)
memory usage: 2.0+ MB

# Number of distinct values in each column


ipl_df.nunique()

mid 142
date 102
venue 20
bat_team 8
bowl_team 8
batsman 214
bowler 155
runs 225
wickets 11
overs 139
runs_last_5 85
wickets_last_5 7
striker 129
non-striker 59
total 84
dtype: int64

# Data types of all columns


ipl_df.dtypes

mid int64
date object
venue object
bat_team object
bowl_team object
batsman object
bowler object
runs float64
wickets float64
overs float64
runs_last_5 float64
wickets_last_5 float64
striker float64
non-striker float64
total float64
dtype: object

# Histogram (10 bins, no KDE curve) for the wickets and total columns
for column_name, plot_title in (
    ('wickets', "Wickets Distribution"),
    ('total', "Runs Distribution"),
):
    sns.displot(ipl_df[column_name], kde=False, bins=10)
    plt.title(plot_title)

    plt.show()

# Names of all columns
ipl_df.columns

Index(['mid', 'date', 'venue', 'bat_team', 'bowl_team', 'batsman',


'bowler',
'runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5',
'striker',
'non-striker', 'total'],
dtype='object')

# Columns removed before modelling; the remaining columns are the two team
# columns, the in-match counters and the `total` target.
irrelevant = ['mid', 'date', 'venue', 'batsman', 'bowler', 'striker', 'non-striker']
print(f'Before Removing Irrelevant Columns : {ipl_df.shape}')
ipl_df = ipl_df.drop(irrelevant, axis=1)  # Drop Irrelevant Columns
print(f'After Removing Irrelevant Columns : {ipl_df.shape}')
ipl_df.head()

# Define Consistent Teams
const_teams = [
    'Kolkata Knight Riders',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Mumbai Indians',
    'Kings XI Punjab',
    'Royal Challengers Bangalore',
    'Delhi Daredevils',
    'Sunrisers Hyderabad',
]

print(f'Before Removing Inconsistent Teams : {ipl_df.shape}')

# Keep a row only when BOTH the batting and the bowling side are in const_teams
ipl_df = ipl_df[(ipl_df['bat_team'].isin(const_teams)) &
                (ipl_df['bowl_team'].isin(const_teams))]
# The original message said "Irrelevant Columns" here, but this step drops
# rows for inconsistent teams, not columns.
print(f'After Removing Inconsistent Teams : {ipl_df.shape}')
print(f"Consistent Teams : \n{ipl_df['bat_team'].unique()}")
ipl_df.head()

Before Removing Inconsistent Teams : (76014, 8)


After Removing Irrelevant Columns : (53811, 8)
Consistent Teams :
['Kolkata Knight Riders' 'Chennai Super Kings' 'Rajasthan Royals'
'Mumbai Indians' 'Kings XI Punjab' 'Royal Challengers Bangalore'
'Delhi Daredevils' 'Sunrisers Hyderabad']

{"summary":"{\n \"name\": \"ipl_df\",\n \"rows\": 53811,\n


\"fields\": [\n {\n \"column\": \"bat_team\",\n
\"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 8,\n \"samples\": [\n \"Chennai
Super Kings\",\n \"Royal Challengers Bangalore\",\n
\"Kolkata Knight Riders\"\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n },\n {\n
\"column\": \"bowl_team\",\n \"properties\": {\n
\"dtype\": \"category\",\n \"num_unique_values\": 8,\n
\"samples\": [\n \"Kings XI Punjab\",\n \"Chennai
Super Kings\",\n \"Royal Challengers Bangalore\"\n ],\
n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"runs\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 49,\n
\"min\": 0,\n \"max\": 246,\n \"num_unique_values\":
239,\n \"samples\": [\n 66,\n 21,\n
20\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"wickets\",\n \"properties\": {\n \"dtype\": \"number\",\
n \"std\": 2,\n \"min\": 0,\n \"max\": 10,\n
\"num_unique_values\": 11,\n \"samples\": [\n 5,\n
0,\n 9\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"overs\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 5.773906711218136,\n \"min\": 0.0,\n \"max\":
19.6,\n \"num_unique_values\": 140,\n \"samples\": [\n
17.6,\n 11.1,\n 5.1\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"runs_last_5\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
15,\n \"min\": 0,\n \"max\": 94,\n
\"num_unique_values\": 94,\n \"samples\": [\n 67,\n
45,\n 42\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"wickets_last_5\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 1,\n \"min\": 0,\n
\"max\": 7,\n \"num_unique_values\": 8,\n \"samples\":
[\n 1,\n 5,\n 0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"total\",\n \"properties\": {\
n \"dtype\": \"number\",\n \"std\": 30,\n
\"min\": 67,\n \"max\": 246,\n \"num_unique_values\":
128,\n \"samples\": [\n 101,\n 149,\n
187\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"ipl_df"}

# Drop every row recorded before 5.0 overs
shape_before = ipl_df.shape
print(f'Before Removing Overs : {shape_before}')

has_min_overs = ipl_df['overs'] >= 5.0
ipl_df = ipl_df[has_min_overs]
print(f'After Removing Overs : {ipl_df.shape}')
ipl_df.head()

Before Removing Overs : (53811, 8)


After Removing Overs : (40108, 8)

{"summary":"{\n \"name\": \"ipl_df\",\n \"rows\": 40108,\n


\"fields\": [\n {\n \"column\": \"bat_team\",\n
\"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 8,\n \"samples\": [\n \"Chennai
Super Kings\",\n \"Royal Challengers Bangalore\",\n
\"Kolkata Knight Riders\"\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n },\n {\n
\"column\": \"bowl_team\",\n \"properties\": {\n
\"dtype\": \"category\",\n \"num_unique_values\": 8,\n
\"samples\": [\n \"Kings XI Punjab\",\n \"Chennai
Super Kings\",\n \"Royal Challengers Bangalore\"\n ],\
n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"runs\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 40,\n
\"min\": 13,\n \"max\": 246,\n \"num_unique_values\":
226,\n \"samples\": [\n 71,\n 20,\n
43\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"wickets\",\n \"properties\": {\n \"dtype\": \"number\",\
n \"std\": 1,\n \"min\": 0,\n \"max\": 10,\n
\"num_unique_values\": 11,\n \"samples\": [\n 5,\n
0,\n 9\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"overs\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 4.32300139521351,\n \"min\": 5.0,\n \"max\":
19.6,\n \"num_unique_values\": 105,\n \"samples\": [\n
10.1,\n 15.6,\n 15.5\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"runs_last_5\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
11,\n \"min\": 10,\n \"max\": 94,\n
\"num_unique_values\": 84,\n \"samples\": [\n 93,\n
59,\n 36\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"wickets_last_5\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 1,\n \"min\": 0,\n
\"max\": 7,\n \"num_unique_values\": 8,\n \"samples\":
[\n 1,\n 5,\n 0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"total\",\n \"properties\": {\
n \"dtype\": \"number\",\n \"std\": 29,\n
\"min\": 67,\n \"max\": 246,\n \"num_unique_values\":
128,\n \"samples\": [\n 101,\n 149,\n
187\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"ipl_df"}

from seaborn import heatmap

# bat_team / bowl_team still hold team-name strings at this point, so the
# correlation matrix is restricted to numeric columns explicitly instead of
# relying on the deprecated implicit default of `numeric_only`.
heatmap(data=ipl_df.corr(numeric_only=True), annot=True)

<ipython-input-28-8fde0e13fc28>:2: FutureWarning: The default value of


numeric_only in DataFrame.corr is deprecated. In a future version, it
will default to False. Select only valid columns or specify the value
of numeric_only to silence this warning.
heatmap(data=ipl_df.corr(), annot=True)

<Axes: >
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

le = LabelEncoder()
# Replace each team name with an integer code. The same encoder object is
# refit on every column, so it only remembers the last column's classes.
for col in ['bat_team', 'bowl_team']:
    ipl_df[col] = le.fit_transform(ipl_df[col])
ipl_df.head()

{"summary":"{\n \"name\": \"ipl_df\",\n \"rows\": 40108,\n


\"fields\": [\n {\n \"column\": \"bat_team\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
2,\n \"min\": 0,\n \"max\": 7,\n
\"num_unique_values\": 8,\n \"samples\": [\n 0,\n
6,\n 3\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"bowl_team\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 2,\n \"min\": 0,\n
\"max\": 7,\n \"num_unique_values\": 8,\n \"samples\":
[\n 2,\n 0,\n 6\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"runs\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 40,\n \"min\": 13,\n
\"max\": 246,\n \"num_unique_values\": 226,\n
\"samples\": [\n 71,\n 20,\n 43\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"wickets\",\n \"properties\": {\n \"dtype\": \"number\",\
n \"std\": 1,\n \"min\": 0,\n \"max\": 10,\n
\"num_unique_values\": 11,\n \"samples\": [\n 5,\n
0,\n 9\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"overs\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 4.32300139521351,\n \"min\": 5.0,\n \"max\":
19.6,\n \"num_unique_values\": 105,\n \"samples\": [\n
10.1,\n 15.6,\n 15.5\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"runs_last_5\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
11,\n \"min\": 10,\n \"max\": 94,\n
\"num_unique_values\": 84,\n \"samples\": [\n 93,\n
59,\n 36\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"wickets_last_5\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 1,\n \"min\": 0,\n
\"max\": 7,\n \"num_unique_values\": 8,\n \"samples\":
[\n 1,\n 5,\n 0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"total\",\n \"properties\": {\
n \"dtype\": \"number\",\n \"std\": 29,\n
\"min\": 67,\n \"max\": 246,\n \"num_unique_values\":
128,\n \"samples\": [\n 101,\n 149,\n
187\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"ipl_df"}

from sklearn.compose import ColumnTransformer

# One-hot encode the columns at positions 0 and 1; every other column is
# passed through unchanged.
team_encoder_step = ('encoder', OneHotEncoder(), [0, 1])
columnTransformer = ColumnTransformer([team_encoder_step], remainder='passthrough')

encoded_values = columnTransformer.fit_transform(ipl_df)
ipl_df = np.array(encoded_values)

# Column names for the encoded array: eight one-hot batting-team columns,
# eight one-hot bowling-team columns, then the untouched numeric columns.
cols = [
    'batting_team_Chennai Super Kings',
    'batting_team_Delhi Daredevils',
    'batting_team_Kings XI Punjab',
    'batting_team_Kolkata Knight Riders',
    'batting_team_Mumbai Indians',
    'batting_team_Rajasthan Royals',
    'batting_team_Royal Challengers Bangalore',
    'batting_team_Sunrisers Hyderabad',
    'bowling_team_Chennai Super Kings',
    'bowling_team_Delhi Daredevils',
    'bowling_team_Kings XI Punjab',
    'bowling_team_Kolkata Knight Riders',
    'bowling_team_Mumbai Indians',
    'bowling_team_Rajasthan Royals',
    'bowling_team_Royal Challengers Bangalore',
    'bowling_team_Sunrisers Hyderabad',
    'runs', 'wickets', 'overs',
    'runs_last_5', 'wickets_last_5', 'total',
]
df = pd.DataFrame(ipl_df, columns=cols)

# Encoded Data
df.head()

{"type":"dataframe","variable_name":"df"}

# Model inputs are every column except `total`; `total` is the target.
features = df.drop(['total'], axis=1)
labels = df['total']

from sklearn.model_selection import train_test_split

# 80/20 shuffled split. No random_state is passed, so the partition (and every
# score computed from it) changes from run to run.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.20, shuffle=True)
print(f"Training Set : {train_features.shape}\nTesting Set : {test_features.shape}")

Training Set : (32086, 21)


Testing Set : (8022, 21)

# Maps a model key to its test-set score; filled in by the evaluation cells
models = dict()

from sklearn.tree import DecisionTreeRegressor


# Decision tree regressor with default hyperparameters
tree = DecisionTreeRegressor()
# Train Model
tree.fit(train_features, train_labels)

DecisionTreeRegressor()

# Evaluate Model
# score() results are scaled to percentages and kept as strings; the stored
# value is converted back with float() when the models are compared.
train_score_tree = str(tree.score(train_features, train_labels) * 100)
test_score_tree = str(tree.score(test_features, test_labels) * 100)
# [:5] shows only the first five characters of each percentage
print(f'Train Score : {train_score_tree[:5]}%\nTest Score : {test_score_tree[:5]}%')
models["tree"] = test_score_tree

Train Score : 99.98%


Test Score : 87.48%

from sklearn.metrics import (
    mean_absolute_error as mae,
    mean_squared_error as mse,
)

# Predict once and reuse the result for all three error metrics
tree_predictions = tree.predict(test_features)
print("---- Decision Tree Regressor - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(mae(test_labels, tree_predictions)))
print("Mean Squared Error (MSE): {}".format(mse(test_labels, tree_predictions)))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(mse(test_labels, tree_predictions))))
---- Decision Tree Regressor - Model Evaluation ----
Mean Absolute Error (MAE): 3.855958613812017
Mean Squared Error (MSE): 113.42043754674644
Root Mean Squared Error (RMSE): 10.64990317076857

from sklearn.linear_model import LinearRegression


# Ordinary linear regression with default settings
linreg = LinearRegression()
# Train Model
linreg.fit(train_features, train_labels)

LinearRegression()

# Evaluate Model
# score() results are scaled to percentages and kept as strings; the stored
# value is converted back with float() when the models are compared.
train_score_linreg = str(linreg.score(train_features, train_labels) * 100)
test_score_linreg = str(linreg.score(test_features, test_labels) * 100)
# [:5] shows only the first five characters of each percentage
print(f'Train Score : {train_score_linreg[:5]}%\nTest Score : {test_score_linreg[:5]}%')
models["linreg"] = test_score_linreg

Train Score : 65.85%


Test Score : 66.14%

# Predict once and reuse the result for all three error metrics
linreg_predictions = linreg.predict(test_features)
print("---- Linear Regression - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(mae(test_labels, linreg_predictions)))
print("Mean Squared Error (MSE): {}".format(mse(test_labels, linreg_predictions)))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(mse(test_labels, linreg_predictions))))

---- Linear Regression - Model Evaluation ----


Mean Absolute Error (MAE): 13.13286821326581
Mean Squared Error (MSE): 306.74069830363675
Root Mean Squared Error (RMSE): 17.51401434005456

from sklearn.ensemble import RandomForestRegressor


# Random forest regressor with default hyperparameters; this is the model
# later used as the default in score_predict and written to disk.
forest = RandomForestRegressor()
# Train Model
forest.fit(train_features, train_labels)

RandomForestRegressor()

# Evaluate Model
# score() results are scaled to percentages and kept as strings; the stored
# value is converted back with float() when the models are compared.
train_score_forest = str(forest.score(train_features, train_labels)*100)
test_score_forest = str(forest.score(test_features, test_labels)*100)
# [:5] shows only the first five characters of each percentage
print(f'Train Score : {train_score_forest[:5]}%\nTest Score : {test_score_forest[:5]}%')
models["forest"] = test_score_forest
Train Score : 99.04%
Test Score : 93.78%

# Predict once and reuse the result for all three error metrics
forest_predictions = forest.predict(test_features)
print("---- Random Forest Regression - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(mae(test_labels, forest_predictions)))
print("Mean Squared Error (MSE): {}".format(mse(test_labels, forest_predictions)))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(mse(test_labels, forest_predictions))))

---- Random Forest Regression - Model Evaluation ----


Mean Absolute Error (MAE): 4.4322614269093314
Mean Squared Error (MSE): 56.283472061009306
Root Mean Squared Error (RMSE): 7.502231138868577

from sklearn.svm import SVR


# Support vector regressor with default hyperparameters
svm = SVR()
# Train Model
svm.fit(train_features, train_labels)

SVR()

# NOTE: this cell repeats the preceding SVR cell verbatim; running it
# replaces `svm` with a new SVR instance and fits it again on the same data.
from sklearn.svm import SVR


svm = SVR()
# Train Model
svm.fit(train_features, train_labels)

SVR()

# Evaluate Model
# score() results are scaled to percentages and kept as strings; the stored
# value is converted back with float() when the models are compared.
train_score_svm = str(svm.score(train_features, train_labels)*100)
test_score_svm = str(svm.score(test_features, test_labels)*100)
# [:5] shows only the first five characters of each percentage
print(f'Train Score : {train_score_svm[:5]}%\nTest Score : {test_score_svm[:5]}%')
models["svm"] = test_score_svm

Train Score : 57.34%


Test Score : 57.79%

# Predict once and reuse the result for all three error metrics
svm_predictions = svm.predict(test_features)
print("---- Support Vector Regression - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(mae(test_labels, svm_predictions)))
print("Mean Squared Error (MSE): {}".format(mse(test_labels, svm_predictions)))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(mse(test_labels, svm_predictions))))

---- Support Vector Regression - Model Evaluation ----


Mean Absolute Error (MAE): 14.746880132948581
Mean Squared Error (MSE): 382.4076559758673
Root Mean Squared Error (RMSE): 19.555246251987402
from xgboost import XGBRegressor
# XGBoost regressor constructed with no explicit hyperparameters
xgb = XGBRegressor()
# Train Model
xgb.fit(train_features, train_labels)

XGBRegressor(base_score=None, booster=None, callbacks=None,


colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None,
max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)

# Evaluate Model
# score() results are scaled to percentages and kept as strings; the stored
# value is converted back with float() when the models are compared.
train_score_xgb = str(xgb.score(train_features, train_labels)*100)
test_score_xgb = str(xgb.score(test_features, test_labels)*100)
# [:5] shows only the first five characters of each percentage
print(f'Train Score : {train_score_xgb[:5]}%\nTest Score : {test_score_xgb[:5]}%')
models["xgb"] = test_score_xgb

Train Score : 88.63%


Test Score : 85.09%

# Predict once and reuse the result for all three error metrics
xgb_predictions = xgb.predict(test_features)
print("---- XGB Regression - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(mae(test_labels, xgb_predictions)))
print("Mean Squared Error (MSE): {}".format(mse(test_labels, xgb_predictions)))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(mse(test_labels, xgb_predictions))))

---- XGB Regression - Model Evaluation ----


Mean Absolute Error (MAE): 8.323914393555997
Mean Squared Error (MSE): 135.04082317310105
Root Mean Squared Error (RMSE): 11.620706655496518

from sklearn.neighbors import KNeighborsRegressor


# K-nearest-neighbours regressor with default hyperparameters
knr = KNeighborsRegressor()
# Train Model
knr.fit(train_features, train_labels)

KNeighborsRegressor()
# Evaluate Model
# score() results are scaled to percentages and kept as strings; the stored
# value is converted back with float() when the models are compared.
train_score_knr = str(knr.score(train_features, train_labels)*100)
test_score_knr = str(knr.score(test_features, test_labels)*100)
# [:5] shows only the first five characters of each percentage
print(f'Train Score : {train_score_knr[:5]}%\nTest Score : {test_score_knr[:5]}%')
models["knr"] = test_score_knr

Train Score : 86.83%


Test Score : 77.01%

# Predict once and reuse the result for all three error metrics
knr_predictions = knr.predict(test_features)
print("---- KNR - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(mae(test_labels, knr_predictions)))
print("Mean Squared Error (MSE): {}".format(mse(test_labels, knr_predictions)))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(mse(test_labels, knr_predictions))))

---- KNR - Model Evaluation ----


Mean Absolute Error (MAE): 10.000822737471953
Mean Squared Error (MSE): 208.29277985539767
Root Mean Squared Error (RMSE): 14.432351847685728

import matplotlib.pyplot as plt

# One bar per entry in `models`; the stored score strings are parsed to floats
model_names = [model_key for model_key in models]
accuracy = [float(score_text) for score_text in models.values()]
# creating the bar plot
plt.bar(model_names, accuracy)

<BarContainer object of 6 artists>


# Slot order of the one-hot team encoding, used for both the batting and the
# bowling block of the feature vector.
_TEAM_ORDER = (
    'Chennai Super Kings',
    'Delhi Daredevils',
    'Kings XI Punjab',
    'Kolkata Knight Riders',
    'Mumbai Indians',
    'Rajasthan Royals',
    'Royal Challengers Bangalore',
    'Sunrisers Hyderabad',
)


def _one_hot_team(team, role):
    """Return the 8-element one-hot list for *team*; raise ValueError if unknown."""
    if team not in _TEAM_ORDER:
        raise ValueError(
            f"Unknown {role} team {team!r}; expected one of {list(_TEAM_ORDER)}"
        )
    return [1 if name == team else 0 for name in _TEAM_ORDER]


def score_predict(batting_team, bowling_team, runs, wickets, overs,
                  runs_last_5, wickets_last_5, model=None):
    """Predict a score from the current match state using ``model``.

    The feature row is laid out as: 8 one-hot batting-team slots, 8 one-hot
    bowling-team slots, then ``runs, wickets, overs, runs_last_5,
    wickets_last_5`` (21 values in total).

    Args:
        batting_team: Name of the batting side; must be one of the eight
            supported team names.
        bowling_team: Name of the bowling side; same constraint.
        runs: Value for the ``runs`` feature.
        wickets: Value for the ``wickets`` feature.
        overs: Value for the ``overs`` feature.
        runs_last_5: Value for the ``runs_last_5`` feature.
        wickets_last_5: Value for the ``wickets_last_5`` feature.
        model: Object exposing ``predict(X)``. When omitted, the module-level
            ``forest`` is used.

    Returns:
        The first prediction, rounded to the nearest integer.

    Raises:
        ValueError: If either team name is not supported. Previously such a
            name was skipped silently, producing a too-short feature row.
    """
    if model is None:
        # Looked up at call time so a re-trained `forest` is picked up, rather
        # than whichever object existed when the function was defined.
        model = forest

    prediction_array = (
        _one_hot_team(batting_team, 'batting')
        + _one_hot_team(bowling_team, 'bowling')
        + [runs, wickets, overs, runs_last_5, wickets_last_5]
    )
    # predict() expects a 2-D input: a single row with all features
    prediction_array = np.array([prediction_array])
    pred = model.predict(prediction_array)
    return int(round(pred[0]))

# Example prediction: Delhi Daredevils batting against Chennai Super Kings
batting_team = 'Delhi Daredevils'
bowling_team = 'Chennai Super Kings'
score = score_predict(
    batting_team,
    bowling_team,
    runs=68,
    wickets=3,
    overs=10.2,
    runs_last_5=29,
    wickets_last_5=1,
)
print(f'Predicted Score : {score} || Actual Score : 147')

Predicted Score : 151 || Actual Score : 147

/usr/local/lib/python3.10/dist-packages/sklearn/base.py:439:
UserWarning: X does not have valid feature names, but
RandomForestRegressor was fitted with feature names
warnings.warn(

# Example prediction: Mumbai Indians batting against Kings XI Punjab
batting_team = 'Mumbai Indians'
bowling_team = 'Kings XI Punjab'
score = score_predict(
    batting_team,
    bowling_team,
    runs=113,
    wickets=2,
    overs=12.3,
    runs_last_5=55,
    wickets_last_5=0,
)
print(f'Predicted Score : {score} || Actual Score : 176')

Predicted Score : 188 || Actual Score : 176

/usr/local/lib/python3.10/dist-packages/sklearn/base.py:439:
UserWarning: X does not have valid feature names, but
RandomForestRegressor was fitted with feature names
warnings.warn(

# Example prediction: Kings XI Punjab batting against Rajasthan Royals
batting_team = "Kings XI Punjab"
bowling_team = "Rajasthan Royals"
score = score_predict(
    batting_team,
    bowling_team,
    runs=118,
    wickets=1,
    overs=14.0,
    runs_last_5=45,
    wickets_last_5=0,
)
print(f'Predicted Score : {score} || Actual Score : 185')

Predicted Score : 175 || Actual Score : 185


/usr/local/lib/python3.10/dist-packages/sklearn/base.py:439:
UserWarning: X does not have valid feature names, but
RandomForestRegressor was fitted with feature names
warnings.warn(

# Example prediction: Kolkata Knight Riders batting against Chennai Super Kings
batting_team = "Kolkata Knight Riders"
bowling_team = "Chennai Super Kings"
score = score_predict(
    batting_team,
    bowling_team,
    runs=150,
    wickets=4,
    overs=18.0,
    runs_last_5=57,
    wickets_last_5=1,
)
print(f'Predicted Score : {score} || Actual Score : 172')

Predicted Score : 172 || Actual Score : 172

/usr/local/lib/python3.10/dist-packages/sklearn/base.py:439:
UserWarning: X does not have valid feature names, but
RandomForestRegressor was fitted with feature names
warnings.warn(

# Example prediction: Delhi Daredevils batting against Mumbai Indians
batting_team = 'Delhi Daredevils'
bowling_team = 'Mumbai Indians'
score = score_predict(
    batting_team,
    bowling_team,
    runs=96,
    wickets=8,
    overs=18.0,
    runs_last_5=18,
    wickets_last_5=4,
)
print(f'Predicted Score : {score} || Actual Score : 110')

Predicted Score : 108 || Actual Score : 110

/usr/local/lib/python3.10/dist-packages/sklearn/base.py:439:
UserWarning: X does not have valid feature names, but
RandomForestRegressor was fitted with feature names
warnings.warn(

# Example prediction: Kings XI Punjab batting against Chennai Super Kings
batting_team = 'Kings XI Punjab'
bowling_team = 'Chennai Super Kings'
score = score_predict(
    batting_team,
    bowling_team,
    runs=129,
    wickets=6,
    overs=18.0,
    runs_last_5=34,
    wickets_last_5=2,
)
print(f'Predicted Score : {score} || Actual Score : 153')

Predicted Score : 148 || Actual Score : 153

/usr/local/lib/python3.10/dist-packages/sklearn/base.py:439:
UserWarning: X does not have valid feature names, but
RandomForestRegressor was fitted with feature names
warnings.warn(

# Example prediction: Sunrisers Hyderabad batting against Royal Challengers Bangalore
batting_team = 'Sunrisers Hyderabad'
bowling_team = 'Royal Challengers Bangalore'
score = score_predict(
    batting_team,
    bowling_team,
    runs=67,
    wickets=3,
    overs=10.5,
    runs_last_5=29,
    wickets_last_5=1,
)
print(f'Predicted Score : {score} || Actual Score : 146')

Predicted Score : 153 || Actual Score : 146


/usr/local/lib/python3.10/dist-packages/sklearn/base.py:439:
UserWarning: X does not have valid feature names, but
RandomForestRegressor was fitted with feature names
warnings.warn(

import pickle

filename = "ml_model.pkl"
# Serialize the fitted random forest to disk. The context manager closes the
# file handle even if pickling raises, instead of leaving it open.
with open(filename, "wb") as model_file:
    pickle.dump(forest, model_file)

You might also like