Virat Kohli
Virat Kohli
Virat Kohli
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
In [2]:
import warnings
# Suppress every warning for the rest of the session. Be aware that this also
# hides any deprecation notices the libraries used below may emit.
warnings.filterwarnings('ignore')
In [3]:
df = pd.read_csv("virat_centuries.csv")
In [4]:
df.head()
Out[4]:
24-
0 116 Out Australia 6 2 NaN Adelaide Oval Adelaide Away 01-
2012
M. 31-
New
1 103 Out 5 2 NaN Chinnaswamy Bangalore Home 08-
Zealand
Stadium 2012
Vidarbha
13-
Cricket
2 103 Out England 5 2 NaN Nagpur Home 12-
Association
2012
Stadium
M. A. 22-
3 107 Out Australia 5 2 NaN Chidambaram Chennai Home 02-
Stadium 2013
18-
South Wanderers
4 119 Out 4 1 NaN Johannesburg Away 12-
Africa Stadium
2013
In [5]:
df.tail()
Out[5]:
Zohur
71 113 Out Bangladesh 3 1 124.80 Ahmed Chittagong Away
Chaudhary 20
73 166 Not Out Sri Lanka 3 1 150.91 Green field Thiruvanantpuram Home
20
Queen's
75 121 Out West Indies 4 1 58.74 Port of Spain Away
Park Oval
20
In [6]:
df.shape
Out[6]:
(76, 14)
In [7]:
df.columns
Out[7]:
In [8]:
df.duplicated().sum()
Out[8]:
In [9]:
df.isnull().sum()
Out[9]:
Score 0
Out/Not Out 0
Against 0
Batting Order 0
Inn. 0
Strike Rate 27
Venue 0
Column1 0
H/A 0
Date 0
Result 0
Format 0
Man of the Match 0
Captain 0
dtype: int64
In [10]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76 entries, 0 to 75
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Score 76 non-null int64
1 Out/Not Out 76 non-null object
2 Against 76 non-null object
3 Batting Order 76 non-null int64
4 Inn. 76 non-null int64
5 Strike Rate 49 non-null float64
6 Venue 76 non-null object
7 Column1 76 non-null object
8 H/A 76 non-null object
9 Date 76 non-null object
10 Result 76 non-null object
11 Format 76 non-null object
12 Man of the Match 76 non-null object
13 Captain 76 non-null object
dtypes: float64(1), int64(3), object(10)
memory usage: 8.4+ KB
In [11]:
In [12]:
df.describe()
Out[12]:
# Mapping from the column headers as loaded from the CSV (keys) to the more
# descriptive names used by the rest of the notebook (values). Note that the
# generic header 'Column1' is relabelled as 'City'.
new_column_names = {
'Score': 'Batting Score',
'Out/Not Out': 'Batting Status',
'Against': 'Opponent Team',
'Batting Order': 'Batting Position',
'Inn.': 'Inning Number',
'Strike Rate': 'Batting Strike Rate',
'Venue': 'Match Venue',
'Column1': 'City',
'H/A': 'Home/Away',
'Date': 'Match Date',
'Result': 'Match Result',
'Format': 'Match Format',
'Man of the Match': 'Player of the Match',
'Captain': 'Team Captain'
}
In [14]:
# Apply the descriptive column names. The frame is rebound to the renamed copy
# rather than mutated with inplace=True, which keeps the step explicit and
# chainable; labels missing from the mapping are left untouched by rename().
df = df.rename(columns=new_column_names)
In [15]:
df.nunique()
Out[15]:
Batting Score 50
Batting Status 2
Opponent Team 10
Batting Position 5
Inning Number 4
Batting Strike Rate 50
Match Venue 49
City 47
Home/Away 2
Match Date 75
Match Result 6
Match Format 3
Player of the Match 2
Team Captain 2
dtype: int64
In [16]:
column_data_types = df.dtypes
In [17]:
# List the distinct values present in every categorical column.
for column in categorical_columns:
    print(column, '- unique values are:')
    print(df[column].unique())
    print()
Batting Status - unique values are:
['Out' 'Not Out']
In [19]:
# Show how often each value occurs in every categorical column.
for column in categorical_columns:
    print(column, '- value counts are:')
    print(df[column].value_counts())
    print()
# Draw one seaborn count plot per categorical column. The 'Match Date' column
# is skipped, as in the original loop.
for column in categorical_columns:
    if column == 'Match Date':
        continue
    print(column, '- Countplot:')
    plt.figure(figsize=(15, 6))
    # Name the variable through x= together with data=df. Passing the Series
    # positionally while also giving data= collides with the `data` parameter
    # in seaborn releases where `data` is the first positional argument.
    sns.countplot(data=df, x=column, palette='hls')
    plt.xticks(rotation=90)
    plt.show()
for i in categorical_columns:
if i != 'Match Date':
print(i, '- Pieplot:')
plt.figure(figsize=(10, 10))
counts = df[i].value_counts()
plt.pie(counts, labels=counts.index, autopct='%1.1f%%', colors=sns.color_palette
plt.title(i)
plt.show()
In [22]:
for i in categorical_columns:
if i != 'Match Date':
fig = go.Figure(data=[go.Bar(x=df[i].value_counts().index, y=df[i].value_counts(
fig.update_layout(title=i,xaxis_title="Categories",yaxis_title="Count")
fig.show()
Batting Status
50
40
Count
30
In [23]:
# One interactive plotly pie chart per categorical column ('Match Date' is
# left out), sized by how often each value occurs.
for column in categorical_columns:
    if column == 'Match Date':
        continue
    category_counts = df[column].value_counts()
    pie_trace = go.Pie(labels=category_counts.index, values=category_counts)
    fig = go.Figure(data=[pie_trace])
    fig.update_layout(title=column)
    fig.show()
Batting Status
27.6%
72 4%
In [24]:
# Histogram with a KDE overlay for every numerical column.
for column in numerical_columns:
    plt.figure(figsize=(15, 6))
    # `palette` is only used by histplot when a `hue` variable is assigned, so
    # the former palette='hls' argument had no effect and has been dropped.
    sns.histplot(data=df, x=column, kde=True, bins=20)
    plt.xticks(rotation=90)
    plt.show()
In [25]:
# Density-scaled histogram with a KDE overlay for every numerical column.
for column in numerical_columns:
    plt.figure(figsize=(15, 6))
    # sns.distplot is deprecated. histplot with stat='density' and kde=True is
    # the documented replacement; cut=3 lets the KDE curve extend past the
    # data range the way distplot's curve did.
    sns.histplot(
        data=df,
        x=column,
        kde=True,
        bins=20,
        stat='density',
        kde_kws={'cut': 3},
    )
    plt.xticks(rotation=90)
    plt.show()
In [26]:
# Horizontal box plot for every numerical column.
for column in numerical_columns:
    plt.figure(figsize=(15, 6))
    # Select the column with x= alongside data=df. A positional Series plus
    # data= clashes with the `data` parameter in seaborn releases where
    # `data` is the first positional argument.
    sns.boxplot(data=df, x=column, palette='hls')
    plt.xticks(rotation=90)
    plt.show()
In [27]:
# Horizontal violin plot for every numerical column.
for column in numerical_columns:
    plt.figure(figsize=(15, 6))
    # Select the column with x= alongside data=df. A positional Series plus
    # data= clashes with the `data` parameter in seaborn releases where
    # `data` is the first positional argument.
    sns.violinplot(data=df, x=column, palette='hls')
    plt.xticks(rotation=90)
    plt.show()
In [28]:
20
15
Frequency
10
In [29]:
260
240
220
200
Batting Score
180
160
In [30]:
250
Batting Score
200
150
In [31]:
# Bar plot of every numerical column against every categorical column
# ('Match Date' excluded); error bars are switched off.
# NOTE(review): newer seaborn releases deprecate `ci` in favour of `errorbar`;
# confirm the installed version before changing the argument.
for value_col in numerical_columns:
    for group_col in categorical_columns:
        if group_col == 'Match Date':
            continue
        plt.figure(figsize=(15, 6))
        sns.barplot(
            x=df[group_col],
            y=df[value_col],
            ci=None,
            data=df,
            palette='hls',
        )
        plt.xticks(rotation=90)
        plt.show()
In [32]:
# Box plot of every numerical column grouped by every categorical column
# ('Match Date' excluded).
for value_col in numerical_columns:
    for group_col in categorical_columns:
        if group_col == 'Match Date':
            continue
        plt.figure(figsize=(15, 6))
        sns.boxplot(
            x=df[group_col],
            y=df[value_col],
            data=df,
            palette='hls',
        )
        plt.xticks(rotation=90)
        plt.show()
In [33]:
# Violin plot of every numerical column grouped by every categorical column
# ('Match Date' excluded).
for value_col in numerical_columns:
    for group_col in categorical_columns:
        if group_col == 'Match Date':
            continue
        plt.figure(figsize=(15, 6))
        sns.violinplot(
            x=df[group_col],
            y=df[value_col],
            data=df,
            palette='hls',
        )
        plt.xticks(rotation=90)
        plt.show()
In [34]:
for i in numerical_columns:
for j in categorical_columns:
fig = px.bar(df, x=j, y=i, title=f'{i} vs {j} - Bar Plot', labels={j: 'Category'
fig.show()
7000
6000
5000
Value
4000
3000
In [35]:
for i in numerical_columns:
for j in categorical_columns:
fig = px.box(df, x=j, y=i, title=f'{i} vs {j} - Bar Plot', labels={j: 'Category'
fig.show()
260
240
220
200
Value
180
160
In [36]:
for i in numerical_columns:
for j in categorical_columns:
fig = px.violin(df, x=j, y=i, title=f'{i} vs {j} - Bar Plot', labels={j: 'Catego
fig.show()
250
200
Value
150
In [37]:
# Annotated heatmap of the crosstab table; integer cell labels via fmt='d'.
# `crosstab_result`, `col1` and `col2` come from an earlier cell.
plt.figure(figsize=(10, 6))
heatmap_ax = sns.heatmap(crosstab_result, annot=True, cmap="YlGnBu", fmt='d')
heatmap_ax.set_title(f"Heatmap: Crosstab Analysis between '{col1}' and '{col2}'")
heatmap_ax.set_xlabel(col2)
heatmap_ax.set_ylabel(col1)
plt.show()
Opponent Team Pakistan South Africa Sri Lanka West Indies Zimbabw
e
Batting Status
Not Out 0 3 7 3
0
Out 2 4 8 9
1
In [38]:
fig = px.imshow(crosstab_result,
x=crosstab_result.columns,
y=crosstab_result.index,
color_continuous_scale="YlGnBu",
title=f"Heatmap: Crosstab Analysis between '{col1}' and '{co
fig.update_xaxes(side="top")
fig.show()
Opponent Team
Ba Ne So W
Af Au ng w ut Sr
ga En Ze Pa hA iL
nis stra lad gla a kist a
tan es nd lan an fric nk
lia h d a a
Batting Status
Not Out
Out
In [39]:
In [40]:
In [41]:
# Count the rows per year and per (year, month) pair, each as a tidy frame
# with the count stored in a 'Matches' column.
rows_by_year = df.groupby('Year').size()
matches_per_year = rows_by_year.reset_index(name='Matches')

rows_by_year_month = df.groupby(['Year', 'Month']).size()
matches_per_month = rows_by_year_month.reset_index(name='Matches')
In [42]:
10
8
Matches
4
In [43]:
1.8
1.6
Matches
1.4
In [44]:
# Pairwise correlation of the numeric columns, shown as an annotated heatmap.
# The frame also contains object-typed (text) columns; DataFrame.corr() no
# longer drops those silently in pandas 2.x and raises instead, so the numeric
# columns are selected explicitly before computing the correlation.
corr_matrix = df.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()
In [45]:
3.5
3
Inning Number
2.5
1.5
In [46]:
260
240
220
200
Batting Score
180
160
140