Bollywood and Heart Data Analysis
Bollywood and Heart Data Analysis
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
import warnings
warnings.filterwarnings('ignore')
warnings.warn('DelftStack')
warnings.warn('Do not show this message')
print("No Warning Shown")
No Warning Shown
In [2]:
BW = pd.read_csv('bollywood.csv')
BW.head()
Out[2]: Release
SlNo MovieName ReleaseTime Genre Budget BoxOfficeCollection YoutubeViews You
Date
18-Apr-
0 1 2 States LW Romance 36 104.00 8576361
14
4-Jan-
1 2 Table No. 21 N Thriller 10 12.00 1087320
13
4-Jan- Rajdhani
3 4 N Drama 7 0.35 42626
13 Express
Bobby
4 5 4-Jul-14 N Comedy 18 10.80 3113427
Jasoos
In [3]:
print(BW.shape)
BW.info()
(149, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SlNo 149 non-null int64
1 Release Date 149 non-null object
2 MovieName 149 non-null object
3 ReleaseTime 149 non-null object
4 Genre 149 non-null object
5 Budget 149 non-null int64
6 BoxOfficeCollection 149 non-null float64
7 YoutubeViews 149 non-null int64
8 YoutubeLikes 149 non-null int64
9 YoutubeDislikes 149 non-null int64
In [4]:
Movies_by_genre = BW.groupby('Genre')['MovieName'].count().reset_index(name="MovieName_
print(Movies_by_genre.sort_values('MovieName_count',ascending = False))
sns.set_context("paper", font_scale= 1.5)
plt.title("MovieName_count vs Month")
sns.barplot(x='Genre',y ='MovieName_count', data = Movies_by_genre)
plt.xticks(rotation= 80)
plt.show()
Movies_by_genre['MovieName_count'].max()
Genre MovieName_count
3 Comedy 36
0 Drama 35
5 Thriller 26
4 Romance 25
1 Action 21
2 Action 3
6 Thriller 3
Out[4]: 36
In [5]:
cross_tab = pd.crosstab(BW.Genre, BW.ReleaseTime)
cross_tab
Out[5]: ReleaseTime FS HS LW N
Genre
Drama 4 6 1 24
Action 3 3 3 12
Action 0 0 0 3
Comedy 3 5 5 23
Romance 3 3 4 15
ReleaseTime FS HS LW N
Genre
Thriller 4 1 1 20
Thriller 0 0 1 2
In [6]:
BW['Month'] = pd.DatetimeIndex(BW['Release Date']).month
BW.head(2)
Movies_by_month = BW.groupby('Month')['MovieName'].count().reset_index(name="Movie_coun
print(Movies_by_month.sort_values('Movie_count',ascending = False))
sns.set_context("paper", font_scale= 1.5)
plt.title("Movie Count vs Month")
sns.barplot(x='Month',y ='Movie_count', data = Movies_by_month)
plt.show()
Movies_by_month['Movie_count'].max()
Month Movie_count
0 1 20
2 3 19
4 5 18
1 2 16
6 7 16
3 4 11
5 6 10
8 9 10
10 11 10
9 10 9
7 8 8
11 12 2
Out[6]: 20
In [7]:
High_budget = BW[(BW['Budget'] > 25)]
HighBudgetMovies_by_month = High_budget.groupby('Month')['MovieName'].count().reset_ind
print(HighBudgetMovies_by_month.sort_values('Movie_count',ascending = False))
sns.set_context("paper", font_scale= 1.5)
plt.title("Movie Count vs Month")
sns.barplot(x='Month',y ='Movie_count', data = HighBudgetMovies_by_month)
Month Movie_count
1 2 9
7 8 7
0 1 6
2 3 6
6 7 6
10 11 6
5 6 5
3 4 4
8 9 4
9 10 4
4 5 3
11 12 2
Out[7]: 9
In [8]:
BW['ROI'] = (BW['BoxOfficeCollection']-BW['Budget'])/BW['Budget']
Top10_ROI = BW.sort_values('ROI',ascending = False)
Top10 = Top10_ROI[['MovieName','ROI','ReleaseTime']].head(10)
Top10
64 Aashiqui 2 8.166667 N
89 PK 7.647059 HS
87 Fukrey 6.240000 N
In [9]:
cross_tab_ROI = pd.crosstab(Top10.ROI,Top10.ReleaseTime)
print(cross_tab_ROI)
Avg_ROI = Top10.groupby('ReleaseTime')['ROI'].mean()
Avg_ROI
ReleaseTime FS HS LW N
ROI
4.266667 1 0 0 0
4.466667 0 0 0 1
5.500000 0 0 0 1
5.666667 1 0 0 0
5.933333 0 0 0 1
6.240000 0 0 0 1
7.500000 0 0 0 1
7.514286 0 0 1 0
7.647059 0 1 0 0
8.166667 0 0 0 1
Out[9]: ReleaseTime
FS 4.966667
HS 7.647059
LW 7.514286
N 6.301111
Name: ROI, dtype: float64
In [27]:
sns.set_context("paper", font_scale= 1.5)
plt.title("Histogram+Density Plot(Budget)")
sns.distplot(BW['Budget'], hist = True, color ='r')
In [11]:
Comedy_ROI = BW[(BW['Genre'] == 'Comedy')]
localhost:8888/nbconvert/html/Bollywood and Heart Data Analysis.ipynb?download=false 5/15
9/20/22, 8:59 PM Bollywood and Heart Data Analysis
Drama_ROI = BW[(BW['Genre'] == ' Drama ')]
Drama_ROI.head(2)
plt.figure(figsize=(12,10))
sns.distplot(Drama_ROI['ROI'], hist = True, color = 'r', label = 'Drama')
sns.distplot(Comedy_ROI['ROI'], hist = True, color = 'b', label = 'Comedy')
plt.title('Drama vs Comedy', fontsize = 16)
plt.xlabel('Values', fontsize = 14)
plt.ylabel('Frequency', fontsize = 14)
plt.legend(loc = 'upper left', fontsize = 13)
plt.show()
In [12]:
sns.set_context("paper", font_scale= 1.5)
sns.lmplot(y="YoutubeLikes", x="BoxOfficeCollection", data=BW)
### Yes There is positive correlation between BoxOfficeCollection and YoutubeLikes
In [13]:
### Box Plots ###
plt.figure(figsize=(10,8))
sns.set_context("paper", font_scale= 1.5)
sns.boxplot(x="Genre", y="YoutubeLikes", data= BW, palette="Set3")
plt.xticks(rotation= 80)
plt.show()
In [14]:
plt.figure(figsize=(10,8))
Numerical_Variables = BW[['Budget','BoxOfficeCollection','YoutubeViews','YoutubeLikes',
sns.set_context("paper", font_scale= 1.5)
sns.heatmap(Numerical_Variables.corr(), cmap= 'YlGnBu', annot=True)
plt.show()
Numerical_Variables.corr().T
### Yes There is a Positive high Correlation among Budget, BoxOfficeCollection, Youtube
In [15]:
Heart = pd.read_csv('SAheart.csv')
Heart.head()
Out[15]: sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
In [16]:
print(Heart.shape)
Heart.info()
(462, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 sbp 462 non-null int64
1 tobacco 462 non-null float64
2 ldl 462 non-null float64
3 adiposity 462 non-null float64
4 famhist 462 non-null object
5 typea 462 non-null int64
6 obesity 462 non-null float64
7 alcohol 462 non-null float64
8 age 462 non-null int64
9 chd 462 non-null object
dtypes: float64(5), int64(3), object(2)
memory usage: 36.2+ KB
In [17]:
Group_data = Heart.groupby('chd')['famhist'].count().reset_index(name="famhist_count")
sns.set_context("paper", font_scale= 1.5)
plt.title('famhist_count vs chd')
sns.barplot(x = 'chd',y='famhist_count',data = Group_data)
plt.show()
Group_data.head()
0 No 302
chd famhist_count
1 Si 160
In [18]:
sns.set_context("paper", font_scale= 1.5)
sns.lmplot(y="age", x="sbp", data= Heart)
# Yes there is correlation between age and sbp
In [19]:
yes_chd = Heart[(Heart['chd'] == 'Si')]
No_chd = Heart[(Heart['chd'] == 'No')]
No_chd.head(2)
plt.figure(figsize=(12,10))
sns.distplot(yes_chd['tobacco'], hist = True, color = 'r', label = 'yes_chd')
sns.distplot(No_chd['tobacco'], hist = True, color = 'b', label = 'No_chd')
plt.title('yes_chd vs No_chd', fontsize = 16)
plt.xlabel('Values', fontsize = 14)
plt.ylabel('Frequency', fontsize = 14)
plt.legend(loc = 'upper left', fontsize = 13)
plt.show()
### Distribution show that those who consume tobacco there are higher chances of gettin
In [20]:
plt.figure(figsize=(10,8))
Numerical_Variables1 = Heart[['sbp','obesity','age','ldl']]
sns.set_context("paper", font_scale= 1.5)
sns.heatmap(Numerical_Variables1.corr(), cmap= 'YlGnBu', annot=True)
plt.show()
Numerical_Variables1.corr().T
In [21]:
# her we define the threshhold or our age groups
age_groups = [0,15,35,55,64]
Out[21]: sbp tobacco ldl adiposity famhist typea obesity alcohol age chd Age_group
In [22]:
chd_cases = Heart[(Heart['chd'] == 'Si')]
Group_data1 = chd_cases.groupby('Age_group')['chd'].count().reset_index(name="chd_count
sns.set_context("paper", font_scale= 1.5)
plt.title('chd_count vs Age_group')
sns.barplot(x = 'Age_group',y='chd_count',data = Group_data1)
plt.show()
Group_data1.head(4)
0 Young 0
1 adults 18
2 mid 81
3 old 61
In [23]:
sns.set_context("paper", font_scale= 1.5)
plt.figure(figsize=(10,8))
sns.boxplot(x="Age_group", y="ldl", data= Heart, palette="Set3")
plt.xticks(rotation= 80)
plt.show()